zalim0zalima committed · verified
Commit 02dd971 · 1 Parent(s): f97ae1d

End of training

all_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 1.0,
- "eval_accuracy": 0.3036231884057971,
- "eval_loss": 1.1199791431427002,
- "eval_runtime": 998.4286,
- "eval_samples_per_second": 2.764,
- "eval_steps_per_second": 0.23
+ "epoch": 9.096017699115045,
+ "eval_accuracy": 0.7652173913043478,
+ "eval_loss": 0.6633358597755432,
+ "eval_runtime": 903.9019,
+ "eval_samples_per_second": 3.053,
+ "eval_steps_per_second": 0.096
  }
runs/Feb13_14-20-30_DESKTOP-T04IOFP/events.out.tfevents.1739484113.DESKTOP-T04IOFP.18948.1 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:97d06f627fc897366d84267d4315593eb20adf44ed9b17eaab13baf6f5470c60
- size 411
+ oid sha256:7b2e7e3e087679270790296c73aa48b295a4bed2330633c7469e2f885d7654aa
+ size 734
test_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 1.0,
- "eval_accuracy": 0.3036231884057971,
- "eval_loss": 1.1199791431427002,
- "eval_runtime": 998.4286,
- "eval_samples_per_second": 2.764,
- "eval_steps_per_second": 0.23
+ "epoch": 9.096017699115045,
+ "eval_accuracy": 0.7652173913043478,
+ "eval_loss": 0.6633358597755432,
+ "eval_runtime": 903.9019,
+ "eval_samples_per_second": 3.053,
+ "eval_steps_per_second": 0.096
  }
trainer_state.json CHANGED
@@ -1,50 +1,1715 @@
1
  {
2
- "best_metric": 0.3359073359073359,
3
- "best_model_checkpoint": "videomae-small-finetuned-kinetics-finetuned-2\\checkpoint-10",
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 10,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "grad_norm": 2.350801944732666,
14
- "learning_rate": 0.0,
15
- "loss": 1.1384,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_accuracy": 0.3359073359073359,
21
- "eval_loss": 1.1075096130371094,
22
- "eval_runtime": 1097.4829,
23
- "eval_samples_per_second": 2.596,
24
- "eval_steps_per_second": 0.217,
25
- "step": 10
26
  },
27
  {
28
- "epoch": 1.0,
29
- "step": 10,
30
- "total_flos": 3.794361829687296e+16,
31
- "train_loss": 1.138401412963867,
32
- "train_runtime": 1152.3511,
33
- "train_samples_per_second": 0.104,
34
- "train_steps_per_second": 0.009
35
  },
36
  {
37
- "epoch": 1.0,
38
- "eval_accuracy": 0.3036231884057971,
39
- "eval_loss": 1.1199791431427002,
40
- "eval_runtime": 998.4286,
41
- "eval_samples_per_second": 2.764,
42
- "eval_steps_per_second": 0.23,
43
- "step": 10
44
  }
45
  ],
46
  "logging_steps": 10,
47
- "max_steps": 10,
48
  "num_input_tokens_seen": 0,
49
  "num_train_epochs": 9223372036854775807,
50
  "save_steps": 500,
@@ -60,8 +1725,8 @@
60
  "attributes": {}
61
  }
62
  },
63
- "total_flos": 3.794361829687296e+16,
64
- "train_batch_size": 12,
65
  "trial_name": null,
66
  "trial_params": null
67
  }
 
1
  {
2
+ "best_metric": 0.7665847665847666,
3
+ "best_model_checkpoint": "videomae-small-finetuned-kinetics-finetuned-2\\checkpoint-2260",
4
+ "epoch": 9.096017699115045,
5
  "eval_steps": 500,
6
+ "global_step": 2260,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.004424778761061947,
13
+ "grad_norm": 2.1747984886169434,
14
+ "learning_rate": 2.2123893805309734e-06,
15
+ "loss": 1.0543,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.008849557522123894,
20
+ "grad_norm": 1.4438425302505493,
21
+ "learning_rate": 4.424778761061947e-06,
22
+ "loss": 1.0544,
23
+ "step": 20
24
  },
25
  {
26
+ "epoch": 0.01327433628318584,
27
+ "grad_norm": 1.5373220443725586,
28
+ "learning_rate": 6.6371681415929215e-06,
29
+ "loss": 1.0551,
30
+ "step": 30
31
  },
32
  {
33
+ "epoch": 0.017699115044247787,
34
+ "grad_norm": 1.477241039276123,
35
+ "learning_rate": 8.849557522123894e-06,
36
+ "loss": 1.0596,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.022123893805309734,
41
+ "grad_norm": 1.7869963645935059,
42
+ "learning_rate": 1.1061946902654869e-05,
43
+ "loss": 1.0579,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.02654867256637168,
48
+ "grad_norm": 1.406799554824829,
49
+ "learning_rate": 1.3274336283185843e-05,
50
+ "loss": 1.064,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.030973451327433628,
55
+ "grad_norm": 1.6087795495986938,
56
+ "learning_rate": 1.5486725663716813e-05,
57
+ "loss": 1.0334,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.035398230088495575,
62
+ "grad_norm": 1.539732575416565,
63
+ "learning_rate": 1.7699115044247787e-05,
64
+ "loss": 1.0502,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.03982300884955752,
69
+ "grad_norm": 1.7329670190811157,
70
+ "learning_rate": 1.991150442477876e-05,
71
+ "loss": 1.0408,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.04424778761061947,
76
+ "grad_norm": 1.849195957183838,
77
+ "learning_rate": 2.2123893805309738e-05,
78
+ "loss": 1.0403,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.048672566371681415,
83
+ "grad_norm": 1.7182987928390503,
84
+ "learning_rate": 2.433628318584071e-05,
85
+ "loss": 1.0406,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.05309734513274336,
90
+ "grad_norm": 1.8291207551956177,
91
+ "learning_rate": 2.6548672566371686e-05,
92
+ "loss": 1.0188,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.05752212389380531,
97
+ "grad_norm": 1.8635715246200562,
98
+ "learning_rate": 2.8761061946902656e-05,
99
+ "loss": 1.0132,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.061946902654867256,
104
+ "grad_norm": 2.1121060848236084,
105
+ "learning_rate": 3.097345132743363e-05,
106
+ "loss": 1.0194,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.06637168141592921,
111
+ "grad_norm": 1.605365514755249,
112
+ "learning_rate": 3.3185840707964604e-05,
113
+ "loss": 1.0276,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.07079646017699115,
118
+ "grad_norm": 1.4162675142288208,
119
+ "learning_rate": 3.5398230088495574e-05,
120
+ "loss": 1.0155,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.0752212389380531,
125
+ "grad_norm": 2.037018060684204,
126
+ "learning_rate": 3.7610619469026545e-05,
127
+ "loss": 0.9921,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.07964601769911504,
132
+ "grad_norm": 1.6612162590026855,
133
+ "learning_rate": 3.982300884955752e-05,
134
+ "loss": 0.9814,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.084070796460177,
139
+ "grad_norm": 1.4722100496292114,
140
+ "learning_rate": 4.20353982300885e-05,
141
+ "loss": 1.0092,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.08849557522123894,
146
+ "grad_norm": 1.9917899370193481,
147
+ "learning_rate": 4.4247787610619477e-05,
148
+ "loss": 0.9742,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.09292035398230089,
153
+ "grad_norm": 1.5193358659744263,
154
+ "learning_rate": 4.646017699115045e-05,
155
+ "loss": 0.9647,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.09734513274336283,
160
+ "grad_norm": 1.4704391956329346,
161
+ "learning_rate": 4.867256637168142e-05,
162
+ "loss": 0.9644,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.10044247787610619,
167
+ "eval_accuracy": 0.5777465777465778,
168
+ "eval_loss": 0.9732658863067627,
169
+ "eval_runtime": 1124.1661,
170
+ "eval_samples_per_second": 2.534,
171
+ "eval_steps_per_second": 0.08,
172
+ "step": 227
173
+ },
174
+ {
175
+ "epoch": 1.0013274336283187,
176
+ "grad_norm": 1.666140079498291,
177
+ "learning_rate": 4.990167158308752e-05,
178
+ "loss": 0.967,
179
+ "step": 230
180
+ },
181
+ {
182
+ "epoch": 1.0057522123893805,
183
+ "grad_norm": 1.6128636598587036,
184
+ "learning_rate": 4.9655850540806295e-05,
185
+ "loss": 0.9457,
186
+ "step": 240
187
+ },
188
+ {
189
+ "epoch": 1.0101769911504426,
190
+ "grad_norm": 1.2806826829910278,
191
+ "learning_rate": 4.941002949852507e-05,
192
+ "loss": 0.9482,
193
+ "step": 250
194
+ },
195
+ {
196
+ "epoch": 1.0146017699115044,
197
+ "grad_norm": 1.5180962085723877,
198
+ "learning_rate": 4.9164208456243856e-05,
199
+ "loss": 0.9656,
200
+ "step": 260
201
+ },
202
+ {
203
+ "epoch": 1.0190265486725665,
204
+ "grad_norm": 1.306519627571106,
205
+ "learning_rate": 4.891838741396263e-05,
206
+ "loss": 0.9615,
207
+ "step": 270
208
+ },
209
+ {
210
+ "epoch": 1.0234513274336283,
211
+ "grad_norm": 1.5663535594940186,
212
+ "learning_rate": 4.867256637168142e-05,
213
+ "loss": 0.9306,
214
+ "step": 280
215
+ },
216
+ {
217
+ "epoch": 1.0278761061946902,
218
+ "grad_norm": 1.3962496519088745,
219
+ "learning_rate": 4.8426745329400195e-05,
220
+ "loss": 0.9359,
221
+ "step": 290
222
+ },
223
+ {
224
+ "epoch": 1.0323008849557522,
225
+ "grad_norm": 1.3210822343826294,
226
+ "learning_rate": 4.818092428711898e-05,
227
+ "loss": 0.8929,
228
+ "step": 300
229
+ },
230
+ {
231
+ "epoch": 1.036725663716814,
232
+ "grad_norm": 1.5808290243148804,
233
+ "learning_rate": 4.7935103244837756e-05,
234
+ "loss": 0.8904,
235
+ "step": 310
236
+ },
237
+ {
238
+ "epoch": 1.0411504424778761,
239
+ "grad_norm": 1.4897513389587402,
240
+ "learning_rate": 4.768928220255654e-05,
241
+ "loss": 0.9128,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 1.045575221238938,
246
+ "grad_norm": 1.4278603792190552,
247
+ "learning_rate": 4.7443461160275324e-05,
248
+ "loss": 0.9177,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 1.05,
253
+ "grad_norm": 2.067490577697754,
254
+ "learning_rate": 4.71976401179941e-05,
255
+ "loss": 0.8882,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 1.0544247787610619,
260
+ "grad_norm": 1.3919837474822998,
261
+ "learning_rate": 4.6951819075712886e-05,
262
+ "loss": 0.8824,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 1.058849557522124,
267
+ "grad_norm": 1.398645281791687,
268
+ "learning_rate": 4.670599803343166e-05,
269
+ "loss": 0.905,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 1.0632743362831858,
274
+ "grad_norm": 1.4657193422317505,
275
+ "learning_rate": 4.646017699115045e-05,
276
+ "loss": 0.8687,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 1.0676991150442479,
281
+ "grad_norm": 1.2234872579574585,
282
+ "learning_rate": 4.6214355948869224e-05,
283
+ "loss": 0.8918,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 1.0721238938053097,
288
+ "grad_norm": 1.321500539779663,
289
+ "learning_rate": 4.596853490658801e-05,
290
+ "loss": 0.8582,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 1.0765486725663718,
295
+ "grad_norm": 1.3090804815292358,
296
+ "learning_rate": 4.5722713864306786e-05,
297
+ "loss": 0.868,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 1.0809734513274336,
302
+ "grad_norm": 1.2257106304168701,
303
+ "learning_rate": 4.547689282202557e-05,
304
+ "loss": 0.8837,
305
+ "step": 410
306
+ },
307
+ {
308
+ "epoch": 1.0853982300884957,
309
+ "grad_norm": 1.2699074745178223,
310
+ "learning_rate": 4.523107177974435e-05,
311
+ "loss": 0.8863,
312
+ "step": 420
313
+ },
314
+ {
315
+ "epoch": 1.0898230088495575,
316
+ "grad_norm": 1.791901707649231,
317
+ "learning_rate": 4.498525073746313e-05,
318
+ "loss": 0.8883,
319
+ "step": 430
320
+ },
321
+ {
322
+ "epoch": 1.0942477876106196,
323
+ "grad_norm": 1.2082570791244507,
324
+ "learning_rate": 4.473942969518191e-05,
325
+ "loss": 0.8544,
326
+ "step": 440
327
+ },
328
+ {
329
+ "epoch": 1.0986725663716814,
330
+ "grad_norm": 1.3323192596435547,
331
+ "learning_rate": 4.449360865290069e-05,
332
+ "loss": 0.8823,
333
+ "step": 450
334
+ },
335
+ {
336
+ "epoch": 1.1004424778761062,
337
+ "eval_accuracy": 0.6676026676026676,
338
+ "eval_loss": 0.869060754776001,
339
+ "eval_runtime": 1028.2799,
340
+ "eval_samples_per_second": 2.771,
341
+ "eval_steps_per_second": 0.088,
342
+ "step": 454
343
+ },
344
+ {
345
+ "epoch": 2.0026548672566373,
346
+ "grad_norm": 1.2703664302825928,
347
+ "learning_rate": 4.4247787610619477e-05,
348
+ "loss": 0.846,
349
+ "step": 460
350
+ },
351
+ {
352
+ "epoch": 2.007079646017699,
353
+ "grad_norm": 1.2533842325210571,
354
+ "learning_rate": 4.4001966568338254e-05,
355
+ "loss": 0.8445,
356
+ "step": 470
357
+ },
358
+ {
359
+ "epoch": 2.011504424778761,
360
+ "grad_norm": 1.2655799388885498,
361
+ "learning_rate": 4.375614552605704e-05,
362
+ "loss": 0.8526,
363
+ "step": 480
364
+ },
365
+ {
366
+ "epoch": 2.015929203539823,
367
+ "grad_norm": 1.3053220510482788,
368
+ "learning_rate": 4.351032448377581e-05,
369
+ "loss": 0.8576,
370
+ "step": 490
371
+ },
372
+ {
373
+ "epoch": 2.020353982300885,
374
+ "grad_norm": 1.160129189491272,
375
+ "learning_rate": 4.326450344149459e-05,
376
+ "loss": 0.8595,
377
+ "step": 500
378
+ },
379
+ {
380
+ "epoch": 2.0247787610619468,
381
+ "grad_norm": 1.4154675006866455,
382
+ "learning_rate": 4.301868239921337e-05,
383
+ "loss": 0.8563,
384
+ "step": 510
385
+ },
386
+ {
387
+ "epoch": 2.029203539823009,
388
+ "grad_norm": 1.166509747505188,
389
+ "learning_rate": 4.2772861356932154e-05,
390
+ "loss": 0.8431,
391
+ "step": 520
392
+ },
393
+ {
394
+ "epoch": 2.033628318584071,
395
+ "grad_norm": 1.4635576009750366,
396
+ "learning_rate": 4.252704031465093e-05,
397
+ "loss": 0.8099,
398
+ "step": 530
399
+ },
400
+ {
401
+ "epoch": 2.038053097345133,
402
+ "grad_norm": 1.205285906791687,
403
+ "learning_rate": 4.2281219272369715e-05,
404
+ "loss": 0.8308,
405
+ "step": 540
406
+ },
407
+ {
408
+ "epoch": 2.0424778761061946,
409
+ "grad_norm": 1.521293044090271,
410
+ "learning_rate": 4.20353982300885e-05,
411
+ "loss": 0.8257,
412
+ "step": 550
413
+ },
414
+ {
415
+ "epoch": 2.0469026548672566,
416
+ "grad_norm": 1.212964415550232,
417
+ "learning_rate": 4.178957718780728e-05,
418
+ "loss": 0.8587,
419
+ "step": 560
420
+ },
421
+ {
422
+ "epoch": 2.0513274336283187,
423
+ "grad_norm": 1.156423807144165,
424
+ "learning_rate": 4.154375614552606e-05,
425
+ "loss": 0.812,
426
+ "step": 570
427
+ },
428
+ {
429
+ "epoch": 2.0557522123893803,
430
+ "grad_norm": 1.274183988571167,
431
+ "learning_rate": 4.129793510324484e-05,
432
+ "loss": 0.838,
433
+ "step": 580
434
+ },
435
+ {
436
+ "epoch": 2.0601769911504424,
437
+ "grad_norm": 1.8873164653778076,
438
+ "learning_rate": 4.105211406096362e-05,
439
+ "loss": 0.8067,
440
+ "step": 590
441
+ },
442
+ {
443
+ "epoch": 2.0646017699115045,
444
+ "grad_norm": 1.1822903156280518,
445
+ "learning_rate": 4.08062930186824e-05,
446
+ "loss": 0.7949,
447
+ "step": 600
448
+ },
449
+ {
450
+ "epoch": 2.0690265486725665,
451
+ "grad_norm": 1.2289406061172485,
452
+ "learning_rate": 4.0560471976401183e-05,
453
+ "loss": 0.8064,
454
+ "step": 610
455
+ },
456
+ {
457
+ "epoch": 2.073451327433628,
458
+ "grad_norm": 1.1396745443344116,
459
+ "learning_rate": 4.031465093411996e-05,
460
+ "loss": 0.8183,
461
+ "step": 620
462
+ },
463
+ {
464
+ "epoch": 2.07787610619469,
465
+ "grad_norm": 1.041188359260559,
466
+ "learning_rate": 4.0068829891838745e-05,
467
+ "loss": 0.8024,
468
+ "step": 630
469
+ },
470
+ {
471
+ "epoch": 2.0823008849557523,
472
+ "grad_norm": 1.2631102800369263,
473
+ "learning_rate": 3.982300884955752e-05,
474
+ "loss": 0.8168,
475
+ "step": 640
476
+ },
477
+ {
478
+ "epoch": 2.0867256637168143,
479
+ "grad_norm": 1.06856107711792,
480
+ "learning_rate": 3.9577187807276306e-05,
481
+ "loss": 0.7869,
482
+ "step": 650
483
+ },
484
+ {
485
+ "epoch": 2.091150442477876,
486
+ "grad_norm": 1.4375030994415283,
487
+ "learning_rate": 3.9331366764995083e-05,
488
+ "loss": 0.8004,
489
+ "step": 660
490
+ },
491
+ {
492
+ "epoch": 2.095575221238938,
493
+ "grad_norm": 1.1425267457962036,
494
+ "learning_rate": 3.908554572271387e-05,
495
+ "loss": 0.7626,
496
+ "step": 670
497
+ },
498
+ {
499
+ "epoch": 2.1,
500
+ "grad_norm": 1.9019882678985596,
501
+ "learning_rate": 3.883972468043265e-05,
502
+ "loss": 0.826,
503
+ "step": 680
504
+ },
505
+ {
506
+ "epoch": 2.1004424778761064,
507
+ "eval_accuracy": 0.7093717093717093,
508
+ "eval_loss": 0.8009993433952332,
509
+ "eval_runtime": 986.4291,
510
+ "eval_samples_per_second": 2.888,
511
+ "eval_steps_per_second": 0.091,
512
+ "step": 681
513
+ },
514
+ {
515
+ "epoch": 3.0039823008849558,
516
+ "grad_norm": 1.6283575296401978,
517
+ "learning_rate": 3.859390363815143e-05,
518
+ "loss": 0.7981,
519
+ "step": 690
520
+ },
521
+ {
522
+ "epoch": 3.008407079646018,
523
+ "grad_norm": 1.1492269039154053,
524
+ "learning_rate": 3.834808259587021e-05,
525
+ "loss": 0.8298,
526
+ "step": 700
527
+ },
528
+ {
529
+ "epoch": 3.0128318584070795,
530
+ "grad_norm": 1.1298381090164185,
531
+ "learning_rate": 3.810226155358899e-05,
532
+ "loss": 0.7982,
533
+ "step": 710
534
+ },
535
+ {
536
+ "epoch": 3.0172566371681415,
537
+ "grad_norm": 1.1305561065673828,
538
+ "learning_rate": 3.7856440511307774e-05,
539
+ "loss": 0.7666,
540
+ "step": 720
541
+ },
542
+ {
543
+ "epoch": 3.0216814159292036,
544
+ "grad_norm": 1.1142364740371704,
545
+ "learning_rate": 3.7610619469026545e-05,
546
+ "loss": 0.7775,
547
+ "step": 730
548
+ },
549
+ {
550
+ "epoch": 3.0261061946902656,
551
+ "grad_norm": 1.7376995086669922,
552
+ "learning_rate": 3.736479842674533e-05,
553
+ "loss": 0.7879,
554
+ "step": 740
555
+ },
556
+ {
557
+ "epoch": 3.0305309734513273,
558
+ "grad_norm": 1.1918601989746094,
559
+ "learning_rate": 3.711897738446411e-05,
560
+ "loss": 0.7587,
561
+ "step": 750
562
+ },
563
+ {
564
+ "epoch": 3.0349557522123893,
565
+ "grad_norm": 1.3769606351852417,
566
+ "learning_rate": 3.687315634218289e-05,
567
+ "loss": 0.7916,
568
+ "step": 760
569
+ },
570
+ {
571
+ "epoch": 3.0393805309734514,
572
+ "grad_norm": 1.269805669784546,
573
+ "learning_rate": 3.6627335299901674e-05,
574
+ "loss": 0.8069,
575
+ "step": 770
576
+ },
577
+ {
578
+ "epoch": 3.0438053097345135,
579
+ "grad_norm": 1.045404314994812,
580
+ "learning_rate": 3.638151425762045e-05,
581
+ "loss": 0.7894,
582
+ "step": 780
583
+ },
584
+ {
585
+ "epoch": 3.048230088495575,
586
+ "grad_norm": 1.1676949262619019,
587
+ "learning_rate": 3.6135693215339236e-05,
588
+ "loss": 0.8034,
589
+ "step": 790
590
+ },
591
+ {
592
+ "epoch": 3.052654867256637,
593
+ "grad_norm": 1.8563295602798462,
594
+ "learning_rate": 3.588987217305801e-05,
595
+ "loss": 0.7366,
596
+ "step": 800
597
+ },
598
+ {
599
+ "epoch": 3.057079646017699,
600
+ "grad_norm": 1.0208163261413574,
601
+ "learning_rate": 3.56440511307768e-05,
602
+ "loss": 0.7332,
603
+ "step": 810
604
+ },
605
+ {
606
+ "epoch": 3.0615044247787613,
607
+ "grad_norm": 1.3837121725082397,
608
+ "learning_rate": 3.5398230088495574e-05,
609
+ "loss": 0.7421,
610
+ "step": 820
611
+ },
612
+ {
613
+ "epoch": 3.065929203539823,
614
+ "grad_norm": 1.1950623989105225,
615
+ "learning_rate": 3.515240904621436e-05,
616
+ "loss": 0.7696,
617
+ "step": 830
618
+ },
619
+ {
620
+ "epoch": 3.070353982300885,
621
+ "grad_norm": 1.3049529790878296,
622
+ "learning_rate": 3.4906588003933136e-05,
623
+ "loss": 0.7897,
624
+ "step": 840
625
+ },
626
+ {
627
+ "epoch": 3.074778761061947,
628
+ "grad_norm": 1.3270063400268555,
629
+ "learning_rate": 3.466076696165192e-05,
630
+ "loss": 0.7884,
631
+ "step": 850
632
+ },
633
+ {
634
+ "epoch": 3.0792035398230087,
635
+ "grad_norm": 1.4187606573104858,
636
+ "learning_rate": 3.44149459193707e-05,
637
+ "loss": 0.7526,
638
+ "step": 860
639
+ },
640
+ {
641
+ "epoch": 3.0836283185840707,
642
+ "grad_norm": 1.0789133310317993,
643
+ "learning_rate": 3.416912487708948e-05,
644
+ "loss": 0.7535,
645
+ "step": 870
646
+ },
647
+ {
648
+ "epoch": 3.088053097345133,
649
+ "grad_norm": 1.1631813049316406,
650
+ "learning_rate": 3.3923303834808265e-05,
651
+ "loss": 0.7579,
652
+ "step": 880
653
+ },
654
+ {
655
+ "epoch": 3.092477876106195,
656
+ "grad_norm": 1.1757570505142212,
657
+ "learning_rate": 3.367748279252704e-05,
658
+ "loss": 0.79,
659
+ "step": 890
660
+ },
661
+ {
662
+ "epoch": 3.0969026548672565,
663
+ "grad_norm": 1.0601412057876587,
664
+ "learning_rate": 3.343166175024583e-05,
665
+ "loss": 0.7422,
666
+ "step": 900
667
+ },
668
+ {
669
+ "epoch": 3.1004424778761064,
670
+ "eval_accuracy": 0.7371007371007371,
671
+ "eval_loss": 0.7514493465423584,
672
+ "eval_runtime": 1046.1945,
673
+ "eval_samples_per_second": 2.723,
674
+ "eval_steps_per_second": 0.086,
675
+ "step": 908
676
+ },
677
+ {
678
+ "epoch": 4.000884955752213,
679
+ "grad_norm": 1.757256269454956,
680
+ "learning_rate": 3.3185840707964604e-05,
681
+ "loss": 0.7465,
682
+ "step": 910
683
+ },
684
+ {
685
+ "epoch": 4.005309734513275,
686
+ "grad_norm": 1.1318788528442383,
687
+ "learning_rate": 3.294001966568339e-05,
688
+ "loss": 0.7494,
689
+ "step": 920
690
+ },
691
+ {
692
+ "epoch": 4.009734513274337,
693
+ "grad_norm": 1.104137659072876,
694
+ "learning_rate": 3.2694198623402165e-05,
695
+ "loss": 0.7696,
696
+ "step": 930
697
+ },
698
+ {
699
+ "epoch": 4.014159292035398,
700
+ "grad_norm": 1.0731704235076904,
701
+ "learning_rate": 3.244837758112095e-05,
702
+ "loss": 0.748,
703
+ "step": 940
704
+ },
705
+ {
706
+ "epoch": 4.01858407079646,
707
+ "grad_norm": 1.1554673910140991,
708
+ "learning_rate": 3.220255653883973e-05,
709
+ "loss": 0.79,
710
+ "step": 950
711
+ },
712
+ {
713
+ "epoch": 4.023008849557522,
714
+ "grad_norm": 1.2223293781280518,
715
+ "learning_rate": 3.1956735496558504e-05,
716
+ "loss": 0.7281,
717
+ "step": 960
718
+ },
719
+ {
720
+ "epoch": 4.027433628318584,
721
+ "grad_norm": 1.1225980520248413,
722
+ "learning_rate": 3.171091445427729e-05,
723
+ "loss": 0.7219,
724
+ "step": 970
725
+ },
726
+ {
727
+ "epoch": 4.031858407079646,
728
+ "grad_norm": 1.0847655534744263,
729
+ "learning_rate": 3.1465093411996065e-05,
730
+ "loss": 0.7481,
731
+ "step": 980
732
+ },
733
+ {
734
+ "epoch": 4.036283185840708,
735
+ "grad_norm": 1.3014447689056396,
736
+ "learning_rate": 3.121927236971485e-05,
737
+ "loss": 0.7305,
738
+ "step": 990
739
+ },
740
+ {
741
+ "epoch": 4.04070796460177,
742
+ "grad_norm": 1.1104828119277954,
743
+ "learning_rate": 3.097345132743363e-05,
744
+ "loss": 0.7211,
745
+ "step": 1000
746
+ },
747
+ {
748
+ "epoch": 4.0451327433628315,
749
+ "grad_norm": 1.0301592350006104,
750
+ "learning_rate": 3.072763028515241e-05,
751
+ "loss": 0.7676,
752
+ "step": 1010
753
+ },
754
+ {
755
+ "epoch": 4.0495575221238935,
756
+ "grad_norm": 1.1068618297576904,
757
+ "learning_rate": 3.048180924287119e-05,
758
+ "loss": 0.7078,
759
+ "step": 1020
760
+ },
761
+ {
762
+ "epoch": 4.053982300884956,
763
+ "grad_norm": 1.4518685340881348,
764
+ "learning_rate": 3.0235988200589972e-05,
765
+ "loss": 0.7706,
766
+ "step": 1030
767
+ },
768
+ {
769
+ "epoch": 4.058407079646018,
770
+ "grad_norm": 0.9889740347862244,
771
+ "learning_rate": 2.9990167158308753e-05,
772
+ "loss": 0.7207,
773
+ "step": 1040
774
+ },
775
+ {
776
+ "epoch": 4.06283185840708,
777
+ "grad_norm": 1.1339548826217651,
778
+ "learning_rate": 2.9744346116027534e-05,
779
+ "loss": 0.7423,
780
+ "step": 1050
781
+ },
782
+ {
783
+ "epoch": 4.067256637168142,
784
+ "grad_norm": 1.3356149196624756,
785
+ "learning_rate": 2.9498525073746314e-05,
786
+ "loss": 0.7114,
787
+ "step": 1060
788
+ },
789
+ {
790
+ "epoch": 4.071681415929204,
791
+ "grad_norm": 1.0467292070388794,
792
+ "learning_rate": 2.9252704031465095e-05,
793
+ "loss": 0.6998,
794
+ "step": 1070
795
+ },
796
+ {
797
+ "epoch": 4.076106194690266,
798
+ "grad_norm": 1.0783181190490723,
799
+ "learning_rate": 2.9006882989183876e-05,
800
+ "loss": 0.7544,
801
+ "step": 1080
802
+ },
803
+ {
804
+ "epoch": 4.080530973451327,
805
+ "grad_norm": 1.0915840864181519,
806
+ "learning_rate": 2.8761061946902656e-05,
807
+ "loss": 0.7145,
808
+ "step": 1090
809
+ },
810
+ {
811
+ "epoch": 4.084955752212389,
812
+ "grad_norm": 1.646979570388794,
813
+ "learning_rate": 2.8515240904621437e-05,
814
+ "loss": 0.7145,
815
+ "step": 1100
816
+ },
817
+ {
818
+ "epoch": 4.089380530973451,
819
+ "grad_norm": 1.3354220390319824,
820
+ "learning_rate": 2.8269419862340218e-05,
821
+ "loss": 0.7329,
822
+ "step": 1110
823
+ },
824
+ {
825
+ "epoch": 4.093805309734513,
826
+ "grad_norm": 1.1131007671356201,
827
+ "learning_rate": 2.8023598820059e-05,
828
+ "loss": 0.6812,
829
+ "step": 1120
830
+ },
831
+ {
832
+ "epoch": 4.098230088495575,
833
+ "grad_norm": 1.260852575302124,
834
+ "learning_rate": 2.777777777777778e-05,
835
+ "loss": 0.7206,
836
+ "step": 1130
837
+ },
838
+ {
839
+ "epoch": 4.100442477876106,
840
+ "eval_accuracy": 0.7507897507897507,
841
+ "eval_loss": 0.7169636487960815,
842
+ "eval_runtime": 1087.6772,
843
+ "eval_samples_per_second": 2.619,
844
+ "eval_steps_per_second": 0.083,
845
+ "step": 1135
846
+ },
847
+ {
848
+ "epoch": 5.002212389380531,
849
+ "grad_norm": 1.1113187074661255,
850
+ "learning_rate": 2.753195673549656e-05,
851
+ "loss": 0.6946,
852
+ "step": 1140
853
+ },
854
+ {
855
+ "epoch": 5.006637168141593,
856
+ "grad_norm": 1.0982720851898193,
857
+ "learning_rate": 2.7286135693215344e-05,
858
+ "loss": 0.7624,
859
+ "step": 1150
860
+ },
861
+ {
862
+ "epoch": 5.011061946902655,
863
+ "grad_norm": 1.3066320419311523,
864
+ "learning_rate": 2.7040314650934125e-05,
865
+ "loss": 0.7023,
866
+ "step": 1160
867
+ },
868
+ {
869
+ "epoch": 5.015486725663717,
870
+ "grad_norm": 0.9856134057044983,
871
+ "learning_rate": 2.6794493608652905e-05,
872
+ "loss": 0.6863,
873
+ "step": 1170
874
+ },
875
+ {
876
+ "epoch": 5.019911504424779,
877
+ "grad_norm": 1.2251431941986084,
878
+ "learning_rate": 2.6548672566371686e-05,
879
+ "loss": 0.7327,
880
+ "step": 1180
881
+ },
882
+ {
883
+ "epoch": 5.024336283185841,
884
+ "grad_norm": 0.9257007837295532,
885
+ "learning_rate": 2.6302851524090467e-05,
886
+ "loss": 0.7002,
887
+ "step": 1190
888
+ },
889
+ {
890
+ "epoch": 5.028761061946903,
891
+ "grad_norm": 0.9643080830574036,
892
+ "learning_rate": 2.605703048180924e-05,
893
+ "loss": 0.6906,
894
+ "step": 1200
895
+ },
896
+ {
897
+ "epoch": 5.033185840707965,
898
+ "grad_norm": 1.1749197244644165,
899
+ "learning_rate": 2.581120943952802e-05,
900
+ "loss": 0.7327,
901
+ "step": 1210
902
+ },
903
+ {
904
+ "epoch": 5.037610619469026,
905
+ "grad_norm": 1.1557971239089966,
906
+ "learning_rate": 2.5565388397246802e-05,
907
+ "loss": 0.6845,
908
+ "step": 1220
909
+ },
910
+ {
911
+ "epoch": 5.042035398230088,
912
+ "grad_norm": 1.2948167324066162,
913
+ "learning_rate": 2.5319567354965586e-05,
914
+ "loss": 0.7152,
915
+ "step": 1230
916
+ },
917
+ {
918
+ "epoch": 5.04646017699115,
919
+ "grad_norm": 0.9071692228317261,
920
+ "learning_rate": 2.5073746312684367e-05,
921
+ "loss": 0.701,
922
+ "step": 1240
923
+ },
924
+ {
925
+ "epoch": 5.050884955752212,
926
+ "grad_norm": 1.0012332201004028,
927
+ "learning_rate": 2.4827925270403147e-05,
928
+ "loss": 0.7141,
929
+ "step": 1250
930
+ },
931
+ {
932
+ "epoch": 5.0553097345132745,
933
+ "grad_norm": 1.4386842250823975,
934
+ "learning_rate": 2.4582104228121928e-05,
935
+ "loss": 0.7405,
936
+ "step": 1260
937
+ },
938
+ {
939
+ "epoch": 5.0597345132743365,
940
+ "grad_norm": 1.1797550916671753,
941
+ "learning_rate": 2.433628318584071e-05,
942
+ "loss": 0.6717,
943
+ "step": 1270
944
+ },
945
+ {
946
+ "epoch": 5.064159292035399,
947
+ "grad_norm": 1.0993843078613281,
948
+ "learning_rate": 2.409046214355949e-05,
949
+ "loss": 0.6832,
950
+ "step": 1280
951
+ },
952
+ {
953
+ "epoch": 5.06858407079646,
954
+ "grad_norm": 0.9947831630706787,
955
+ "learning_rate": 2.384464110127827e-05,
956
+ "loss": 0.7164,
957
+ "step": 1290
958
+ },
959
+ {
960
+ "epoch": 5.073008849557522,
961
+ "grad_norm": 1.335589051246643,
962
+ "learning_rate": 2.359882005899705e-05,
963
+ "loss": 0.7541,
964
+ "step": 1300
965
+ },
966
+ {
967
+ "epoch": 5.077433628318584,
968
+ "grad_norm": 1.3028713464736938,
969
+ "learning_rate": 2.335299901671583e-05,
970
+ "loss": 0.7167,
971
+ "step": 1310
972
+ },
973
+ {
974
+ "epoch": 5.081858407079646,
975
+ "grad_norm": 1.1588274240493774,
976
+ "learning_rate": 2.3107177974434612e-05,
977
+ "loss": 0.7194,
978
+ "step": 1320
979
+ },
980
+ {
981
+ "epoch": 5.086283185840708,
982
+ "grad_norm": 0.9976550340652466,
983
+ "learning_rate": 2.2861356932153393e-05,
984
+ "loss": 0.7196,
985
+ "step": 1330
986
+ },
987
+ {
988
+ "epoch": 5.09070796460177,
989
+ "grad_norm": 0.8966324925422668,
990
+ "learning_rate": 2.2615535889872174e-05,
991
+ "loss": 0.7268,
992
+ "step": 1340
993
+ },
994
+ {
995
+ "epoch": 5.095132743362832,
996
+ "grad_norm": 1.0999760627746582,
997
+ "learning_rate": 2.2369714847590954e-05,
998
+ "loss": 0.7284,
999
+ "step": 1350
1000
+ },
1001
+ {
1002
+ "epoch": 5.099557522123894,
1003
+ "grad_norm": 1.032329797744751,
1004
+ "learning_rate": 2.2123893805309738e-05,
1005
+ "loss": 0.6806,
1006
+ "step": 1360
1007
+ },
1008
+ {
1009
+ "epoch": 5.100442477876106,
1010
+ "eval_accuracy": 0.7542997542997543,
1011
+ "eval_loss": 0.6924355626106262,
1012
+ "eval_runtime": 1080.3798,
1013
+ "eval_samples_per_second": 2.637,
1014
+ "eval_steps_per_second": 0.083,
1015
+ "step": 1362
1016
+ },
1017
+ {
1018
+ "epoch": 6.0035398230088495,
1019
+ "grad_norm": 1.2827861309051514,
1020
+ "learning_rate": 2.187807276302852e-05,
1021
+ "loss": 0.768,
1022
+ "step": 1370
1023
+ },
1024
+ {
1025
+ "epoch": 6.0079646017699115,
1026
+ "grad_norm": 1.1958467960357666,
1027
+ "learning_rate": 2.1632251720747296e-05,
1028
+ "loss": 0.7209,
1029
+ "step": 1380
1030
+ },
1031
+ {
1032
+ "epoch": 6.012389380530974,
1033
+ "grad_norm": 1.0269005298614502,
1034
+ "learning_rate": 2.1386430678466077e-05,
1035
+ "loss": 0.7095,
1036
+ "step": 1390
1037
+ },
1038
+ {
1039
+ "epoch": 6.016814159292036,
1040
+ "grad_norm": 0.9638125896453857,
1041
+ "learning_rate": 2.1140609636184858e-05,
1042
+ "loss": 0.6557,
1043
+ "step": 1400
1044
+ },
1045
+ {
1046
+ "epoch": 6.021238938053098,
1047
+ "grad_norm": 1.3249033689498901,
1048
+ "learning_rate": 2.089478859390364e-05,
1049
+ "loss": 0.7129,
1050
+ "step": 1410
1051
+ },
1052
+ {
1053
+ "epoch": 6.025663716814159,
1054
+ "grad_norm": 1.1330633163452148,
1055
+ "learning_rate": 2.064896755162242e-05,
1056
+ "loss": 0.6472,
1057
+ "step": 1420
1058
+ },
1059
+ {
1060
+ "epoch": 6.030088495575221,
1061
+ "grad_norm": 1.1240506172180176,
1062
+ "learning_rate": 2.04031465093412e-05,
1063
+ "loss": 0.6748,
1064
+ "step": 1430
1065
+ },
1066
+ {
1067
+ "epoch": 6.034513274336283,
1068
+ "grad_norm": 1.091794729232788,
1069
+ "learning_rate": 2.015732546705998e-05,
1070
+ "loss": 0.7125,
1071
+ "step": 1440
1072
+ },
1073
+ {
1074
+ "epoch": 6.038938053097345,
1075
+ "grad_norm": 0.892193078994751,
1076
+ "learning_rate": 1.991150442477876e-05,
1077
+ "loss": 0.7045,
1078
+ "step": 1450
1079
+ },
1080
+ {
1081
+ "epoch": 6.043362831858407,
1082
+ "grad_norm": 1.1631165742874146,
1083
+ "learning_rate": 1.9665683382497542e-05,
1084
+ "loss": 0.7432,
1085
+ "step": 1460
1086
+ },
1087
+ {
1088
+ "epoch": 6.047787610619469,
1089
+ "grad_norm": 0.9747660160064697,
1090
+ "learning_rate": 1.9419862340216326e-05,
1091
+ "loss": 0.6965,
1092
+ "step": 1470
1093
+ },
1094
+ {
1095
+ "epoch": 6.052212389380531,
1096
+ "grad_norm": 0.9529617428779602,
1097
+ "learning_rate": 1.9174041297935107e-05,
1098
+ "loss": 0.6725,
1099
+ "step": 1480
1100
+ },
1101
+ {
1102
+ "epoch": 6.056637168141593,
1103
+ "grad_norm": 1.0859458446502686,
1104
+ "learning_rate": 1.8928220255653887e-05,
1105
+ "loss": 0.6744,
1106
+ "step": 1490
1107
+ },
1108
+ {
1109
+ "epoch": 6.0610619469026545,
1110
+ "grad_norm": 1.4146475791931152,
1111
+ "learning_rate": 1.8682399213372664e-05,
1112
+ "loss": 0.6668,
1113
+ "step": 1500
1114
+ },
1115
+ {
1116
+ "epoch": 6.065486725663717,
1117
+ "grad_norm": 1.5994105339050293,
1118
+ "learning_rate": 1.8436578171091445e-05,
1119
+ "loss": 0.6941,
1120
+ "step": 1510
1121
+ },
1122
+ {
1123
+ "epoch": 6.069911504424779,
1124
+ "grad_norm": 1.22159743309021,
1125
+ "learning_rate": 1.8190757128810226e-05,
1126
+ "loss": 0.7201,
1127
+ "step": 1520
1128
+ },
1129
+ {
1130
+ "epoch": 6.074336283185841,
1131
+ "grad_norm": 1.196835994720459,
1132
+ "learning_rate": 1.7944936086529007e-05,
1133
+ "loss": 0.6506,
1134
+ "step": 1530
1135
+ },
1136
+ {
1137
+ "epoch": 6.078761061946903,
1138
+ "grad_norm": 0.9212129712104797,
1139
+ "learning_rate": 1.7699115044247787e-05,
1140
+ "loss": 0.6931,
1141
+ "step": 1540
1142
+ },
1143
+ {
1144
+ "epoch": 6.083185840707965,
1145
+ "grad_norm": 1.0447068214416504,
1146
+ "learning_rate": 1.7453294001966568e-05,
1147
+ "loss": 0.7107,
1148
+ "step": 1550
1149
+ },
1150
+ {
1151
+ "epoch": 6.087610619469027,
1152
+ "grad_norm": 1.2515738010406494,
1153
+ "learning_rate": 1.720747295968535e-05,
1154
+ "loss": 0.7178,
1155
+ "step": 1560
1156
+ },
1157
+ {
1158
+ "epoch": 6.092035398230088,
1159
+ "grad_norm": 1.4285340309143066,
1160
+ "learning_rate": 1.6961651917404133e-05,
1161
+ "loss": 0.6929,
1162
+ "step": 1570
1163
+ },
1164
+ {
1165
+ "epoch": 6.09646017699115,
1166
+ "grad_norm": 1.3406482934951782,
1167
+ "learning_rate": 1.6715830875122913e-05,
1168
+ "loss": 0.6826,
1169
+ "step": 1580
1170
+ },
1171
+ {
1172
+ "epoch": 6.100442477876106,
1173
+ "eval_accuracy": 0.7585117585117586,
1174
+ "eval_loss": 0.675682008266449,
1175
+ "eval_runtime": 1089.8885,
1176
+ "eval_samples_per_second": 2.614,
1177
+ "eval_steps_per_second": 0.083,
1178
+ "step": 1589
1179
+ },
1180
+ {
1181
+ "epoch": 7.000442477876106,
1182
+ "grad_norm": 1.920408844947815,
1183
+ "learning_rate": 1.6470009832841694e-05,
1184
+ "loss": 0.6845,
1185
+ "step": 1590
1186
+ },
1187
+ {
1188
+ "epoch": 7.004867256637168,
1189
+ "grad_norm": 1.2930206060409546,
1190
+ "learning_rate": 1.6224188790560475e-05,
1191
+ "loss": 0.6906,
1192
+ "step": 1600
1193
+ },
1194
+ {
1195
+ "epoch": 7.00929203539823,
1196
+ "grad_norm": 1.010541319847107,
1197
+ "learning_rate": 1.5978367748279252e-05,
1198
+ "loss": 0.7002,
1199
+ "step": 1610
1200
+ },
1201
+ {
1202
+ "epoch": 7.013716814159292,
1203
+ "grad_norm": 1.3391730785369873,
1204
+ "learning_rate": 1.5732546705998033e-05,
1205
+ "loss": 0.6841,
1206
+ "step": 1620
1207
+ },
1208
+ {
1209
+ "epoch": 7.018141592920354,
1210
+ "grad_norm": 1.4052796363830566,
1211
+ "learning_rate": 1.5486725663716813e-05,
1212
+ "loss": 0.6611,
1213
+ "step": 1630
1214
+ },
1215
+ {
1216
+ "epoch": 7.022566371681416,
1217
+ "grad_norm": 1.2141647338867188,
1218
+ "learning_rate": 1.5240904621435596e-05,
1219
+ "loss": 0.6856,
1220
+ "step": 1640
1221
+ },
1222
+ {
1223
+ "epoch": 7.026991150442478,
1224
+ "grad_norm": 1.3713358640670776,
1225
+ "learning_rate": 1.4995083579154376e-05,
1226
+ "loss": 0.6978,
1227
+ "step": 1650
1228
+ },
1229
+ {
1230
+ "epoch": 7.03141592920354,
1231
+ "grad_norm": 0.9116381406784058,
1232
+ "learning_rate": 1.4749262536873157e-05,
1233
+ "loss": 0.696,
1234
+ "step": 1660
1235
+ },
1236
+ {
1237
+ "epoch": 7.035840707964602,
1238
+ "grad_norm": 1.1704223155975342,
1239
+ "learning_rate": 1.4503441494591938e-05,
1240
+ "loss": 0.6695,
1241
+ "step": 1670
1242
+ },
1243
+ {
1244
+ "epoch": 7.040265486725664,
1245
+ "grad_norm": 1.1459695100784302,
1246
+ "learning_rate": 1.4257620452310719e-05,
1247
+ "loss": 0.6575,
1248
+ "step": 1680
1249
+ },
1250
+ {
1251
+ "epoch": 7.044690265486726,
1252
+ "grad_norm": 1.098761796951294,
1253
+ "learning_rate": 1.40117994100295e-05,
1254
+ "loss": 0.6905,
1255
+ "step": 1690
1256
+ },
1257
+ {
1258
+ "epoch": 7.049115044247787,
1259
+ "grad_norm": 1.2097493410110474,
1260
+ "learning_rate": 1.376597836774828e-05,
1261
+ "loss": 0.7086,
1262
+ "step": 1700
1263
+ },
1264
+ {
1265
+ "epoch": 7.053539823008849,
1266
+ "grad_norm": 1.3922789096832275,
1267
+ "learning_rate": 1.3520157325467062e-05,
1268
+ "loss": 0.6473,
1269
+ "step": 1710
1270
+ },
1271
+ {
1272
+ "epoch": 7.057964601769911,
1273
+ "grad_norm": 1.064712405204773,
1274
+ "learning_rate": 1.3274336283185843e-05,
1275
+ "loss": 0.6987,
1276
+ "step": 1720
1277
+ },
1278
+ {
1279
+ "epoch": 7.062389380530973,
1280
+ "grad_norm": 1.1013967990875244,
1281
+ "learning_rate": 1.302851524090462e-05,
1282
+ "loss": 0.6593,
1283
+ "step": 1730
1284
+ },
1285
+ {
1286
+ "epoch": 7.0668141592920355,
1287
+ "grad_norm": 0.9073940515518188,
1288
+ "learning_rate": 1.2782694198623401e-05,
1289
+ "loss": 0.6873,
1290
+ "step": 1740
1291
+ },
1292
+ {
1293
+ "epoch": 7.0712389380530976,
1294
+ "grad_norm": 1.1259492635726929,
1295
+ "learning_rate": 1.2536873156342183e-05,
1296
+ "loss": 0.6954,
1297
+ "step": 1750
1298
+ },
1299
+ {
1300
+ "epoch": 7.07566371681416,
1301
+ "grad_norm": 0.8687026500701904,
1302
+ "learning_rate": 1.2291052114060964e-05,
1303
+ "loss": 0.6854,
1304
+ "step": 1760
1305
+ },
1306
+ {
1307
+ "epoch": 7.080088495575222,
1308
+ "grad_norm": 1.687637448310852,
1309
+ "learning_rate": 1.2045231071779745e-05,
1310
+ "loss": 0.6781,
1311
+ "step": 1770
1312
+ },
1313
+ {
1314
+ "epoch": 7.084513274336283,
1315
+ "grad_norm": 1.0679877996444702,
1316
+ "learning_rate": 1.1799410029498525e-05,
1317
+ "loss": 0.7,
1318
+ "step": 1780
1319
+ },
1320
+ {
1321
+ "epoch": 7.088938053097345,
1322
+ "grad_norm": 1.1482867002487183,
1323
+ "learning_rate": 1.1553588987217306e-05,
1324
+ "loss": 0.6494,
1325
+ "step": 1790
1326
+ },
1327
+ {
1328
+ "epoch": 7.093362831858407,
1329
+ "grad_norm": 1.042781949043274,
1330
+ "learning_rate": 1.1307767944936087e-05,
1331
+ "loss": 0.6776,
1332
+ "step": 1800
1333
+ },
1334
+ {
1335
+ "epoch": 7.097787610619469,
1336
+ "grad_norm": 0.8880053162574768,
1337
+ "learning_rate": 1.1061946902654869e-05,
1338
+ "loss": 0.6756,
1339
+ "step": 1810
1340
+ },
1341
+ {
1342
+ "epoch": 7.100442477876106,
1343
+ "eval_accuracy": 0.7630747630747631,
1344
+ "eval_loss": 0.6651633381843567,
1345
+ "eval_runtime": 1032.3877,
1346
+ "eval_samples_per_second": 2.76,
1347
+ "eval_steps_per_second": 0.087,
1348
+ "step": 1816
1349
+ },
1350
+ {
1351
+ "epoch": 8.001769911504425,
1352
+ "grad_norm": 0.9909548163414001,
1353
+ "learning_rate": 1.0816125860373648e-05,
1354
+ "loss": 0.6837,
1355
+ "step": 1820
1356
+ },
1357
+ {
1358
+ "epoch": 8.006194690265486,
1359
+ "grad_norm": 0.943515419960022,
1360
+ "learning_rate": 1.0570304818092429e-05,
1361
+ "loss": 0.665,
1362
+ "step": 1830
1363
+ },
1364
+ {
1365
+ "epoch": 8.01061946902655,
1366
+ "grad_norm": 0.9931963682174683,
1367
+ "learning_rate": 1.032448377581121e-05,
1368
+ "loss": 0.6664,
1369
+ "step": 1840
1370
+ },
1371
+ {
1372
+ "epoch": 8.01504424778761,
1373
+ "grad_norm": 0.8919005393981934,
1374
+ "learning_rate": 1.007866273352999e-05,
1375
+ "loss": 0.7089,
1376
+ "step": 1850
1377
+ },
1378
+ {
1379
+ "epoch": 8.019469026548673,
1380
+ "grad_norm": 1.1344377994537354,
1381
+ "learning_rate": 9.832841691248771e-06,
1382
+ "loss": 0.6589,
1383
+ "step": 1860
1384
+ },
1385
+ {
1386
+ "epoch": 8.023893805309735,
1387
+ "grad_norm": 1.4625028371810913,
1388
+ "learning_rate": 9.587020648967553e-06,
1389
+ "loss": 0.6771,
1390
+ "step": 1870
1391
+ },
1392
+ {
1393
+ "epoch": 8.028318584070796,
1394
+ "grad_norm": 0.8750028610229492,
1395
+ "learning_rate": 9.341199606686332e-06,
1396
+ "loss": 0.6856,
1397
+ "step": 1880
1398
+ },
1399
+ {
1400
+ "epoch": 8.032743362831859,
1401
+ "grad_norm": 1.1741547584533691,
1402
+ "learning_rate": 9.095378564405113e-06,
1403
+ "loss": 0.693,
1404
+ "step": 1890
1405
+ },
1406
+ {
1407
+ "epoch": 8.03716814159292,
1408
+ "grad_norm": 1.0879665613174438,
1409
+ "learning_rate": 8.849557522123894e-06,
1410
+ "loss": 0.7053,
1411
+ "step": 1900
1412
+ },
1413
+ {
1414
+ "epoch": 8.041592920353983,
1415
+ "grad_norm": 0.9571520686149597,
1416
+ "learning_rate": 8.603736479842674e-06,
1417
+ "loss": 0.6415,
1418
+ "step": 1910
1419
+ },
1420
+ {
1421
+ "epoch": 8.046017699115044,
1422
+ "grad_norm": 1.060584545135498,
1423
+ "learning_rate": 8.357915437561457e-06,
1424
+ "loss": 0.6809,
1425
+ "step": 1920
1426
+ },
1427
+ {
1428
+ "epoch": 8.050442477876107,
1429
+ "grad_norm": 1.0861510038375854,
1430
+ "learning_rate": 8.112094395280237e-06,
1431
+ "loss": 0.6771,
1432
+ "step": 1930
1433
+ },
1434
+ {
1435
+ "epoch": 8.054867256637168,
1436
+ "grad_norm": 1.0085254907608032,
1437
+ "learning_rate": 7.866273352999016e-06,
1438
+ "loss": 0.6639,
1439
+ "step": 1940
1440
+ },
1441
+ {
1442
+ "epoch": 8.05929203539823,
1443
+ "grad_norm": 1.0106779336929321,
1444
+ "learning_rate": 7.620452310717798e-06,
1445
+ "loss": 0.6665,
1446
+ "step": 1950
1447
+ },
1448
+ {
1449
+ "epoch": 8.063716814159292,
1450
+ "grad_norm": 1.03801691532135,
1451
+ "learning_rate": 7.374631268436579e-06,
1452
+ "loss": 0.6768,
1453
+ "step": 1960
1454
+ },
1455
+ {
1456
+ "epoch": 8.068141592920353,
1457
+ "grad_norm": 1.242561936378479,
1458
+ "learning_rate": 7.128810226155359e-06,
1459
+ "loss": 0.6154,
1460
+ "step": 1970
1461
+ },
1462
+ {
1463
+ "epoch": 8.072566371681416,
1464
+ "grad_norm": 0.9356180429458618,
1465
+ "learning_rate": 6.88298918387414e-06,
1466
+ "loss": 0.6564,
1467
+ "step": 1980
1468
+ },
1469
+ {
1470
+ "epoch": 8.076991150442478,
1471
+ "grad_norm": 1.3207311630249023,
1472
+ "learning_rate": 6.6371681415929215e-06,
1473
+ "loss": 0.7182,
1474
+ "step": 1990
1475
+ },
1476
+ {
1477
+ "epoch": 8.08141592920354,
1478
+ "grad_norm": 1.6134599447250366,
1479
+ "learning_rate": 6.3913470993117005e-06,
1480
+ "loss": 0.6538,
1481
+ "step": 2000
1482
+ },
1483
+ {
1484
+ "epoch": 8.085840707964602,
1485
+ "grad_norm": 1.317514419555664,
1486
+ "learning_rate": 6.145526057030482e-06,
1487
+ "loss": 0.6756,
1488
+ "step": 2010
1489
+ },
1490
+ {
1491
+ "epoch": 8.090265486725663,
1492
+ "grad_norm": 1.5709384679794312,
1493
+ "learning_rate": 5.899705014749263e-06,
1494
+ "loss": 0.646,
1495
+ "step": 2020
1496
+ },
1497
+ {
1498
+ "epoch": 8.094690265486726,
1499
+ "grad_norm": 0.9243668913841248,
1500
+ "learning_rate": 5.653883972468043e-06,
1501
+ "loss": 0.6908,
1502
+ "step": 2030
1503
+ },
1504
+ {
1505
+ "epoch": 8.099115044247787,
1506
+ "grad_norm": 0.859581708908081,
1507
+ "learning_rate": 5.408062930186824e-06,
1508
+ "loss": 0.6964,
1509
+ "step": 2040
1510
+ },
1511
+ {
1512
+ "epoch": 8.100442477876106,
1513
+ "eval_accuracy": 0.7655317655317655,
1514
+ "eval_loss": 0.6591010093688965,
1515
+ "eval_runtime": 997.7811,
1516
+ "eval_samples_per_second": 2.855,
1517
+ "eval_steps_per_second": 0.09,
1518
+ "step": 2043
1519
+ },
1520
+ {
1521
+ "epoch": 9.003097345132744,
1522
+ "grad_norm": 1.7191717624664307,
1523
+ "learning_rate": 5.162241887905605e-06,
1524
+ "loss": 0.6788,
1525
+ "step": 2050
1526
+ },
1527
+ {
1528
+ "epoch": 9.007522123893805,
1529
+ "grad_norm": 1.2770448923110962,
1530
+ "learning_rate": 4.9164208456243854e-06,
1531
+ "loss": 0.6437,
1532
+ "step": 2060
1533
+ },
1534
+ {
1535
+ "epoch": 9.011946902654866,
1536
+ "grad_norm": 1.0872125625610352,
1537
+ "learning_rate": 4.670599803343166e-06,
1538
+ "loss": 0.6984,
1539
+ "step": 2070
1540
+ },
1541
+ {
1542
+ "epoch": 9.01637168141593,
1543
+ "grad_norm": 0.8692576289176941,
1544
+ "learning_rate": 4.424778761061947e-06,
1545
+ "loss": 0.6377,
1546
+ "step": 2080
1547
+ },
1548
+ {
1549
+ "epoch": 9.02079646017699,
1550
+ "grad_norm": 0.9820857048034668,
1551
+ "learning_rate": 4.178957718780728e-06,
1552
+ "loss": 0.6671,
1553
+ "step": 2090
1554
+ },
1555
+ {
1556
+ "epoch": 9.025221238938054,
1557
+ "grad_norm": 1.6207181215286255,
1558
+ "learning_rate": 3.933136676499508e-06,
1559
+ "loss": 0.683,
1560
+ "step": 2100
1561
+ },
1562
+ {
1563
+ "epoch": 9.029646017699115,
1564
+ "grad_norm": 1.0268634557724,
1565
+ "learning_rate": 3.6873156342182893e-06,
1566
+ "loss": 0.6381,
1567
+ "step": 2110
1568
+ },
1569
+ {
1570
+ "epoch": 9.034070796460178,
1571
+ "grad_norm": 1.0367904901504517,
1572
+ "learning_rate": 3.44149459193707e-06,
1573
+ "loss": 0.6969,
1574
+ "step": 2120
1575
+ },
1576
+ {
1577
+ "epoch": 9.038495575221239,
1578
+ "grad_norm": 1.4884945154190063,
1579
+ "learning_rate": 3.1956735496558502e-06,
1580
+ "loss": 0.6342,
1581
+ "step": 2130
1582
+ },
1583
+ {
1584
+ "epoch": 9.042920353982302,
1585
+ "grad_norm": 1.086195707321167,
1586
+ "learning_rate": 2.9498525073746313e-06,
1587
+ "loss": 0.6649,
1588
+ "step": 2140
1589
+ },
1590
+ {
1591
+ "epoch": 9.047345132743363,
1592
+ "grad_norm": 1.0393646955490112,
1593
+ "learning_rate": 2.704031465093412e-06,
1594
+ "loss": 0.6646,
1595
+ "step": 2150
1596
+ },
1597
+ {
1598
+ "epoch": 9.051769911504424,
1599
+ "grad_norm": 1.188118815422058,
1600
+ "learning_rate": 2.4582104228121927e-06,
1601
+ "loss": 0.6681,
1602
+ "step": 2160
1603
+ },
1604
+ {
1605
+ "epoch": 9.056194690265487,
1606
+ "grad_norm": 1.3695433139801025,
1607
+ "learning_rate": 2.2123893805309734e-06,
1608
+ "loss": 0.667,
1609
+ "step": 2170
1610
+ },
1611
+ {
1612
+ "epoch": 9.060619469026548,
1613
+ "grad_norm": 1.166283130645752,
1614
+ "learning_rate": 1.966568338249754e-06,
1615
+ "loss": 0.679,
1616
+ "step": 2180
1617
+ },
1618
+ {
1619
+ "epoch": 9.065044247787611,
1620
+ "grad_norm": 0.956119179725647,
1621
+ "learning_rate": 1.720747295968535e-06,
1622
+ "loss": 0.6528,
1623
+ "step": 2190
1624
+ },
1625
+ {
1626
+ "epoch": 9.069469026548672,
1627
+ "grad_norm": 1.2216787338256836,
1628
+ "learning_rate": 1.4749262536873157e-06,
1629
+ "loss": 0.6857,
1630
+ "step": 2200
1631
+ },
1632
+ {
1633
+ "epoch": 9.073893805309735,
1634
+ "grad_norm": 1.2176066637039185,
1635
+ "learning_rate": 1.2291052114060964e-06,
1636
+ "loss": 0.6849,
1637
+ "step": 2210
1638
+ },
1639
+ {
1640
+ "epoch": 9.078318584070797,
1641
+ "grad_norm": 0.9874552488327026,
1642
+ "learning_rate": 9.83284169124877e-07,
1643
+ "loss": 0.6788,
1644
+ "step": 2220
1645
+ },
1646
+ {
1647
+ "epoch": 9.082743362831858,
1648
+ "grad_norm": 1.053183913230896,
1649
+ "learning_rate": 7.374631268436578e-07,
1650
+ "loss": 0.6791,
1651
+ "step": 2230
1652
+ },
1653
+ {
1654
+ "epoch": 9.08716814159292,
1655
+ "grad_norm": 1.2449108362197876,
1656
+ "learning_rate": 4.916420845624385e-07,
1657
+ "loss": 0.663,
1658
+ "step": 2240
1659
+ },
1660
+ {
1661
+ "epoch": 9.091592920353982,
1662
+ "grad_norm": 0.9063498377799988,
1663
+ "learning_rate": 2.4582104228121926e-07,
1664
+ "loss": 0.6794,
1665
+ "step": 2250
1666
+ },
1667
+ {
1668
+ "epoch": 9.096017699115045,
1669
+ "grad_norm": 1.1721782684326172,
1670
+ "learning_rate": 0.0,
1671
+ "loss": 0.6943,
1672
+ "step": 2260
1673
+ },
1674
+ {
1675
+ "epoch": 9.096017699115045,
1676
+ "eval_accuracy": 0.7665847665847666,
1677
+ "eval_loss": 0.6571447253227234,
1678
+ "eval_runtime": 1006.2992,
1679
+ "eval_samples_per_second": 2.831,
1680
+ "eval_steps_per_second": 0.089,
1681
+ "step": 2260
1682
+ },
1683
+ {
1684
+ "epoch": 9.096017699115045,
1685
+ "step": 2260,
1686
+ "total_flos": 2.277913504770854e+19,
1687
+ "train_loss": 0.7694057713567684,
1688
+ "train_runtime": 44318.5607,
1689
+ "train_samples_per_second": 1.632,
1690
+ "train_steps_per_second": 0.051
1691
+ },
1692
+ {
1693
+ "epoch": 9.096017699115045,
1694
+ "eval_accuracy": 0.7652173913043478,
1695
+ "eval_loss": 0.663335919380188,
1696
+ "eval_runtime": 925.6485,
1697
+ "eval_samples_per_second": 2.982,
1698
+ "eval_steps_per_second": 0.094,
1699
+ "step": 2260
1700
+ },
1701
+ {
1702
+ "epoch": 9.096017699115045,
1703
+ "eval_accuracy": 0.7652173913043478,
1704
+ "eval_loss": 0.6633358597755432,
1705
+ "eval_runtime": 903.9019,
1706
+ "eval_samples_per_second": 3.053,
1707
+ "eval_steps_per_second": 0.096,
1708
+ "step": 2260
1709
  }
1710
  ],
1711
  "logging_steps": 10,
1712
+ "max_steps": 2260,
1713
  "num_input_tokens_seen": 0,
1714
  "num_train_epochs": 9223372036854775807,
1715
  "save_steps": 500,
 
1725
  "attributes": {}
1726
  }
1727
  },
1728
+ "total_flos": 2.277913504770854e+19,
1729
+ "train_batch_size": 32,
1730
  "trial_name": null,
1731
  "trial_params": null
1732
  }