Model save

Browse files

Files changed (7) hide show

README.md +4 -6
all_results.json +6 -6
config.json +1 -1
model.safetensors +1 -1
train_results.json +6 -6
trainer_state.json +581 -1582
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -1,19 +1,17 @@
 ---
 base_model: Qwen/Qwen2.5-1.5B-Instruct
-datasets: HuggingFaceH4/Bespoke-Stratos-17k
 library_name: transformers
-model_name: Qwen/Qwen2.5-1.5B-Instruct
 tags:
 - generated_from_trainer
-- open-r1
 - trl
 - sft
 licence: license
 ---
-# Model Card for Qwen/Qwen2.5-1.5B-Instruct
-This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the [HuggingFaceH4/Bespoke-Stratos-17k](https://huggingface.co/datasets/HuggingFaceH4/Bespoke-Stratos-17k) dataset.
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/shiqi_1/huggingface/runs/2klnrejm)
 This model was trained with SFT.

 ---
 base_model: Qwen/Qwen2.5-1.5B-Instruct
 library_name: transformers
+model_name: Qwen2.5-1.5B-Open-R1-Distill
 tags:
 - generated_from_trainer
 - trl
 - sft
 licence: license
 ---
+# Model Card for Qwen2.5-1.5B-Open-R1-Distill
+This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/shiqi_1/huggingface/runs/ak49qf9r)
 This model was trained with SFT.

all_results.json CHANGED Viewed

@@ -1,14 +1,14 @@
 {
-    "epoch": 0.9994447529150472,
     "eval_loss": 0.7437080144882202,
     "eval_runtime": 13.7531,
     "eval_samples": 100,
     "eval_samples_per_second": 9.307,
     "eval_steps_per_second": 2.327,
-    "total_flos": 76902580617216.0,
-    "train_loss": 0.7594085027553417,
-    "train_runtime": 8931.1449,
     "train_samples": 16610,
-    "train_samples_per_second": 2.42,
-    "train_steps_per_second": 0.151
 }

 {
+    "epoch": 0.999259807549963,
     "eval_loss": 0.7437080144882202,
     "eval_runtime": 13.7531,
     "eval_samples": 100,
     "eval_samples_per_second": 9.307,
     "eval_steps_per_second": 2.327,
+    "total_flos": 76888336760832.0,
+    "train_loss": 0.7675936229140671,
+    "train_runtime": 4627.4844,
     "train_samples": 16610,
+    "train_samples_per_second": 4.67,
+    "train_steps_per_second": 0.146
 }

config.json CHANGED Viewed

@@ -23,7 +23,7 @@
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.49.0.dev0",
-  "use_cache": true,
   "use_sliding_window": false,
   "vocab_size": 151936
 }

   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.49.0.dev0",
+  "use_cache": false,
   "use_sliding_window": false,
   "vocab_size": 151936
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b31468cf55d32779e4c1fd9e1d076cf687a4ebdc81989daf2bba4471cc9f355e
 size 3087467144

 version https://git-lfs.github.com/spec/v1
+oid sha256:af43e3f1e581c8d60a24df84d884fb99613a2dff51fa2c3605e5a1fc0cbe43d2
 size 3087467144

train_results.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-    "epoch": 0.9994447529150472,
-    "total_flos": 76902580617216.0,
-    "train_loss": 0.7594085027553417,
-    "train_runtime": 8931.1449,
     "train_samples": 16610,
-    "train_samples_per_second": 2.42,
-    "train_steps_per_second": 0.151
 }

 {
+    "epoch": 0.999259807549963,
+    "total_flos": 76888336760832.0,
+    "train_loss": 0.7675936229140671,
+    "train_runtime": 4627.4844,
     "train_samples": 16610,
+    "train_samples_per_second": 4.67,
+    "train_steps_per_second": 0.146
 }

trainer_state.json CHANGED Viewed

@@ -1,2019 +1,1018 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.9994447529150472,
   "eval_steps": 100,
-  "global_step": 1350,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0037016472330186935,
-      "grad_norm": 2.6187397645490957,
-      "learning_rate": 7.407407407407407e-07,
-      "loss": 1.1051,
       "step": 5
     },
     {
-      "epoch": 0.007403294466037387,
-      "grad_norm": 2.626149854731043,
-      "learning_rate": 1.4814814814814815e-06,
-      "loss": 1.0488,
       "step": 10
     },
     {
-      "epoch": 0.01110494169905608,
-      "grad_norm": 2.1918725080390606,
-      "learning_rate": 2.222222222222222e-06,
-      "loss": 1.0739,
       "step": 15
     },
     {
-      "epoch": 0.014806588932074774,
-      "grad_norm": 1.664203092963064,
-      "learning_rate": 2.962962962962963e-06,
-      "loss": 1.0591,
       "step": 20
     },
     {
-      "epoch": 0.018508236165093468,
-      "grad_norm": 1.6572531314700638,
-      "learning_rate": 3.7037037037037037e-06,
-      "loss": 1.0203,
       "step": 25
     },
     {
-      "epoch": 0.02220988339811216,
-      "grad_norm": 1.2520837580448105,
-      "learning_rate": 4.444444444444444e-06,
-      "loss": 1.0001,
       "step": 30
     },
     {
-      "epoch": 0.025911530631130855,
-      "grad_norm": 1.1838717775548426,
-      "learning_rate": 5.185185185185185e-06,
-      "loss": 0.937,
       "step": 35
     },
     {
-      "epoch": 0.029613177864149548,
-      "grad_norm": 1.0877804851125874,
-      "learning_rate": 5.925925925925926e-06,
-      "loss": 0.9428,
       "step": 40
     },
     {
-      "epoch": 0.03331482509716824,
-      "grad_norm": 1.0468818723804836,
-      "learning_rate": 6.666666666666667e-06,
-      "loss": 0.9189,
       "step": 45
     },
     {
-      "epoch": 0.037016472330186935,
-      "grad_norm": 0.9963468132619961,
-      "learning_rate": 7.4074074074074075e-06,
-      "loss": 0.8983,
       "step": 50
     },
     {
-      "epoch": 0.040718119563205625,
-      "grad_norm": 0.9587759497954821,
-      "learning_rate": 8.148148148148148e-06,
-      "loss": 0.8727,
       "step": 55
     },
     {
-      "epoch": 0.04441976679622432,
-      "grad_norm": 0.9190997603177266,
-      "learning_rate": 8.888888888888888e-06,
-      "loss": 0.8744,
       "step": 60
     },
     {
-      "epoch": 0.04812141402924301,
-      "grad_norm": 0.9023307139402018,
-      "learning_rate": 9.62962962962963e-06,
-      "loss": 0.8342,
       "step": 65
     },
     {
-      "epoch": 0.05182306126226171,
-      "grad_norm": 1.0636801138652445,
-      "learning_rate": 1.037037037037037e-05,
-      "loss": 0.8617,
       "step": 70
     },
     {
-      "epoch": 0.0555247084952804,
-      "grad_norm": 0.921690264948927,
-      "learning_rate": 1.1111111111111113e-05,
-      "loss": 0.8508,
       "step": 75
     },
     {
-      "epoch": 0.059226355728299096,
-      "grad_norm": 0.9474476132241261,
-      "learning_rate": 1.1851851851851852e-05,
-      "loss": 0.8151,
       "step": 80
     },
     {
-      "epoch": 0.06292800296131779,
-      "grad_norm": 0.8997909905563047,
-      "learning_rate": 1.2592592592592593e-05,
-      "loss": 0.8292,
       "step": 85
     },
     {
-      "epoch": 0.06662965019433648,
-      "grad_norm": 0.9441138820892802,
-      "learning_rate": 1.3333333333333333e-05,
-      "loss": 0.8196,
       "step": 90
     },
     {
-      "epoch": 0.07033129742735518,
-      "grad_norm": 0.928380523664138,
-      "learning_rate": 1.4074074074074075e-05,
-      "loss": 0.8431,
       "step": 95
     },
     {
-      "epoch": 0.07403294466037387,
-      "grad_norm": 0.913846755266702,
-      "learning_rate": 1.4814814814814815e-05,
-      "loss": 0.8401,
       "step": 100
     },
     {
-      "epoch": 0.07403294466037387,
-      "eval_loss": 0.8453992605209351,
-      "eval_runtime": 13.9552,
-      "eval_samples_per_second": 9.172,
-      "eval_steps_per_second": 2.293,
       "step": 100
     },
     {
-      "epoch": 0.07773459189339256,
-      "grad_norm": 1.0066526224903312,
-      "learning_rate": 1.555555555555556e-05,
-      "loss": 0.8371,
       "step": 105
     },
     {
-      "epoch": 0.08143623912641125,
-      "grad_norm": 0.9627584187646963,
-      "learning_rate": 1.6296296296296297e-05,
-      "loss": 0.8401,
       "step": 110
     },
     {
-      "epoch": 0.08513788635942994,
-      "grad_norm": 1.043835378412796,
-      "learning_rate": 1.7037037037037038e-05,
-      "loss": 0.8056,
       "step": 115
     },
     {
-      "epoch": 0.08883953359244864,
-      "grad_norm": 1.0717871960568617,
-      "learning_rate": 1.7777777777777777e-05,
-      "loss": 0.8279,
       "step": 120
     },
     {
-      "epoch": 0.09254118082546733,
-      "grad_norm": 0.9708258016301404,
-      "learning_rate": 1.851851851851852e-05,
-      "loss": 0.7864,
       "step": 125
     },
     {
-      "epoch": 0.09624282805848602,
-      "grad_norm": 0.9639104638720157,
-      "learning_rate": 1.925925925925926e-05,
-      "loss": 0.8442,
       "step": 130
     },
     {
-      "epoch": 0.09994447529150471,
-      "grad_norm": 1.062054926130498,
-      "learning_rate": 2e-05,
-      "loss": 0.7993,
       "step": 135
     },
     {
-      "epoch": 0.10364612252452342,
-      "grad_norm": 1.0677672852122793,
-      "learning_rate": 1.9999164298554375e-05,
-      "loss": 0.8285,
       "step": 140
     },
     {
-      "epoch": 0.10734776975754211,
-      "grad_norm": 1.0144467644285207,
-      "learning_rate": 1.9996657333896875e-05,
-      "loss": 0.8313,
       "step": 145
     },
     {
-      "epoch": 0.1110494169905608,
-      "grad_norm": 1.179093283782147,
-      "learning_rate": 1.9992479525042305e-05,
-      "loss": 0.788,
       "step": 150
     },
     {
-      "epoch": 0.11475106422357949,
-      "grad_norm": 1.0894219697825935,
-      "learning_rate": 1.9986631570270835e-05,
-      "loss": 0.8223,
       "step": 155
     },
     {
-      "epoch": 0.11845271145659819,
-      "grad_norm": 0.9611330729714898,
-      "learning_rate": 1.9979114447011323e-05,
-      "loss": 0.7773,
       "step": 160
     },
     {
-      "epoch": 0.12215435868961688,
-      "grad_norm": 0.8836942084290859,
-      "learning_rate": 1.996992941167792e-05,
-      "loss": 0.7919,
       "step": 165
     },
     {
-      "epoch": 0.12585600592263557,
-      "grad_norm": 0.9325794007240239,
-      "learning_rate": 1.9959077999460094e-05,
-      "loss": 0.7888,
       "step": 170
     },
     {
-      "epoch": 0.12955765315565426,
-      "grad_norm": 0.9951559311754377,
-      "learning_rate": 1.9946562024066018e-05,
-      "loss": 0.7817,
       "step": 175
     },
     {
-      "epoch": 0.13325930038867295,
-      "grad_norm": 0.9681713465677686,
-      "learning_rate": 1.9932383577419432e-05,
-      "loss": 0.8005,
       "step": 180
     },
     {
-      "epoch": 0.13696094762169164,
-      "grad_norm": 1.0088921380567866,
-      "learning_rate": 1.991654502931001e-05,
-      "loss": 0.7749,
       "step": 185
     },
     {
-      "epoch": 0.14066259485471036,
-      "grad_norm": 1.0845994864602486,
-      "learning_rate": 1.9899049026997272e-05,
-      "loss": 0.7994,
       "step": 190
     },
     {
-      "epoch": 0.14436424208772905,
-      "grad_norm": 0.8934316580644518,
-      "learning_rate": 1.9879898494768093e-05,
-      "loss": 0.8145,
       "step": 195
     },
     {
-      "epoch": 0.14806588932074774,
-      "grad_norm": 1.0410789338035273,
-      "learning_rate": 1.9859096633447965e-05,
-      "loss": 0.8111,
       "step": 200
     },
     {
-      "epoch": 0.14806588932074774,
-      "eval_loss": 0.8161085844039917,
-      "eval_runtime": 13.9613,
-      "eval_samples_per_second": 9.168,
-      "eval_steps_per_second": 2.292,
       "step": 200
     },
     {
-      "epoch": 0.15176753655376643,
-      "grad_norm": 0.9827709342036304,
-      "learning_rate": 1.9836646919866012e-05,
-      "loss": 0.7804,
       "step": 205
     },
     {
-      "epoch": 0.15546918378678512,
-      "grad_norm": 1.003474849989548,
-      "learning_rate": 1.9812553106273848e-05,
-      "loss": 0.7971,
       "step": 210
     },
     {
-      "epoch": 0.1591708310198038,
-      "grad_norm": 0.9703054353773543,
-      "learning_rate": 1.9786819219718443e-05,
-      "loss": 0.7735,
       "step": 215
     },
     {
-      "epoch": 0.1628724782528225,
-      "grad_norm": 0.8887456196929027,
-      "learning_rate": 1.9759449561369036e-05,
-      "loss": 0.7913,
       "step": 220
     },
     {
-      "epoch": 0.1665741254858412,
-      "grad_norm": 0.9432643025255308,
-      "learning_rate": 1.973044870579824e-05,
-      "loss": 0.7996,
       "step": 225
     },
     {
-      "epoch": 0.17027577271885988,
-      "grad_norm": 0.839226549249778,
-      "learning_rate": 1.9699821500217436e-05,
-      "loss": 0.7226,
       "step": 230
     },
     {
-      "epoch": 0.1739774199518786,
-      "grad_norm": 1.091867770357073,
-      "learning_rate": 1.9667573063666622e-05,
-      "loss": 0.7977,
       "step": 235
     },
     {
-      "epoch": 0.1776790671848973,
-      "grad_norm": 0.8957970876878486,
-      "learning_rate": 1.9633708786158803e-05,
-      "loss": 0.7824,
       "step": 240
     },
     {
-      "epoch": 0.18138071441791598,
-      "grad_norm": 0.9277895788218994,
-      "learning_rate": 1.959823432777912e-05,
-      "loss": 0.8142,
       "step": 245
     },
     {
-      "epoch": 0.18508236165093467,
-      "grad_norm": 0.9204433681878583,
-      "learning_rate": 1.95611556177388e-05,
-      "loss": 0.763,
       "step": 250
     },
     {
-      "epoch": 0.18878400888395336,
-      "grad_norm": 0.8426481109707195,
-      "learning_rate": 1.9522478853384154e-05,
-      "loss": 0.7579,
       "step": 255
     },
     {
-      "epoch": 0.19248565611697205,
-      "grad_norm": 0.9563840160596006,
-      "learning_rate": 1.9482210499160767e-05,
-      "loss": 0.8039,
       "step": 260
     },
     {
-      "epoch": 0.19618730334999074,
-      "grad_norm": 0.876265917540201,
-      "learning_rate": 1.9440357285533e-05,
-      "loss": 0.7426,
       "step": 265
     },
     {
-      "epoch": 0.19988895058300943,
-      "grad_norm": 0.8616754726225433,
-      "learning_rate": 1.9396926207859085e-05,
-      "loss": 0.7934,
       "step": 270
     },
     {
-      "epoch": 0.20359059781602815,
-      "grad_norm": 0.9982907243202536,
-      "learning_rate": 1.93519245252219e-05,
-      "loss": 0.7933,
       "step": 275
     },
     {
-      "epoch": 0.20729224504904684,
-      "grad_norm": 0.9048851945663438,
-      "learning_rate": 1.9305359759215686e-05,
-      "loss": 0.7904,
       "step": 280
     },
     {
-      "epoch": 0.21099389228206553,
-      "grad_norm": 0.9101761515606357,
-      "learning_rate": 1.9257239692688907e-05,
-      "loss": 0.7603,
       "step": 285
     },
     {
-      "epoch": 0.21469553951508422,
-      "grad_norm": 0.9117583933872578,
-      "learning_rate": 1.9207572368443386e-05,
-      "loss": 0.7861,
       "step": 290
     },
     {
-      "epoch": 0.2183971867481029,
-      "grad_norm": 1.0650351555940032,
-      "learning_rate": 1.9156366087890062e-05,
-      "loss": 0.788,
       "step": 295
     },
     {
-      "epoch": 0.2220988339811216,
-      "grad_norm": 0.969249362194927,
-      "learning_rate": 1.9103629409661468e-05,
-      "loss": 0.7513,
       "step": 300
     },
     {
-      "epoch": 0.2220988339811216,
-      "eval_loss": 0.799350380897522,
-      "eval_runtime": 13.9587,
-      "eval_samples_per_second": 9.17,
-      "eval_steps_per_second": 2.292,
       "step": 300
     },
     {
-      "epoch": 0.2258004812141403,
-      "grad_norm": 0.9004354980972079,
-      "learning_rate": 1.9049371148181253e-05,
-      "loss": 0.7873,
       "step": 305
     },
     {
-      "epoch": 0.22950212844715898,
-      "grad_norm": 0.9646842722396891,
-      "learning_rate": 1.8993600372190933e-05,
-      "loss": 0.811,
       "step": 310
     },
     {
-      "epoch": 0.23320377568017767,
-      "grad_norm": 0.9534884432642204,
-      "learning_rate": 1.8936326403234125e-05,
-      "loss": 0.8029,
       "step": 315
     },
     {
-      "epoch": 0.23690542291319638,
-      "grad_norm": 1.0197381583907712,
-      "learning_rate": 1.8877558814098564e-05,
-      "loss": 0.8012,
       "step": 320
     },
     {
-      "epoch": 0.24060707014621507,
-      "grad_norm": 0.8801464820705825,
-      "learning_rate": 1.881730742721608e-05,
-      "loss": 0.7725,
       "step": 325
     },
     {
-      "epoch": 0.24430871737923376,
-      "grad_norm": 0.9133382986967378,
-      "learning_rate": 1.8755582313020912e-05,
-      "loss": 0.7792,
       "step": 330
     },
     {
-      "epoch": 0.24801036461225245,
-      "grad_norm": 0.9131840239823398,
-      "learning_rate": 1.8692393788266477e-05,
-      "loss": 0.7902,
       "step": 335
     },
     {
-      "epoch": 0.25171201184527114,
-      "grad_norm": 0.9107021211247343,
-      "learning_rate": 1.8627752414301087e-05,
-      "loss": 0.7952,
       "step": 340
     },
     {
-      "epoch": 0.25541365907828983,
-      "grad_norm": 0.8995888119431379,
-      "learning_rate": 1.8561668995302668e-05,
-      "loss": 0.7901,
       "step": 345
     },
     {
-      "epoch": 0.2591153063113085,
-      "grad_norm": 0.852186358263588,
-      "learning_rate": 1.8494154576472976e-05,
-      "loss": 0.7695,
       "step": 350
     },
     {
-      "epoch": 0.2628169535443272,
-      "grad_norm": 0.8396394207930936,
-      "learning_rate": 1.8425220442191496e-05,
-      "loss": 0.7635,
       "step": 355
     },
     {
-      "epoch": 0.2665186007773459,
-      "grad_norm": 0.8979065139575588,
-      "learning_rate": 1.8354878114129368e-05,
-      "loss": 0.7397,
       "step": 360
     },
     {
-      "epoch": 0.2702202480103646,
-      "grad_norm": 0.9830492467939624,
-      "learning_rate": 1.8283139349323632e-05,
-      "loss": 0.8034,
       "step": 365
     },
     {
-      "epoch": 0.2739218952433833,
-      "grad_norm": 0.8322915324627383,
-      "learning_rate": 1.8210016138212186e-05,
-      "loss": 0.7463,
       "step": 370
     },
     {
-      "epoch": 0.277623542476402,
-      "grad_norm": 0.758748760459231,
-      "learning_rate": 1.8135520702629677e-05,
-      "loss": 0.7175,
       "step": 375
     },
     {
-      "epoch": 0.2813251897094207,
-      "grad_norm": 0.9061675038903559,
-      "learning_rate": 1.8059665493764745e-05,
-      "loss": 0.7905,
       "step": 380
     },
     {
-      "epoch": 0.2850268369424394,
-      "grad_norm": 0.9120525610140781,
-      "learning_rate": 1.7982463190078928e-05,
-      "loss": 0.7422,
       "step": 385
     },
     {
-      "epoch": 0.2887284841754581,
-      "grad_norm": 0.8683320457225452,
-      "learning_rate": 1.7903926695187595e-05,
-      "loss": 0.7975,
       "step": 390
     },
     {
-      "epoch": 0.2924301314084768,
-      "grad_norm": 0.8975271488943286,
-      "learning_rate": 1.78240691357032e-05,
-      "loss": 0.766,
       "step": 395
     },
     {
-      "epoch": 0.2961317786414955,
-      "grad_norm": 0.8416591840774865,
-      "learning_rate": 1.7742903859041324e-05,
-      "loss": 0.7762,
       "step": 400
     },
     {
-      "epoch": 0.2961317786414955,
-      "eval_loss": 0.7873815298080444,
-      "eval_runtime": 13.9696,
-      "eval_samples_per_second": 9.163,
-      "eval_steps_per_second": 2.291,
       "step": 400
     },
     {
-      "epoch": 0.29983342587451417,
-      "grad_norm": 0.8046330563391425,
-      "learning_rate": 1.766044443118978e-05,
-      "loss": 0.7349,
       "step": 405
     },
     {
-      "epoch": 0.30353507310753286,
-      "grad_norm": 0.9009575122141973,
-      "learning_rate": 1.757670463444118e-05,
-      "loss": 0.8051,
       "step": 410
     },
     {
-      "epoch": 0.30723672034055155,
-      "grad_norm": 0.8753276872380578,
-      "learning_rate": 1.749169846508936e-05,
-      "loss": 0.7377,
       "step": 415
     },
     {
-      "epoch": 0.31093836757357024,
-      "grad_norm": 0.9131143743513088,
-      "learning_rate": 1.740544013109005e-05,
-      "loss": 0.7686,
       "step": 420
     },
     {
-      "epoch": 0.31464001480658893,
-      "grad_norm": 0.961063073180003,
-      "learning_rate": 1.7317944049686125e-05,
-      "loss": 0.7572,
       "step": 425
     },
     {
-      "epoch": 0.3183416620396076,
-      "grad_norm": 0.9242168018337253,
-      "learning_rate": 1.722922484499793e-05,
-      "loss": 0.7541,
       "step": 430
     },
     {
-      "epoch": 0.3220433092726263,
-      "grad_norm": 0.8102879065641984,
-      "learning_rate": 1.7139297345578992e-05,
-      "loss": 0.7306,
       "step": 435
     },
     {
-      "epoch": 0.325744956505645,
-      "grad_norm": 0.8772119423935111,
-      "learning_rate": 1.7048176581937562e-05,
-      "loss": 0.7861,
       "step": 440
     },
     {
-      "epoch": 0.3294466037386637,
-      "grad_norm": 0.9421126607604352,
-      "learning_rate": 1.6955877784024418e-05,
-      "loss": 0.7536,
       "step": 445
     },
     {
-      "epoch": 0.3331482509716824,
-      "grad_norm": 0.855630117564788,
-      "learning_rate": 1.686241637868734e-05,
-      "loss": 0.7909,
       "step": 450
     },
     {
-      "epoch": 0.33684989820470107,
-      "grad_norm": 0.9554030247369313,
-      "learning_rate": 1.676780798709262e-05,
-      "loss": 0.7732,
       "step": 455
     },
     {
-      "epoch": 0.34055154543771976,
-      "grad_norm": 0.8311286009688098,
-      "learning_rate": 1.6672068422114195e-05,
-      "loss": 0.8165,
       "step": 460
     },
     {
-      "epoch": 0.3442531926707385,
-      "grad_norm": 0.9484301771196552,
-      "learning_rate": 1.657521368569064e-05,
-      "loss": 0.7728,
       "step": 465
     },
     {
-      "epoch": 0.3479548399037572,
-      "grad_norm": 0.9544835313642219,
-      "learning_rate": 1.647725996615059e-05,
-      "loss": 0.7393,
       "step": 470
     },
     {
-      "epoch": 0.3516564871367759,
-      "grad_norm": 0.8209462524094394,
-      "learning_rate": 1.637822363550706e-05,
-      "loss": 0.7568,
       "step": 475
     },
     {
-      "epoch": 0.3553581343697946,
-      "grad_norm": 0.8358271111736775,
-      "learning_rate": 1.627812124672099e-05,
-      "loss": 0.7617,
       "step": 480
     },
     {
-      "epoch": 0.35905978160281327,
-      "grad_norm": 0.9023899903905284,
-      "learning_rate": 1.6176969530934573e-05,
-      "loss": 0.7367,
       "step": 485
     },
     {
-      "epoch": 0.36276142883583196,
-      "grad_norm": 0.9096525737441111,
-      "learning_rate": 1.6074785394674835e-05,
-      "loss": 0.7623,
       "step": 490
     },
     {
-      "epoch": 0.36646307606885065,
-      "grad_norm": 0.8825159281951512,
-      "learning_rate": 1.5971585917027864e-05,
-      "loss": 0.7428,
       "step": 495
     },
     {
-      "epoch": 0.37016472330186934,
-      "grad_norm": 0.8614364489136819,
-      "learning_rate": 1.586738834678418e-05,
-      "loss": 0.7879,
       "step": 500
     },
     {
-      "epoch": 0.37016472330186934,
-      "eval_loss": 0.7775759100914001,
-      "eval_runtime": 13.9703,
-      "eval_samples_per_second": 9.162,
-      "eval_steps_per_second": 2.291,
       "step": 500
     },
     {
-      "epoch": 0.373866370534888,
-      "grad_norm": 0.825428849040592,
-      "learning_rate": 1.5762210099555804e-05,
-      "loss": 0.7282,
       "step": 505
     },
     {
-      "epoch": 0.3775680177679067,
-      "grad_norm": 0.8915281904864577,
-      "learning_rate": 1.5656068754865388e-05,
-      "loss": 0.7478,
       "step": 510
     },
     {
-      "epoch": 0.3812696650009254,
-      "grad_norm": 0.8495286520615039,
-      "learning_rate": 1.554898205320797e-05,
-      "loss": 0.7189,
       "step": 515
     },
     {
-      "epoch": 0.3849713122339441,
-      "grad_norm": 0.8346171153567965,
-      "learning_rate": 1.5440967893085827e-05,
-      "loss": 0.7418,
       "step": 520
     },
     {
-      "epoch": 0.3886729594669628,
-      "grad_norm": 0.8454426246332264,
-      "learning_rate": 1.5332044328016916e-05,
-      "loss": 0.7427,
       "step": 525
     },
     {
-      "epoch": 0.3923746066999815,
-      "grad_norm": 0.8172271580093257,
-      "learning_rate": 1.5222229563517385e-05,
-      "loss": 0.766,
       "step": 530
     },
     {
-      "epoch": 0.39607625393300017,
-      "grad_norm": 0.756080108149318,
-      "learning_rate": 1.5111541954058733e-05,
-      "loss": 0.7552,
       "step": 535
     },
     {
-      "epoch": 0.39977790116601886,
-      "grad_norm": 0.9286111422412047,
-      "learning_rate": 1.5000000000000002e-05,
-      "loss": 0.7139,
       "step": 540
     },
     {
-      "epoch": 0.40347954839903755,
-      "grad_norm": 0.901270025112976,
-      "learning_rate": 1.4887622344495643e-05,
-      "loss": 0.766,
       "step": 545
     },
     {
-      "epoch": 0.4071811956320563,
-      "grad_norm": 0.8518885453623255,
-      "learning_rate": 1.4774427770379492e-05,
-      "loss": 0.7524,
       "step": 550
     },
     {
-      "epoch": 0.410882842865075,
-      "grad_norm": 0.7850923823705602,
-      "learning_rate": 1.4660435197025391e-05,
-      "loss": 0.7303,
       "step": 555
     },
     {
-      "epoch": 0.41458449009809367,
-      "grad_norm": 0.8809118291053211,
-      "learning_rate": 1.4545663677185007e-05,
-      "loss": 0.7508,
       "step": 560
     },
     {
-      "epoch": 0.41828613733111236,
-      "grad_norm": 0.9204090822346441,
-      "learning_rate": 1.4430132393803353e-05,
-      "loss": 0.7577,
       "step": 565
     },
     {
-      "epoch": 0.42198778456413105,
-      "grad_norm": 0.88540873125467,
-      "learning_rate": 1.4313860656812537e-05,
-      "loss": 0.7451,
       "step": 570
     },
     {
-      "epoch": 0.42568943179714974,
-      "grad_norm": 0.9404522480973243,
-      "learning_rate": 1.4196867899904292e-05,
-      "loss": 0.7628,
       "step": 575
     },
     {
-      "epoch": 0.42939107903016843,
-      "grad_norm": 0.9134791919932267,
-      "learning_rate": 1.4079173677281836e-05,
-      "loss": 0.7958,
       "step": 580
     },
     {
-      "epoch": 0.4330927262631871,
-      "grad_norm": 0.8547063134065127,
-      "learning_rate": 1.396079766039157e-05,
-      "loss": 0.7423,
       "step": 585
     },
     {
-      "epoch": 0.4367943734962058,
-      "grad_norm": 0.8271676690566563,
-      "learning_rate": 1.3841759634635177e-05,
-      "loss": 0.7353,
       "step": 590
     },
     {
-      "epoch": 0.4404960207292245,
-      "grad_norm": 0.8262219798664319,
-      "learning_rate": 1.3722079496062702e-05,
-      "loss": 0.7601,
       "step": 595
     },
     {
-      "epoch": 0.4441976679622432,
-      "grad_norm": 0.9475599443388361,
-      "learning_rate": 1.3601777248047105e-05,
-      "loss": 0.7595,
       "step": 600
     },
     {
-      "epoch": 0.4441976679622432,
-      "eval_loss": 0.772025465965271,
-      "eval_runtime": 13.9495,
-      "eval_samples_per_second": 9.176,
-      "eval_steps_per_second": 2.294,
       "step": 600
     },
     {
-      "epoch": 0.4478993151952619,
-      "grad_norm": 0.9511942323230904,
-      "learning_rate": 1.3480872997940906e-05,
-      "loss": 0.7392,
       "step": 605
     },
     {
-      "epoch": 0.4516009624282806,
-      "grad_norm": 0.8807518033426519,
-      "learning_rate": 1.3359386953715423e-05,
-      "loss": 0.7627,
       "step": 610
     },
     {
-      "epoch": 0.45530260966129926,
-      "grad_norm": 0.8362844884857521,
-      "learning_rate": 1.3237339420583213e-05,
-      "loss": 0.7117,
       "step": 615
     },
     {
-      "epoch": 0.45900425689431795,
-      "grad_norm": 0.8150528468524839,
-      "learning_rate": 1.3114750797604248e-05,
-      "loss": 0.747,
       "step": 620
     },
     {
-      "epoch": 0.46270590412733664,
-      "grad_norm": 0.8362983698312987,
-      "learning_rate": 1.2991641574276419e-05,
-      "loss": 0.7419,
       "step": 625
     },
     {
-      "epoch": 0.46640755136035533,
-      "grad_norm": 0.9050332105138127,
-      "learning_rate": 1.2868032327110904e-05,
-      "loss": 0.752,
       "step": 630
     },
     {
-      "epoch": 0.4701091985933741,
-      "grad_norm": 0.9133538944566808,
-      "learning_rate": 1.2743943716193017e-05,
-      "loss": 0.7402,
       "step": 635
     },
     {
-      "epoch": 0.47381084582639277,
-      "grad_norm": 0.8619981266901199,
-      "learning_rate": 1.261939648172906e-05,
-      "loss": 0.7226,
       "step": 640
     },
     {
-      "epoch": 0.47751249305941146,
-      "grad_norm": 0.7987558090786966,
-      "learning_rate": 1.2494411440579814e-05,
-      "loss": 0.7342,
       "step": 645
     },
     {
-      "epoch": 0.48121414029243015,
-      "grad_norm": 0.8486334284082256,
-      "learning_rate": 1.2369009482781191e-05,
-      "loss": 0.7366,
       "step": 650
     },
     {
-      "epoch": 0.48491578752544884,
-      "grad_norm": 0.8017574485719782,
-      "learning_rate": 1.2243211568052678e-05,
-      "loss": 0.7557,
       "step": 655
     },
     {
-      "epoch": 0.48861743475846753,
-      "grad_norm": 0.7936854819266611,
-      "learning_rate": 1.211703872229411e-05,
-      "loss": 0.7365,
       "step": 660
     },
     {
-      "epoch": 0.4923190819914862,
-      "grad_norm": 0.8228422615712129,
-      "learning_rate": 1.1990512034071407e-05,
-      "loss": 0.7357,
       "step": 665
     },
     {
-      "epoch": 0.4960207292245049,
-      "grad_norm": 0.8799075927771709,
-      "learning_rate": 1.1863652651091824e-05,
-      "loss": 0.7483,
       "step": 670
     },
     {
-      "epoch": 0.4997223764575236,
-      "grad_norm": 0.7966461968332786,
-      "learning_rate": 1.1736481776669307e-05,
-      "loss": 0.7624,
-      "step": 675
-    },
-    {
-      "epoch": 0.5034240236905423,
-      "grad_norm": 0.8566725582276603,
-      "learning_rate": 1.1609020666180574e-05,
-      "loss": 0.7285,
-      "step": 680
-    },
-    {
-      "epoch": 0.507125670923561,
-      "grad_norm": 0.8886455444981616,
-      "learning_rate": 1.1481290623512491e-05,
-      "loss": 0.7679,
-      "step": 685
-    },
-    {
-      "epoch": 0.5108273181565797,
-      "grad_norm": 0.9012557873417587,
-      "learning_rate": 1.1353312997501313e-05,
-      "loss": 0.7078,
-      "step": 690
-    },
-    {
-      "epoch": 0.5145289653895984,
-      "grad_norm": 0.8446186262555797,
-      "learning_rate": 1.1225109178364456e-05,
-      "loss": 0.7482,
-      "step": 695
-    },
-    {
-      "epoch": 0.518230612622617,
-      "grad_norm": 0.8295700413779421,
-      "learning_rate": 1.1096700594125318e-05,
-      "loss": 0.7021,
-      "step": 700
-    },
-    {
-      "epoch": 0.518230612622617,
-      "eval_loss": 0.7634979486465454,
-      "eval_runtime": 13.9953,
-      "eval_samples_per_second": 9.146,
-      "eval_steps_per_second": 2.286,
-      "step": 700
-    },
-    {
-      "epoch": 0.5219322598556357,
-      "grad_norm": 0.8742356168350276,
-      "learning_rate": 1.0968108707031792e-05,
-      "loss": 0.7329,
-      "step": 705
-    },
-    {
-      "epoch": 0.5256339070886544,
-      "grad_norm": 0.8314714738177891,
-      "learning_rate": 1.0839355009969068e-05,
-      "loss": 0.7513,
-      "step": 710
-    },
-    {
-      "epoch": 0.5293355543216731,
-      "grad_norm": 0.8420640531277049,
-      "learning_rate": 1.0710461022867303e-05,
-      "loss": 0.7794,
-      "step": 715
-    },
-    {
-      "epoch": 0.5330372015546918,
-      "grad_norm": 0.8414308108330245,
-      "learning_rate": 1.0581448289104759e-05,
-      "loss": 0.7408,
-      "step": 720
-    },
-    {
-      "epoch": 0.5367388487877105,
-      "grad_norm": 0.907927579037995,
-      "learning_rate": 1.0452338371907065e-05,
-      "loss": 0.752,
-      "step": 725
-    },
-    {
-      "epoch": 0.5404404960207292,
-      "grad_norm": 0.9077406887678768,
-      "learning_rate": 1.0323152850743107e-05,
-      "loss": 0.7479,
-      "step": 730
-    },
-    {
-      "epoch": 0.5441421432537479,
-      "grad_norm": 0.8374984177588956,
-      "learning_rate": 1.0193913317718245e-05,
-      "loss": 0.7382,
-      "step": 735
-    },
-    {
-      "epoch": 0.5478437904867666,
-      "grad_norm": 0.7995233932837665,
-      "learning_rate": 1.0064641373965394e-05,
-      "loss": 0.7367,
-      "step": 740
-    },
-    {
-      "epoch": 0.5515454377197853,
-      "grad_norm": 0.9526825161626277,
-      "learning_rate": 9.935358626034607e-06,
-      "loss": 0.7558,
-      "step": 745
-    },
-    {
-      "epoch": 0.555247084952804,
-      "grad_norm": 0.8708097624074678,
-      "learning_rate": 9.806086682281759e-06,
-      "loss": 0.7544,
-      "step": 750
-    },
-    {
-      "epoch": 0.5589487321858226,
-      "grad_norm": 0.7982114810095368,
-      "learning_rate": 9.676847149256894e-06,
-      "loss": 0.7529,
-      "step": 755
-    },
-    {
-      "epoch": 0.5626503794188414,
-      "grad_norm": 0.8524546420192315,
-      "learning_rate": 9.547661628092938e-06,
-      "loss": 0.7387,
-      "step": 760
-    },
-    {
-      "epoch": 0.5663520266518601,
-      "grad_norm": 0.8546073655934241,
-      "learning_rate": 9.418551710895243e-06,
-      "loss": 0.7317,
-      "step": 765
-    },
-    {
-      "epoch": 0.5700536738848788,
-      "grad_norm": 0.8807333695737656,
-      "learning_rate": 9.289538977132702e-06,
-      "loss": 0.686,
-      "step": 770
-    },
-    {
-      "epoch": 0.5737553211178975,
-      "grad_norm": 0.844968358054644,
-      "learning_rate": 9.160644990030932e-06,
-      "loss": 0.7556,
-      "step": 775
-    },
-    {
-      "epoch": 0.5774569683509162,
-      "grad_norm": 0.7662463262387722,
-      "learning_rate": 9.03189129296821e-06,
-      "loss": 0.7265,
-      "step": 780
-    },
-    {
-      "epoch": 0.5811586155839349,
-      "grad_norm": 0.8068107490785559,
-      "learning_rate": 8.903299405874685e-06,
-      "loss": 0.7244,
-      "step": 785
-    },
-    {
-      "epoch": 0.5848602628169536,
-      "grad_norm": 0.9017752536168412,
-      "learning_rate": 8.774890821635548e-06,
-      "loss": 0.7151,
-      "step": 790
-    },
-    {
-      "epoch": 0.5885619100499723,
-      "grad_norm": 0.7797811553984105,
-      "learning_rate": 8.646687002498692e-06,
-      "loss": 0.7145,
-      "step": 795
-    },
-    {
-      "epoch": 0.592263557282991,
-      "grad_norm": 0.8674556616777656,
-      "learning_rate": 8.518709376487515e-06,
-      "loss": 0.7357,
-      "step": 800
-    },
-    {
-      "epoch": 0.592263557282991,
-      "eval_loss": 0.7576203346252441,
-      "eval_runtime": 13.9497,
-      "eval_samples_per_second": 9.176,
-      "eval_steps_per_second": 2.294,
-      "step": 800
-    },
-    {
-      "epoch": 0.5959652045160097,
-      "grad_norm": 0.8467559903107525,
-      "learning_rate": 8.390979333819427e-06,
-      "loss": 0.7085,
-      "step": 805
-    },
-    {
-      "epoch": 0.5996668517490283,
-      "grad_norm": 0.8240879183306341,
-      "learning_rate": 8.263518223330698e-06,
-      "loss": 0.7154,
-      "step": 810
-    },
-    {
-      "epoch": 0.603368498982047,
-      "grad_norm": 0.8813244564095002,
-      "learning_rate": 8.13634734890818e-06,
-      "loss": 0.7269,
-      "step": 815
-    },
-    {
-      "epoch": 0.6070701462150657,
-      "grad_norm": 0.8912483807816779,
-      "learning_rate": 8.009487965928597e-06,
-      "loss": 0.7554,
-      "step": 820
-    },
-    {
-      "epoch": 0.6107717934480844,
-      "grad_norm": 0.8571355410783688,
-      "learning_rate": 7.882961277705897e-06,
-      "loss": 0.7447,
-      "step": 825
-    },
-    {
-      "epoch": 0.6144734406811031,
-      "grad_norm": 0.7573684977008671,
-      "learning_rate": 7.756788431947327e-06,
-      "loss": 0.6973,
-      "step": 830
-    },
-    {
-      "epoch": 0.6181750879141218,
-      "grad_norm": 0.8624302347661563,
-      "learning_rate": 7.630990517218809e-06,
-      "loss": 0.7371,
-      "step": 835
-    },
-    {
-      "epoch": 0.6218767351471405,
-      "grad_norm": 0.8516524205337687,
-      "learning_rate": 7.505588559420188e-06,
-      "loss": 0.7341,
-      "step": 840
-    },
-    {
-      "epoch": 0.6255783823801592,
-      "grad_norm": 0.9276498179412384,
-      "learning_rate": 7.380603518270942e-06,
-      "loss": 0.7141,
-      "step": 845
-    },
-    {
-      "epoch": 0.6292800296131779,
-      "grad_norm": 0.8975411826897715,
-      "learning_rate": 7.256056283806987e-06,
-      "loss": 0.7436,
-      "step": 850
-    },
-    {
-      "epoch": 0.6329816768461966,
-      "grad_norm": 0.8474590565828304,
-      "learning_rate": 7.131967672889101e-06,
-      "loss": 0.7493,
-      "step": 855
-    },
-    {
-      "epoch": 0.6366833240792152,
-      "grad_norm": 0.8185387261304637,
-      "learning_rate": 7.008358425723586e-06,
-      "loss": 0.735,
-      "step": 860
-    },
-    {
-      "epoch": 0.6403849713122339,
-      "grad_norm": 0.8128716130836567,
-      "learning_rate": 6.885249202395754e-06,
-      "loss": 0.7442,
-      "step": 865
-    },
-    {
-      "epoch": 0.6440866185452526,
-      "grad_norm": 0.8314128496312047,
-      "learning_rate": 6.762660579416791e-06,
-      "loss": 0.735,
-      "step": 870
-    },
-    {
-      "epoch": 0.6477882657782713,
-      "grad_norm": 0.8396057619913906,
-      "learning_rate": 6.640613046284581e-06,
-      "loss": 0.7277,
-      "step": 875
-    },
-    {
-      "epoch": 0.65148991301129,
-      "grad_norm": 0.8254782849090412,
-      "learning_rate": 6.519127002059096e-06,
-      "loss": 0.7447,
-      "step": 880
-    },
-    {
-      "epoch": 0.6551915602443087,
-      "grad_norm": 0.8091364242123348,
-      "learning_rate": 6.3982227519528986e-06,
-      "loss": 0.7355,
-      "step": 885
-    },
-    {
-      "epoch": 0.6588932074773274,
-      "grad_norm": 0.834856655149733,
-      "learning_rate": 6.277920503937303e-06,
-      "loss": 0.7467,
-      "step": 890
-    },
-    {
-      "epoch": 0.6625948547103461,
-      "grad_norm": 0.8144437357073777,
-      "learning_rate": 6.158240365364823e-06,
-      "loss": 0.7144,
-      "step": 895
-    },
-    {
-      "epoch": 0.6662965019433648,
-      "grad_norm": 0.7827054414778657,
-      "learning_rate": 6.039202339608432e-06,
-      "loss": 0.7261,
-      "step": 900
-    },
-    {
-      "epoch": 0.6662965019433648,
-      "eval_loss": 0.7522028684616089,
-      "eval_runtime": 13.95,
-      "eval_samples_per_second": 9.176,
-      "eval_steps_per_second": 2.294,
-      "step": 900
-    },
-    {
-      "epoch": 0.6699981491763835,
-      "grad_norm": 0.9041340180526984,
-      "learning_rate": 5.920826322718165e-06,
-      "loss": 0.7662,
-      "step": 905
-    },
-    {
-      "epoch": 0.6736997964094021,
-      "grad_norm": 0.8350839367682599,
-      "learning_rate": 5.80313210009571e-06,
-      "loss": 0.7186,
-      "step": 910
-    },
-    {
-      "epoch": 0.6774014436424208,
-      "grad_norm": 0.7780590810310966,
-      "learning_rate": 5.686139343187468e-06,
-      "loss": 0.6972,
-      "step": 915
-    },
-    {
-      "epoch": 0.6811030908754395,
-      "grad_norm": 0.8000697324325545,
-      "learning_rate": 5.569867606196652e-06,
-      "loss": 0.728,
-      "step": 920
-    },
-    {
-      "epoch": 0.6848047381084582,
-      "grad_norm": 0.7467762690487865,
-      "learning_rate": 5.454336322814995e-06,
-      "loss": 0.7037,
-      "step": 925
-    },
-    {
-      "epoch": 0.688506385341477,
-      "grad_norm": 0.777234399717618,
-      "learning_rate": 5.339564802974615e-06,
-      "loss": 0.7157,
-      "step": 930
-    },
-    {
-      "epoch": 0.6922080325744957,
-      "grad_norm": 0.8221362929506051,
-      "learning_rate": 5.2255722296205104e-06,
-      "loss": 0.7155,
-      "step": 935
-    },
-    {
-      "epoch": 0.6959096798075144,
-      "grad_norm": 0.8187388357604898,
-      "learning_rate": 5.112377655504359e-06,
-      "loss": 0.7258,
-      "step": 940
-    },
-    {
-      "epoch": 0.6996113270405331,
-      "grad_norm": 0.837214991569854,
-      "learning_rate": 5.000000000000003e-06,
-      "loss": 0.7379,
-      "step": 945
-    },
-    {
-      "epoch": 0.7033129742735518,
-      "grad_norm": 0.7919335607892027,
-      "learning_rate": 4.888458045941269e-06,
-      "loss": 0.7182,
-      "step": 950
-    },
-    {
-      "epoch": 0.7070146215065705,
-      "grad_norm": 0.745708705122289,
-      "learning_rate": 4.7777704364826175e-06,
-      "loss": 0.7281,
-      "step": 955
-    },
-    {
-      "epoch": 0.7107162687395892,
-      "grad_norm": 0.7855917301440053,
-      "learning_rate": 4.66795567198309e-06,
-      "loss": 0.6976,
-      "step": 960
-    },
-    {
-      "epoch": 0.7144179159726078,
-      "grad_norm": 0.8039341408241852,
-      "learning_rate": 4.559032106914173e-06,
-      "loss": 0.6941,
-      "step": 965
-    },
-    {
-      "epoch": 0.7181195632056265,
-      "grad_norm": 0.7989086472348528,
-      "learning_rate": 4.4510179467920325e-06,
-      "loss": 0.7212,
-      "step": 970
-    },
-    {
-      "epoch": 0.7218212104386452,
-      "grad_norm": 0.7552035261702617,
-      "learning_rate": 4.343931245134616e-06,
-      "loss": 0.7024,
-      "step": 975
-    },
-    {
-      "epoch": 0.7255228576716639,
-      "grad_norm": 0.7357103146927452,
-      "learning_rate": 4.237789900444197e-06,
-      "loss": 0.7508,
-      "step": 980
-    },
-    {
-      "epoch": 0.7292245049046826,
-      "grad_norm": 0.8059912605188725,
-      "learning_rate": 4.132611653215822e-06,
-      "loss": 0.7212,
-      "step": 985
-    },
-    {
-      "epoch": 0.7329261521377013,
-      "grad_norm": 0.8301874562284118,
-      "learning_rate": 4.028414082972141e-06,
-      "loss": 0.7169,
-      "step": 990
-    },
-    {
-      "epoch": 0.73662779937072,
-      "grad_norm": 0.7945303825545997,
-      "learning_rate": 3.925214605325164e-06,
-      "loss": 0.7322,
-      "step": 995
-    },
-    {
-      "epoch": 0.7403294466037387,
-      "grad_norm": 0.8561858117102767,
-      "learning_rate": 3.823030469065431e-06,
-      "loss": 0.7239,
-      "step": 1000
-    },
-    {
-      "epoch": 0.7403294466037387,
-      "eval_loss": 0.7484843730926514,
-      "eval_runtime": 13.9605,
-      "eval_samples_per_second": 9.169,
-      "eval_steps_per_second": 2.292,
-      "step": 1000
-    },
-    {
-      "epoch": 0.7440310938367574,
-      "grad_norm": 0.763331720710852,
-      "learning_rate": 3.7218787532790167e-06,
-      "loss": 0.7545,
-      "step": 1005
-    },
-    {
-      "epoch": 0.747732741069776,
-      "grad_norm": 0.8079982386291987,
-      "learning_rate": 3.6217763644929393e-06,
-      "loss": 0.6701,
-      "step": 1010
-    },
-    {
-      "epoch": 0.7514343883027947,
-      "grad_norm": 0.9128451316649459,
-      "learning_rate": 3.522740033849411e-06,
-      "loss": 0.7272,
-      "step": 1015
-    },
-    {
-      "epoch": 0.7551360355358134,
-      "grad_norm": 0.7762845069890241,
-      "learning_rate": 3.424786314309365e-06,
-      "loss": 0.7038,
-      "step": 1020
-    },
-    {
-      "epoch": 0.7588376827688321,
-      "grad_norm": 0.8035944499948804,
-      "learning_rate": 3.3279315778858034e-06,
-      "loss": 0.7571,
-      "step": 1025
-    },
-    {
-      "epoch": 0.7625393300018508,
-      "grad_norm": 0.7994108178325574,
-      "learning_rate": 3.2321920129073815e-06,
-      "loss": 0.7438,
-      "step": 1030
-    },
-    {
-      "epoch": 0.7662409772348695,
-      "grad_norm": 0.7747601431769363,
-      "learning_rate": 3.1375836213126653e-06,
-      "loss": 0.7063,
-      "step": 1035
-    },
-    {
-      "epoch": 0.7699426244678882,
-      "grad_norm": 0.7952549748289679,
-      "learning_rate": 3.04412221597558e-06,
-      "loss": 0.7088,
-      "step": 1040
-    },
-    {
-      "epoch": 0.7736442717009069,
-      "grad_norm": 0.7542488837628736,
-      "learning_rate": 2.9518234180624393e-06,
-      "loss": 0.7006,
-      "step": 1045
-    },
-    {
-      "epoch": 0.7773459189339256,
-      "grad_norm": 0.7848275252191611,
-      "learning_rate": 2.8607026544210115e-06,
-      "loss": 0.7121,
-      "step": 1050
-    },
-    {
-      "epoch": 0.7810475661669443,
-      "grad_norm": 0.8279489678198366,
-      "learning_rate": 2.770775155002071e-06,
-      "loss": 0.7188,
-      "step": 1055
-    },
-    {
-      "epoch": 0.784749213399963,
-      "grad_norm": 0.7944390505914574,
-      "learning_rate": 2.6820559503138797e-06,
-      "loss": 0.7394,
-      "step": 1060
-    },
-    {
-      "epoch": 0.7884508606329816,
-      "grad_norm": 0.775645480566711,
-      "learning_rate": 2.594559868909956e-06,
-      "loss": 0.763,
-      "step": 1065
-    },
-    {
-      "epoch": 0.7921525078660003,
-      "grad_norm": 0.8055614791757847,
-      "learning_rate": 2.50830153491064e-06,
-      "loss": 0.7058,
-      "step": 1070
-    },
-    {
-      "epoch": 0.795854155099019,
-      "grad_norm": 0.781453907704552,
-      "learning_rate": 2.423295365558821e-06,
-      "loss": 0.7208,
-      "step": 1075
-    },
-    {
-      "epoch": 0.7995558023320377,
-      "grad_norm": 0.8993073016418077,
-      "learning_rate": 2.339555568810221e-06,
-      "loss": 0.7286,
-      "step": 1080
-    },
-    {
-      "epoch": 0.8032574495650564,
-      "grad_norm": 0.8088055370829833,
-      "learning_rate": 2.2570961409586756e-06,
-      "loss": 0.7158,
-      "step": 1085
-    },
-    {
-      "epoch": 0.8069590967980751,
-      "grad_norm": 0.8400279823862996,
-      "learning_rate": 2.1759308642968024e-06,
-      "loss": 0.7358,
-      "step": 1090
-    },
-    {
-      "epoch": 0.8106607440310938,
-      "grad_norm": 0.7707002767536119,
-      "learning_rate": 2.0960733048124082e-06,
-      "loss": 0.7104,
-      "step": 1095
-    },
-    {
-      "epoch": 0.8143623912641126,
-      "grad_norm": 0.7239863902825704,
-      "learning_rate": 2.01753680992107e-06,
-      "loss": 0.7248,
-      "step": 1100
-    },
-    {
-      "epoch": 0.8143623912641126,
-      "eval_loss": 0.7456310987472534,
-      "eval_runtime": 13.9636,
-      "eval_samples_per_second": 9.167,
-      "eval_steps_per_second": 2.292,
-      "step": 1100
-    },
-    {
-      "epoch": 0.8180640384971313,
-      "grad_norm": 0.7951679059745718,
-      "learning_rate": 1.9403345062352574e-06,
-      "loss": 0.7133,
-      "step": 1105
-    },
-    {
-      "epoch": 0.82176568573015,
-      "grad_norm": 0.786912782956472,
-      "learning_rate": 1.8644792973703252e-06,
-      "loss": 0.7349,
-      "step": 1110
-    },
-    {
-      "epoch": 0.8254673329631687,
-      "grad_norm": 0.8115220548838313,
-      "learning_rate": 1.7899838617878163e-06,
-      "loss": 0.673,
-      "step": 1115
-    },
-    {
-      "epoch": 0.8291689801961873,
-      "grad_norm": 0.817121289855384,
-      "learning_rate": 1.7168606506763696e-06,
-      "loss": 0.7105,
-      "step": 1120
-    },
-    {
-      "epoch": 0.832870627429206,
-      "grad_norm": 0.8002215656530822,
-      "learning_rate": 1.6451218858706374e-06,
-      "loss": 0.7079,
-      "step": 1125
-    },
-    {
-      "epoch": 0.8365722746622247,
-      "grad_norm": 0.7834004843053641,
-      "learning_rate": 1.5747795578085046e-06,
-      "loss": 0.7341,
-      "step": 1130
-    },
-    {
-      "epoch": 0.8402739218952434,
-      "grad_norm": 0.853886763855502,
-      "learning_rate": 1.505845423527027e-06,
-      "loss": 0.7526,
-      "step": 1135
-    },
-    {
-      "epoch": 0.8439755691282621,
-      "grad_norm": 0.7515718806456418,
-      "learning_rate": 1.4383310046973365e-06,
-      "loss": 0.7297,
-      "step": 1140
-    },
-    {
-      "epoch": 0.8476772163612808,
-      "grad_norm": 0.739784359909608,
-      "learning_rate": 1.372247585698916e-06,
-      "loss": 0.7156,
-      "step": 1145
-    },
-    {
-      "epoch": 0.8513788635942995,
-      "grad_norm": 0.8843273402374225,
-      "learning_rate": 1.307606211733522e-06,
-      "loss": 0.7076,
-      "step": 1150
-    },
-    {
-      "epoch": 0.8550805108273182,
-      "grad_norm": 0.7519899075455724,
-      "learning_rate": 1.2444176869790925e-06,
-      "loss": 0.6877,
-      "step": 1155
-    },
-    {
-      "epoch": 0.8587821580603369,
-      "grad_norm": 0.7640709305599573,
-      "learning_rate": 1.18269257278392e-06,
-      "loss": 0.7266,
-      "step": 1160
-    },
-    {
-      "epoch": 0.8624838052933556,
-      "grad_norm": 0.7757193067058663,
-      "learning_rate": 1.1224411859014417e-06,
-      "loss": 0.7493,
-      "step": 1165
-    },
-    {
-      "epoch": 0.8661854525263742,
-      "grad_norm": 0.8326144754357965,
-      "learning_rate": 1.0636735967658785e-06,
-      "loss": 0.7016,
-      "step": 1170
-    },
-    {
-      "epoch": 0.8698870997593929,
-      "grad_norm": 0.7980950743505824,
-      "learning_rate": 1.0063996278090704e-06,
-      "loss": 0.7473,
-      "step": 1175
-    },
-    {
-      "epoch": 0.8735887469924116,
-      "grad_norm": 0.7448070961300409,
-      "learning_rate": 9.506288518187468e-07,
-      "loss": 0.7417,
-      "step": 1180
-    },
-    {
-      "epoch": 0.8772903942254303,
-      "grad_norm": 0.8323564393063527,
-      "learning_rate": 8.963705903385344e-07,
-      "loss": 0.73,
-      "step": 1185
-    },
-    {
-      "epoch": 0.880992041458449,
-      "grad_norm": 0.7257509455039137,
-      "learning_rate": 8.436339121099413e-07,
-      "loss": 0.6955,
-      "step": 1190
-    },
-    {
-      "epoch": 0.8846936886914677,
-      "grad_norm": 0.7668542997983315,
-      "learning_rate": 7.924276315566171e-07,
-      "loss": 0.7203,
-      "step": 1195
-    },
-    {
-      "epoch": 0.8883953359244864,
-      "grad_norm": 0.7462196723418639,
-      "learning_rate": 7.427603073110967e-07,
-      "loss": 0.7494,
-      "step": 1200
-    },
-    {
-      "epoch": 0.8883953359244864,
-      "eval_loss": 0.7442336082458496,
-      "eval_runtime": 13.9594,
-      "eval_samples_per_second": 9.169,
-      "eval_steps_per_second": 2.292,
-      "step": 1200
-    },
-    {
-      "epoch": 0.8920969831575051,
-      "grad_norm": 0.7740354232470301,
-      "learning_rate": 6.946402407843156e-07,
-      "loss": 0.7271,
-      "step": 1205
-    },
-    {
-      "epoch": 0.8957986303905238,
-      "grad_norm": 0.8223246853277537,
-      "learning_rate": 6.480754747781037e-07,
-      "loss": 0.7145,
-      "step": 1210
-    },
-    {
-      "epoch": 0.8995002776235425,
-      "grad_norm": 0.7709082515472453,
-      "learning_rate": 6.030737921409169e-07,
-      "loss": 0.7183,
-      "step": 1215
-    },
-    {
-      "epoch": 0.9032019248565611,
-      "grad_norm": 0.737942021515047,
-      "learning_rate": 5.596427144670002e-07,
-      "loss": 0.6767,
-      "step": 1220
-    },
-    {
-      "epoch": 0.9069035720895798,
-      "grad_norm": 0.7921338048969848,
-      "learning_rate": 5.177895008392353e-07,
-      "loss": 0.7339,
-      "step": 1225
-    },
-    {
-      "epoch": 0.9106052193225985,
-      "grad_norm": 0.7788313346093989,
-      "learning_rate": 4.775211466158469e-07,
-      "loss": 0.7584,
-      "step": 1230
-    },
-    {
-      "epoch": 0.9143068665556172,
-      "grad_norm": 0.7977888893081708,
-      "learning_rate": 4.388443822612043e-07,
-      "loss": 0.7331,
-      "step": 1235
-    },
-    {
-      "epoch": 0.9180085137886359,
-      "grad_norm": 0.8205134317010626,
-      "learning_rate": 4.017656722208807e-07,
-      "loss": 0.7366,
-      "step": 1240
-    },
-    {
-      "epoch": 0.9217101610216546,
-      "grad_norm": 0.9195048703105001,
-      "learning_rate": 3.662912138411967e-07,
-      "loss": 0.7397,
-      "step": 1245
-    },
-    {
-      "epoch": 0.9254118082546733,
-      "grad_norm": 0.7329045805585533,
-      "learning_rate": 3.3242693633337986e-07,
-      "loss": 0.7227,
-      "step": 1250
-    },
-    {
-      "epoch": 0.929113455487692,
-      "grad_norm": 0.7813246174244928,
-      "learning_rate": 3.001784997825652e-07,
-      "loss": 0.7034,
-      "step": 1255
-    },
-    {
-      "epoch": 0.9328151027207107,
-      "grad_norm": 0.719576735930033,
-      "learning_rate": 2.6955129420176193e-07,
-      "loss": 0.7266,
-      "step": 1260
-    },
-    {
-      "epoch": 0.9365167499537294,
-      "grad_norm": 0.7662877318795512,
-      "learning_rate": 2.405504386309643e-07,
-      "loss": 0.7363,
-      "step": 1265
-    },
-    {
-      "epoch": 0.9402183971867482,
-      "grad_norm": 0.8271100323795003,
-      "learning_rate": 2.1318078028155886e-07,
-      "loss": 0.7237,
-      "step": 1270
-    },
-    {
-      "epoch": 0.9439200444197668,
-      "grad_norm": 0.8329748294999934,
-      "learning_rate": 1.874468937261531e-07,
-      "loss": 0.7304,
-      "step": 1275
-    },
-    {
-      "epoch": 0.9476216916527855,
-      "grad_norm": 0.7890535324896384,
-      "learning_rate": 1.6335308013398888e-07,
-      "loss": 0.7094,
-      "step": 1280
-    },
-    {
-      "epoch": 0.9513233388858042,
-      "grad_norm": 0.8431357622327923,
-      "learning_rate": 1.409033665520354e-07,
-      "loss": 0.7018,
-      "step": 1285
-    },
-    {
-      "epoch": 0.9550249861188229,
-      "grad_norm": 0.7151442012235928,
-      "learning_rate": 1.201015052319099e-07,
-      "loss": 0.7046,
-      "step": 1290
-    },
-    {
-      "epoch": 0.9587266333518416,
-      "grad_norm": 0.7393727883496317,
-      "learning_rate": 1.0095097300273026e-07,
-      "loss": 0.6796,
-      "step": 1295
-    },
-    {
-      "epoch": 0.9624282805848603,
-      "grad_norm": 0.751310291423036,
-      "learning_rate": 8.345497068998897e-08,
-      "loss": 0.7353,
-      "step": 1300
-    },
-    {
-      "epoch": 0.9624282805848603,
-      "eval_loss": 0.7436981201171875,
-      "eval_runtime": 13.9653,
-      "eval_samples_per_second": 9.166,
-      "eval_steps_per_second": 2.291,
-      "step": 1300
-    },
-    {
-      "epoch": 0.966129927817879,
-      "grad_norm": 0.7374172211879654,
-      "learning_rate": 6.761642258056977e-08,
-      "loss": 0.7366,
-      "step": 1305
-    },
-    {
-      "epoch": 0.9698315750508977,
-      "grad_norm": 0.7708672190680163,
-      "learning_rate": 5.3437975933985366e-08,
-      "loss": 0.7092,
-      "step": 1310
-    },
-    {
-      "epoch": 0.9735332222839164,
-      "grad_norm": 0.7902779138167899,
-      "learning_rate": 4.0922000539906914e-08,
-      "loss": 0.6746,
-      "step": 1315
-    },
-    {
-      "epoch": 0.9772348695169351,
-      "grad_norm": 0.7632071329843058,
-      "learning_rate": 3.0070588322079765e-08,
-      "loss": 0.7196,
-      "step": 1320
-    },
-    {
-      "epoch": 0.9809365167499537,
-      "grad_norm": 0.7570175036997868,
-      "learning_rate": 2.088555298867978e-08,
-      "loss": 0.7255,
-      "step": 1325
-    },
-    {
-      "epoch": 0.9846381639829724,
-      "grad_norm": 0.758872580261703,
-      "learning_rate": 1.3368429729168075e-08,
-      "loss": 0.7287,
-      "step": 1330
-    },
-    {
-      "epoch": 0.9883398112159911,
-      "grad_norm": 0.779302395482425,
-      "learning_rate": 7.520474957699586e-09,
-      "loss": 0.7452,
-      "step": 1335
-    },
-    {
-      "epoch": 0.9920414584490098,
-      "grad_norm": 0.8910501338394342,
-      "learning_rate": 3.3426661031255024e-09,
-      "loss": 0.7288,
-      "step": 1340
-    },
-    {
-      "epoch": 0.9957431056820285,
-      "grad_norm": 0.8406066173627439,
-      "learning_rate": 8.357014456272794e-10,
-      "loss": 0.6923,
-      "step": 1345
-    },
-    {
-      "epoch": 0.9994447529150472,
-      "grad_norm": 0.759781227954175,
       "learning_rate": 0.0,
-      "loss": 0.7214,
-      "step": 1350
     },
     {
-      "epoch": 0.9994447529150472,
-      "step": 1350,
-      "total_flos": 76902580617216.0,
-      "train_loss": 0.7594085027553417,
-      "train_runtime": 8931.1449,
-      "train_samples_per_second": 2.42,
-      "train_steps_per_second": 0.151
     }
   ],
   "logging_steps": 5,
-  "max_steps": 1350,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 500,
@@ -2029,7 +1028,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 76902580617216.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.999259807549963,
   "eval_steps": 100,
+  "global_step": 675,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.007401924500370096,
+      "grad_norm": 2.674959598727342,
+      "learning_rate": 1.4705882352941177e-06,
+      "loss": 1.0787,
       "step": 5
     },
     {
+      "epoch": 0.014803849000740192,
+      "grad_norm": 2.2849930335009767,
+      "learning_rate": 2.9411764705882355e-06,
+      "loss": 1.0901,
       "step": 10
     },
     {
+      "epoch": 0.02220577350111029,
+      "grad_norm": 1.4169102513042604,
+      "learning_rate": 4.411764705882353e-06,
+      "loss": 1.0619,
       "step": 15
     },
     {
+      "epoch": 0.029607698001480384,
+      "grad_norm": 1.3754280634869183,
+      "learning_rate": 5.882352941176471e-06,
+      "loss": 1.0087,
       "step": 20
     },
     {
+      "epoch": 0.037009622501850484,
+      "grad_norm": 1.1141154652271672,
+      "learning_rate": 7.352941176470589e-06,
+      "loss": 0.9684,
       "step": 25
     },
     {
+      "epoch": 0.04441154700222058,
+      "grad_norm": 0.9534792105007532,
+      "learning_rate": 8.823529411764707e-06,
+      "loss": 0.9217,
       "step": 30
     },
     {
+      "epoch": 0.05181347150259067,
+      "grad_norm": 0.7606504248144849,
+      "learning_rate": 1.0294117647058823e-05,
+      "loss": 0.8859,
       "step": 35
     },
     {
+      "epoch": 0.05921539600296077,
+      "grad_norm": 0.68529790584477,
+      "learning_rate": 1.1764705882352942e-05,
+      "loss": 0.8631,
       "step": 40
     },
     {
+      "epoch": 0.06661732050333087,
+      "grad_norm": 0.7482947060538522,
+      "learning_rate": 1.323529411764706e-05,
+      "loss": 0.8485,
       "step": 45
     },
     {
+      "epoch": 0.07401924500370097,
+      "grad_norm": 0.814176151345203,
+      "learning_rate": 1.4705882352941179e-05,
+      "loss": 0.861,
       "step": 50
     },
     {
+      "epoch": 0.08142116950407106,
+      "grad_norm": 0.6269959293106316,
+      "learning_rate": 1.6176470588235296e-05,
+      "loss": 0.8545,
       "step": 55
     },
     {
+      "epoch": 0.08882309400444116,
+      "grad_norm": 0.7352759272340602,
+      "learning_rate": 1.7647058823529414e-05,
+      "loss": 0.8293,
       "step": 60
     },
     {
+      "epoch": 0.09622501850481126,
+      "grad_norm": 0.6918084490038217,
+      "learning_rate": 1.911764705882353e-05,
+      "loss": 0.8259,
       "step": 65
     },
     {
+      "epoch": 0.10362694300518134,
+      "grad_norm": 0.7718482933587625,
+      "learning_rate": 1.9999464266898485e-05,
+      "loss": 0.8211,
       "step": 70
     },
     {
+      "epoch": 0.11102886750555144,
+      "grad_norm": 0.9788064725128405,
+      "learning_rate": 1.9993437928712977e-05,
+      "loss": 0.8164,
       "step": 75
     },
     {
+      "epoch": 0.11843079200592153,
+      "grad_norm": 0.837998184141708,
+      "learning_rate": 1.998071963486563e-05,
+      "loss": 0.8062,
       "step": 80
     },
     {
+      "epoch": 0.12583271650629163,
+      "grad_norm": 0.6629895054599079,
+      "learning_rate": 1.9961317901970953e-05,
+      "loss": 0.7945,
       "step": 85
     },
     {
+      "epoch": 0.13323464100666174,
+      "grad_norm": 0.7557912436308644,
+      "learning_rate": 1.993524572210807e-05,
+      "loss": 0.7947,
       "step": 90
     },
     {
+      "epoch": 0.14063656550703182,
+      "grad_norm": 0.797960903546945,
+      "learning_rate": 1.990252055412077e-05,
+      "loss": 0.7906,
       "step": 95
     },
     {
+      "epoch": 0.14803849000740193,
+      "grad_norm": 0.7886433512486097,
+      "learning_rate": 1.9863164311926433e-05,
+      "loss": 0.8171,
       "step": 100
     },
     {
+      "epoch": 0.14803849000740193,
+      "eval_loss": 0.8197493553161621,
+      "eval_runtime": 7.2314,
+      "eval_samples_per_second": 17.701,
+      "eval_steps_per_second": 2.213,
       "step": 100
     },
     {
+      "epoch": 0.15544041450777202,
+      "grad_norm": 0.7194704926083675,
+      "learning_rate": 1.981720334984174e-05,
+      "loss": 0.792,
       "step": 105
     },
     {
+      "epoch": 0.16284233900814213,
+      "grad_norm": 0.7153212599875873,
+      "learning_rate": 1.9764668444934853e-05,
+      "loss": 0.7859,
       "step": 110
     },
     {
+      "epoch": 0.1702442635085122,
+      "grad_norm": 0.7209996155683963,
+      "learning_rate": 1.970559477641606e-05,
+      "loss": 0.7631,
       "step": 115
     },
     {
+      "epoch": 0.17764618800888232,
+      "grad_norm": 0.7333599419087882,
+      "learning_rate": 1.9640021902080523e-05,
+      "loss": 0.793,
       "step": 120
     },
     {
+      "epoch": 0.1850481125092524,
+      "grad_norm": 0.6289153412562695,
+      "learning_rate": 1.9567993731818988e-05,
+      "loss": 0.7916,
       "step": 125
     },
     {
+      "epoch": 0.19245003700962252,
+      "grad_norm": 0.7521612039190329,
+      "learning_rate": 1.9489558498214197e-05,
+      "loss": 0.7843,
       "step": 130
     },
     {
+      "epoch": 0.1998519615099926,
+      "grad_norm": 0.6568997079081034,
+      "learning_rate": 1.9404768724242667e-05,
+      "loss": 0.7703,
       "step": 135
     },
     {
+      "epoch": 0.20725388601036268,
+      "grad_norm": 0.7360339897713676,
+      "learning_rate": 1.931368118810346e-05,
+      "loss": 0.7947,
       "step": 140
     },
     {
+      "epoch": 0.2146558105107328,
+      "grad_norm": 0.7557749076828488,
+      "learning_rate": 1.92163568851975e-05,
+      "loss": 0.7757,
       "step": 145
     },
     {
+      "epoch": 0.22205773501110287,
+      "grad_norm": 0.8218802431960855,
+      "learning_rate": 1.911286098728296e-05,
+      "loss": 0.772,
       "step": 150
     },
     {
+      "epoch": 0.22945965951147299,
+      "grad_norm": 0.7041654789385663,
+      "learning_rate": 1.900326279883392e-05,
+      "loss": 0.8017,
       "step": 155
     },
     {
+      "epoch": 0.23686158401184307,
+      "grad_norm": 0.7107270494061172,
+      "learning_rate": 1.8887635710631716e-05,
+      "loss": 0.8045,
       "step": 160
     },
     {
+      "epoch": 0.24426350851221318,
+      "grad_norm": 0.7042955521495632,
+      "learning_rate": 1.8766057150619865e-05,
+      "loss": 0.7775,
       "step": 165
     },
     {
+      "epoch": 0.25166543301258326,
+      "grad_norm": 0.7141479489682149,
+      "learning_rate": 1.8638608532055635e-05,
+      "loss": 0.7947,
       "step": 170
     },
     {
+      "epoch": 0.25906735751295334,
+      "grad_norm": 0.6682818577909502,
+      "learning_rate": 1.8505375198992856e-05,
+      "loss": 0.7831,
       "step": 175
     },
     {
+      "epoch": 0.2664692820133235,
+      "grad_norm": 0.7193249750447441,
+      "learning_rate": 1.836644636913258e-05,
+      "loss": 0.7542,
       "step": 180
     },
     {
+      "epoch": 0.27387120651369357,
+      "grad_norm": 0.7847188441908851,
+      "learning_rate": 1.8221915074079764e-05,
+      "loss": 0.7778,
       "step": 185
     },
     {
+      "epoch": 0.28127313101406365,
+      "grad_norm": 0.8828987676403609,
+      "learning_rate": 1.8071878097046064e-05,
+      "loss": 0.7564,
       "step": 190
     },
     {
+      "epoch": 0.28867505551443373,
+      "grad_norm": 0.6213320600286455,
+      "learning_rate": 1.7916435908040413e-05,
+      "loss": 0.7723,
       "step": 195
     },
     {
+      "epoch": 0.29607698001480387,
+      "grad_norm": 0.6595102663479724,
+      "learning_rate": 1.7755692596590778e-05,
+      "loss": 0.7747,
       "step": 200
     },
     {
+      "epoch": 0.29607698001480387,
+      "eval_loss": 0.7899559736251831,
+      "eval_runtime": 7.2104,
+      "eval_samples_per_second": 17.752,
+      "eval_steps_per_second": 2.219,
       "step": 200
     },
     {
+      "epoch": 0.30347890451517395,
+      "grad_norm": 0.706296557877635,
+      "learning_rate": 1.7589755802042188e-05,
+      "loss": 0.773,
       "step": 205
     },
     {
+      "epoch": 0.31088082901554404,
+      "grad_norm": 0.6937734870068385,
+      "learning_rate": 1.7418736641477636e-05,
+      "loss": 0.7563,
       "step": 210
     },
     {
+      "epoch": 0.3182827535159141,
+      "grad_norm": 0.6327617109092492,
+      "learning_rate": 1.7242749635310222e-05,
+      "loss": 0.758,
       "step": 215
     },
     {
+      "epoch": 0.32568467801628426,
+      "grad_norm": 0.6635934294750666,
+      "learning_rate": 1.7061912630596252e-05,
+      "loss": 0.7605,
       "step": 220
     },
     {
+      "epoch": 0.33308660251665434,
+      "grad_norm": 0.7228160092157478,
+      "learning_rate": 1.6876346722120747e-05,
+      "loss": 0.7754,
       "step": 225
     },
     {
+      "epoch": 0.3404885270170244,
+      "grad_norm": 0.6952975644500169,
+      "learning_rate": 1.6686176171308125e-05,
+      "loss": 0.7977,
       "step": 230
     },
     {
+      "epoch": 0.3478904515173945,
+      "grad_norm": 0.6717058633626165,
+      "learning_rate": 1.6491528323012412e-05,
+      "loss": 0.7594,
       "step": 235
     },
     {
+      "epoch": 0.35529237601776464,
+      "grad_norm": 0.6596693045521963,
+      "learning_rate": 1.6292533520242663e-05,
+      "loss": 0.7623,
       "step": 240
     },
     {
+      "epoch": 0.3626943005181347,
+      "grad_norm": 0.6470637566854384,
+      "learning_rate": 1.6089325016880737e-05,
+      "loss": 0.7526,
       "step": 245
     },
     {
+      "epoch": 0.3700962250185048,
+      "grad_norm": 0.6877498215548267,
+      "learning_rate": 1.588203888844982e-05,
+      "loss": 0.7681,
       "step": 250
     },
     {
+      "epoch": 0.3774981495188749,
+      "grad_norm": 0.6358323626672553,
+      "learning_rate": 1.5670813940993504e-05,
+      "loss": 0.741,
       "step": 255
     },
     {
+      "epoch": 0.38490007401924503,
+      "grad_norm": 0.600848318475503,
+      "learning_rate": 1.5455791618126407e-05,
+      "loss": 0.7334,
       "step": 260
     },
     {
+      "epoch": 0.3923019985196151,
+      "grad_norm": 0.6314609013122284,
+      "learning_rate": 1.5237115906318565e-05,
+      "loss": 0.7572,
       "step": 265
     },
     {
+      "epoch": 0.3997039230199852,
+      "grad_norm": 0.6546980627619242,
+      "learning_rate": 1.5014933238477069e-05,
+      "loss": 0.7378,
       "step": 270
     },
     {
+      "epoch": 0.4071058475203553,
+      "grad_norm": 0.6975545683818176,
+      "learning_rate": 1.4789392395889468e-05,
+      "loss": 0.7632,
       "step": 275
     },
     {
+      "epoch": 0.41450777202072536,
+      "grad_norm": 0.6503686028697638,
+      "learning_rate": 1.4560644408594602e-05,
+      "loss": 0.744,
       "step": 280
     },
     {
+      "epoch": 0.4219096965210955,
+      "grad_norm": 0.6602116319307001,
+      "learning_rate": 1.432884245424761e-05,
+      "loss": 0.7556,
       "step": 285
     },
     {
+      "epoch": 0.4293116210214656,
+      "grad_norm": 0.6672698383287922,
+      "learning_rate": 1.4094141755546816e-05,
+      "loss": 0.7831,
       "step": 290
     },
     {
+      "epoch": 0.43671354552183567,
+      "grad_norm": 0.6305307497798698,
+      "learning_rate": 1.3856699476291176e-05,
+      "loss": 0.7426,
       "step": 295
     },
     {
+      "epoch": 0.44411547002220575,
+      "grad_norm": 0.6859073767100461,
+      "learning_rate": 1.3616674616137902e-05,
+      "loss": 0.7645,
       "step": 300
     },
     {
+      "epoch": 0.44411547002220575,
+      "eval_loss": 0.7755689024925232,
+      "eval_runtime": 7.2066,
+      "eval_samples_per_second": 17.762,
+      "eval_steps_per_second": 2.22,
       "step": 300
     },
     {
+      "epoch": 0.4515173945225759,
+      "grad_norm": 0.6626556377379951,
+      "learning_rate": 1.3374227904130724e-05,
+      "loss": 0.7549,
       "step": 305
     },
     {
+      "epoch": 0.45891931902294597,
+      "grad_norm": 0.6499839877597537,
+      "learning_rate": 1.3129521691070108e-05,
+      "loss": 0.7328,
       "step": 310
     },
     {
+      "epoch": 0.46632124352331605,
+      "grad_norm": 0.722140222976433,
+      "learning_rate": 1.2882719840797473e-05,
+      "loss": 0.7514,
       "step": 315
     },
     {
+      "epoch": 0.47372316802368614,
+      "grad_norm": 0.6900675221213151,
+      "learning_rate": 1.2633987620466229e-05,
+      "loss": 0.7353,
       "step": 320
     },
     {
+      "epoch": 0.4811250925240563,
+      "grad_norm": 0.6297341225224966,
+      "learning_rate": 1.2383491589873122e-05,
+      "loss": 0.7407,
       "step": 325
     },
     {
+      "epoch": 0.48852701702442636,
+      "grad_norm": 0.6139804357167142,
+      "learning_rate": 1.213139948992394e-05,
+      "loss": 0.7497,
       "step": 330
     },
     {
+      "epoch": 0.49592894152479644,
+      "grad_norm": 0.7120439739230976,
+      "learning_rate": 1.187788013030837e-05,
+      "loss": 0.7468,
       "step": 335
     },
     {
+      "epoch": 0.5033308660251665,
+      "grad_norm": 0.6179256601206382,
+      "learning_rate": 1.1623103276459086e-05,
+      "loss": 0.7507,
       "step": 340
     },
     {
+      "epoch": 0.5107327905255367,
+      "grad_norm": 0.6483835976434715,
+      "learning_rate": 1.1367239535870913e-05,
+      "loss": 0.7425,
       "step": 345
     },
     {
+      "epoch": 0.5181347150259067,
+      "grad_norm": 0.6928461738197682,
+      "learning_rate": 1.1110460243856051e-05,
+      "loss": 0.7302,
       "step": 350
     },
     {
+      "epoch": 0.5255366395262768,
+      "grad_norm": 0.6706880141545486,
+      "learning_rate": 1.085293734881197e-05,
+      "loss": 0.7468,
       "step": 355
     },
     {
+      "epoch": 0.532938564026647,
+      "grad_norm": 0.6042342171269331,
+      "learning_rate": 1.0594843297078736e-05,
+      "loss": 0.766,
       "step": 360
     },
     {
+      "epoch": 0.540340488527017,
+      "grad_norm": 0.693508088289296,
+      "learning_rate": 1.0336350917462925e-05,
+      "loss": 0.7558,
       "step": 365
     },
     {
+      "epoch": 0.5477424130273871,
+      "grad_norm": 0.6083705213800933,
+      "learning_rate": 1.0077633305505402e-05,
+      "loss": 0.7433,
       "step": 370
     },
     {
+      "epoch": 0.5551443375277573,
+      "grad_norm": 0.6396792431151416,
+      "learning_rate": 9.818863707570476e-06,
+      "loss": 0.7608,
       "step": 375
     },
     {
+      "epoch": 0.5625462620281273,
+      "grad_norm": 0.6663076065375303,
+      "learning_rate": 9.560215404834094e-06,
+      "loss": 0.7515,
       "step": 380
     },
     {
+      "epoch": 0.5699481865284974,
+      "grad_norm": 0.641428537274285,
+      "learning_rate": 9.30186159724869e-06,
+      "loss": 0.7146,
       "step": 385
     },
     {
+      "epoch": 0.5773501110288675,
+      "grad_norm": 0.6138036144437788,
+      "learning_rate": 9.043975287562443e-06,
+      "loss": 0.747,
       "step": 390
     },
     {
+      "epoch": 0.5847520355292376,
+      "grad_norm": 0.6807331921757377,
+      "learning_rate": 8.786729165470584e-06,
+      "loss": 0.7253,
       "step": 395
     },
     {
+      "epoch": 0.5921539600296077,
+      "grad_norm": 0.6952002905984943,
+      "learning_rate": 8.530295491976338e-06,
+      "loss": 0.7307,
       "step": 400
     },
     {
+      "epoch": 0.5921539600296077,
+      "eval_loss": 0.7637839317321777,
+      "eval_runtime": 7.2078,
+      "eval_samples_per_second": 17.759,
+      "eval_steps_per_second": 2.22,
       "step": 400
     },
     {
+      "epoch": 0.5995558845299778,
+      "grad_norm": 0.5939454843322792,
+      "learning_rate": 8.274845984038916e-06,
+      "loss": 0.7174,
       "step": 405
     },
     {
+      "epoch": 0.6069578090303479,
+      "grad_norm": 0.6621271866381216,
+      "learning_rate": 8.020551699585843e-06,
+      "loss": 0.7469,
       "step": 410
     },
     {
+      "epoch": 0.6143597335307179,
+      "grad_norm": 0.6106430449913639,
+      "learning_rate": 7.76758292296659e-06,
+      "loss": 0.7264,
       "step": 415
     },
     {
+      "epoch": 0.6217616580310881,
+      "grad_norm": 0.6584389038177016,
+      "learning_rate": 7.5161090509242005e-06,
+      "loss": 0.7418,
       "step": 420
     },
     {
+      "epoch": 0.6291635825314582,
+      "grad_norm": 0.6508063180682058,
+      "learning_rate": 7.2662984791613186e-06,
+      "loss": 0.7345,
       "step": 425
     },
     {
+      "epoch": 0.6365655070318282,
+      "grad_norm": 0.654746417724555,
+      "learning_rate": 7.01831848957653e-06,
+      "loss": 0.7488,
       "step": 430
     },
     {
+      "epoch": 0.6439674315321984,
+      "grad_norm": 0.6038759794228741,
+      "learning_rate": 6.772335138246548e-06,
+      "loss": 0.747,
       "step": 435
     },
     {
+      "epoch": 0.6513693560325685,
+      "grad_norm": 0.6254763438931118,
+      "learning_rate": 6.528513144229256e-06,
+      "loss": 0.7427,
       "step": 440
     },
     {
+      "epoch": 0.6587712805329385,
+      "grad_norm": 0.6195437763354315,
+      "learning_rate": 6.287015779262064e-06,
+      "loss": 0.7489,
       "step": 445
     },
     {
+      "epoch": 0.6661732050333087,
+      "grad_norm": 0.6629664159964251,
+      "learning_rate": 6.048004758429451e-06,
+      "loss": 0.7274,
       "step": 450
     },
     {
+      "epoch": 0.6735751295336787,
+      "grad_norm": 0.6058164232925908,
+      "learning_rate": 5.811640131872867e-06,
+      "loss": 0.7496,
       "step": 455
     },
     {
+      "epoch": 0.6809770540340488,
+      "grad_norm": 0.6082658380867586,
+      "learning_rate": 5.578080177615575e-06,
+      "loss": 0.7201,
       "step": 460
     },
     {
+      "epoch": 0.688378978534419,
+      "grad_norm": 0.6242205120975641,
+      "learning_rate": 5.347481295574141e-06,
+      "loss": 0.7172,
       "step": 465
     },
     {
+      "epoch": 0.695780903034789,
+      "grad_norm": 0.6109755979913201,
+      "learning_rate": 5.119997902827584e-06,
+      "loss": 0.7286,
       "step": 470
     },
     {
+      "epoch": 0.7031828275351591,
+      "grad_norm": 0.6087033956225949,
+      "learning_rate": 4.8957823302142916e-06,
+      "loss": 0.7354,
       "step": 475
     },
     {
+      "epoch": 0.7105847520355293,
+      "grad_norm": 0.5865522874345606,
+      "learning_rate": 4.674984720325961e-06,
+      "loss": 0.7212,
       "step": 480
     },
     {
+      "epoch": 0.7179866765358993,
+      "grad_norm": 0.5900008473027598,
+      "learning_rate": 4.457752926966888e-06,
+      "loss": 0.715,
       "step": 485
     },
     {
+      "epoch": 0.7253886010362695,
+      "grad_norm": 0.5840665816418219,
+      "learning_rate": 4.244232416145839e-06,
+      "loss": 0.7337,
       "step": 490
     },
     {
+      "epoch": 0.7327905255366395,
+      "grad_norm": 0.5914947024608387,
+      "learning_rate": 4.0345661686669745e-06,
+      "loss": 0.7271,
       "step": 495
     },
     {
+      "epoch": 0.7401924500370096,
+      "grad_norm": 0.6196202183056477,
+      "learning_rate": 3.828894584384867e-06,
+      "loss": 0.7355,
       "step": 500
     },
     {
+      "epoch": 0.7401924500370096,
+      "eval_loss": 0.7563655972480774,
+      "eval_runtime": 7.2181,
+      "eval_samples_per_second": 17.733,
+      "eval_steps_per_second": 2.217,
       "step": 500
     },
     {
+      "epoch": 0.7475943745373798,
+      "grad_norm": 0.5586852736192075,
+      "learning_rate": 3.62735538818787e-06,
+      "loss": 0.7197,
       "step": 505
     },
     {
+      "epoch": 0.7549962990377498,
+      "grad_norm": 0.6337625854919152,
+      "learning_rate": 3.4300835377726904e-06,
+      "loss": 0.7233,
       "step": 510
     },
     {
+      "epoch": 0.7623982235381199,
+      "grad_norm": 0.6205123290247885,
+      "learning_rate": 3.2372111332720045e-06,
+      "loss": 0.7587,
       "step": 515
     },
     {
+      "epoch": 0.7698001480384901,
+      "grad_norm": 0.6153129450053498,
+      "learning_rate": 3.048867328795588e-06,
+      "loss": 0.7156,
       "step": 520
     },
     {
+      "epoch": 0.7772020725388601,
+      "grad_norm": 0.6026709629344417,
+      "learning_rate": 2.865178245944218e-06,
+      "loss": 0.7144,
       "step": 525
     },
     {
+      "epoch": 0.7846039970392302,
+      "grad_norm": 0.5724937932245526,
+      "learning_rate": 2.686266889354211e-06,
+      "loss": 0.7375,
       "step": 530
     },
     {
+      "epoch": 0.7920059215396003,
+      "grad_norm": 0.5925644676097567,
+      "learning_rate": 2.5122530643292274e-06,
+      "loss": 0.7429,
       "step": 535
     },
     {
+      "epoch": 0.7994078460399704,
+      "grad_norm": 0.6326300634198754,
+      "learning_rate": 2.3432532966144526e-06,
+      "loss": 0.7323,
       "step": 540
     },
     {
+      "epoch": 0.8068097705403405,
+      "grad_norm": 0.5849976467168821,
+      "learning_rate": 2.1793807543668857e-06,
+      "loss": 0.7338,
       "step": 545
     },
     {
+      "epoch": 0.8142116950407106,
+      "grad_norm": 0.5500210584534766,
+      "learning_rate": 2.0207451723739633e-06,
+      "loss": 0.7257,
       "step": 550
     },
     {
+      "epoch": 0.8216136195410807,
+      "grad_norm": 0.5676680461413595,
+      "learning_rate": 1.8674527785713247e-06,
+      "loss": 0.7325,
       "step": 555
     },
     {
+      "epoch": 0.8290155440414507,
+      "grad_norm": 0.6239434546631168,
+      "learning_rate": 1.7196062229088606e-06,
+      "loss": 0.6996,
       "step": 560
     },
     {
+      "epoch": 0.8364174685418209,
+      "grad_norm": 0.6254391500900318,
+      "learning_rate": 1.577304508612717e-06,
+      "loss": 0.7298,
       "step": 565
     },
     {
+      "epoch": 0.843819393042191,
+      "grad_norm": 0.5238189690989516,
+      "learning_rate": 1.4406429258892762e-06,
+      "loss": 0.7503,
       "step": 570
     },
     {
+      "epoch": 0.851221317542561,
+      "grad_norm": 0.6133649327761147,
+      "learning_rate": 1.3097129881154936e-06,
+      "loss": 0.7199,
       "step": 575
     },
     {
+      "epoch": 0.8586232420429312,
+      "grad_norm": 0.5832243304649319,
+      "learning_rate": 1.1846023705583442e-06,
+      "loss": 0.7164,
       "step": 580
     },
     {
+      "epoch": 0.8660251665433013,
+      "grad_norm": 0.581421408776636,
+      "learning_rate": 1.065394851664394e-06,
+      "loss": 0.7345,
       "step": 585
     },
     {
+      "epoch": 0.8734270910436713,
+      "grad_norm": 0.5486795664712047,
+      "learning_rate": 9.521702569588199e-07,
+      "loss": 0.7537,
       "step": 590
     },
     {
+      "epoch": 0.8808290155440415,
+      "grad_norm": 0.5762089839170463,
+      "learning_rate": 8.450044055914497e-07,
+      "loss": 0.7221,
       "step": 595
     },
     {
+      "epoch": 0.8882309400444115,
+      "grad_norm": 0.5637562364478066,
+      "learning_rate": 7.439690595656013e-07,
+      "loss": 0.7445,
       "step": 600
     },
     {
+      "epoch": 0.8882309400444115,
+      "eval_loss": 0.7531630992889404,
+      "eval_runtime": 6.3295,
+      "eval_samples_per_second": 20.223,
+      "eval_steps_per_second": 2.528,
       "step": 600
     },
     {
+      "epoch": 0.8956328645447816,
+      "grad_norm": 0.6333025954598674,
+      "learning_rate": 6.491318756837417e-07,
+      "loss": 0.7298,
       "step": 605
     },
     {
+      "epoch": 0.9030347890451518,
+      "grad_norm": 0.5151052584290405,
+      "learning_rate": 5.605563602421149e-07,
+      "loss": 0.7058,
       "step": 610
     },
     {
+      "epoch": 0.9104367135455218,
+      "grad_norm": 0.558083301486103,
+      "learning_rate": 4.783018265047179e-07,
+      "loss": 0.7557,
       "step": 615
     },
     {
+      "epoch": 0.9178386380458919,
+      "grad_norm": 0.5838054593517799,
+      "learning_rate": 4.024233549850509e-07,
+      "loss": 0.7436,
       "step": 620
     },
     {
+      "epoch": 0.9252405625462621,
+      "grad_norm": 0.5532527164872905,
+      "learning_rate": 3.329717565622825e-07,
+      "loss": 0.7404,
       "step": 625
     },
     {
+      "epoch": 0.9326424870466321,
+      "grad_norm": 0.5531839239223881,
+      "learning_rate": 2.6999353845651113e-07,
+      "loss": 0.724,
       "step": 630
     },
     {
+      "epoch": 0.9400444115470022,
+      "grad_norm": 0.5908996580668381,
+      "learning_rate": 2.1353087308590314e-07,
+      "loss": 0.7391,
       "step": 635
     },
     {
+      "epoch": 0.9474463360473723,
+      "grad_norm": 0.5583503930295213,
+      "learning_rate": 1.6362156982656085e-07,
+      "loss": 0.7292,
       "step": 640
     },
     {
+      "epoch": 0.9548482605477424,
+      "grad_norm": 0.5273929642155748,
+      "learning_rate": 1.2029904969404482e-07,
+      "loss": 0.7127,
       "step": 645
     },
     {
+      "epoch": 0.9622501850481125,
+      "grad_norm": 0.6043817732068986,
+      "learning_rate": 8.359232296349163e-08,
+      "loss": 0.7163,
       "step": 650
     },
     {
+      "epoch": 0.9696521095484826,
+      "grad_norm": 0.5680817655810946,
+      "learning_rate": 5.3525969743324356e-08,
+      "loss": 0.7322,
       "step": 655
     },
     {
+      "epoch": 0.9770540340488527,
+      "grad_norm": 0.5484921768212552,
+      "learning_rate": 3.012012351554017e-08,
+      "loss": 0.7064,
       "step": 660
     },
     {
+      "epoch": 0.9844559585492227,
+      "grad_norm": 0.5841736663763849,
+      "learning_rate": 1.3390457653639221e-08,
+      "loss": 0.7353,
       "step": 665
     },
     {
+      "epoch": 0.9918578830495929,
+      "grad_norm": 0.6439690756031937,
+      "learning_rate": 3.3481749271768726e-09,
+      "loss": 0.7463,
       "step": 670
     },
     {
+      "epoch": 0.999259807549963,
+      "grad_norm": 0.5777335037865771,
       "learning_rate": 0.0,
+      "loss": 0.7158,
+      "step": 675
     },
     {
+      "epoch": 0.999259807549963,
+      "step": 675,
+      "total_flos": 76888336760832.0,
+      "train_loss": 0.7675936229140671,
+      "train_runtime": 4627.4844,
+      "train_samples_per_second": 4.67,
+      "train_steps_per_second": 0.146
     }
   ],
   "logging_steps": 5,
+  "max_steps": 675,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 76888336760832.0,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b2d1db9b31890b71d03fe2d7aef1a05bea0ecf23be7567023d332facbc04d44a
 size 7416

 version https://git-lfs.github.com/spec/v1
+oid sha256:b874f4c00970d0c1ca0bdeb1229662c5f353e103ad59da3dc823860bf66099a1
 size 7416