Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

adapter_model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
tokenizer.json +13 -17
tokenizer_config.json +32 -24
trainer_state.json +829 -3

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d612e15f55999c6a227ebf07288e0f48cdba71db6ce47ba871b73bcb7c116356
 size 16794200

 version https://git-lfs.github.com/spec/v1
+oid sha256:709ae92384d7aeae2a594b24f20bc72a9ede6fc892d384e3edb835e78c4cc441
 size 16794200

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f52009c94247dfb026804c2bdcea609c03eef29d05333278db627b2de60917c8
 size 33630266

 version https://git-lfs.github.com/spec/v1
+oid sha256:c8412229afcfd9ba03f3adc09a87546ff7c8ef8b4093a3684805704a3aa73147
 size 33630266

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d3bee608bccf720d1f8a1cf1f5605dc8581ecae741b97cce5adea0c59b58117c
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:b3cfd0f149ee197292991a77959f8ad42f317393eec06f6d5290905f53ad8df3
 size 14244

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:193fd7696f4050db882ed979f43f74654a2702263893d6e614bd950ee15e7a6b
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:7a1760d654783fa21ab023433f66d210cdfce9d79e1ba7107ab45b42aa3b2970
 size 1064

tokenizer.json CHANGED Viewed

@@ -1,6 +1,11 @@
 {
   "version": "1.0",
-  "truncation": null,
   "padding": null,
   "added_tokens": [
     {
@@ -31,23 +36,13 @@
       "special": true
     }
   ],
-  "normalizer": {
-    "type": "Sequence",
-    "normalizers": [
-      {
-        "type": "Prepend",
-        "prepend": "▁"
-      },
-      {
-        "type": "Replace",
-        "pattern": {
-          "String": " "
-        },
-        "content": "▁"
-      }
-    ]
   },
-  "pre_tokenizer": null,
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [
@@ -134,6 +129,7 @@
     "end_of_word_suffix": null,
     "fuse_unk": true,
     "byte_fallback": true,
     "vocab": {
       "<unk>": 0,
       "<s>": 1,

 {
   "version": "1.0",
+  "truncation": {
+    "direction": "Right",
+    "max_length": 1024,
+    "strategy": "LongestFirst",
+    "stride": 0
+  },
   "padding": null,
   "added_tokens": [
     {
       "special": true
     }
   ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Metaspace",
+    "replacement": "▁",
+    "prepend_scheme": "first",
+    "split": false
   },
   "post_processor": {
     "type": "TemplateProcessing",
     "single": [
     "end_of_word_suffix": null,
     "fuse_unk": true,
     "byte_fallback": true,
+    "ignore_merges": false,
     "vocab": {
       "<unk>": 0,
       "<s>": 1,

tokenizer_config.json CHANGED Viewed

@@ -1,35 +1,43 @@
 {
   "add_bos_token": true,
   "add_eos_token": false,
-  "bos_token": {
-    "__type": "AddedToken",
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
   },
   "clean_up_tokenization_spaces": false,
-  "eos_token": {
-    "__type": "AddedToken",
-    "content": "</s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
   "legacy": false,
   "model_max_length": 4096,
-  "pad_token": null,
   "padding_side": "right",
   "sp_model_kwargs": {},
   "tokenizer_class": "LlamaTokenizer",
-  "unk_token": {
-    "__type": "AddedToken",
-    "content": "<unk>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
 }

 {
   "add_bos_token": true,
   "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
   },
+  "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
   "legacy": false,
   "model_max_length": 4096,
+  "pad_token": "</s>",
   "padding_side": "right",
   "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
   "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
 }

trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 5.0,
   "eval_steps": 500,
-  "global_step": 1175,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -826,6 +826,832 @@
       "learning_rate": 0.00015038541704796003,
       "loss": 0.0139,
       "step": 1170
     }
   ],
   "logging_steps": 10,
@@ -845,7 +1671,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 4.72337525157888e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 10.0,
   "eval_steps": 500,
+  "global_step": 2350,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 0.00015038541704796003,
       "loss": 0.0139,
       "step": 1170
+    },
+    {
+      "epoch": 5.0212765957446805,
+      "grad_norm": 0.2920892834663391,
+      "learning_rate": 0.0001496135900856782,
+      "loss": 0.0083,
+      "step": 1180
+    },
+    {
+      "epoch": 5.0638297872340425,
+      "grad_norm": 0.32393306493759155,
+      "learning_rate": 0.0001488378223697851,
+      "loss": 0.0155,
+      "step": 1190
+    },
+    {
+      "epoch": 5.1063829787234045,
+      "grad_norm": 0.49029844999313354,
+      "learning_rate": 0.00014805817551866838,
+      "loss": 0.0109,
+      "step": 1200
+    },
+    {
+      "epoch": 5.148936170212766,
+      "grad_norm": 0.05497799441218376,
+      "learning_rate": 0.00014727471145883127,
+      "loss": 0.0095,
+      "step": 1210
+    },
+    {
+      "epoch": 5.191489361702128,
+      "grad_norm": 0.4540445804595947,
+      "learning_rate": 0.00014648749241997363,
+      "loss": 0.0106,
+      "step": 1220
+    },
+    {
+      "epoch": 5.23404255319149,
+      "grad_norm": 0.16598157584667206,
+      "learning_rate": 0.00014569658093004935,
+      "loss": 0.0072,
+      "step": 1230
+    },
+    {
+      "epoch": 5.276595744680851,
+      "grad_norm": 0.07160704582929611,
+      "learning_rate": 0.0001449020398102996,
+      "loss": 0.0108,
+      "step": 1240
+    },
+    {
+      "epoch": 5.319148936170213,
+      "grad_norm": 0.197789266705513,
+      "learning_rate": 0.00014410393217026318,
+      "loss": 0.0118,
+      "step": 1250
+    },
+    {
+      "epoch": 5.361702127659575,
+      "grad_norm": 0.07983817905187607,
+      "learning_rate": 0.00014330232140276366,
+      "loss": 0.0076,
+      "step": 1260
+    },
+    {
+      "epoch": 5.404255319148936,
+      "grad_norm": 0.0746329054236412,
+      "learning_rate": 0.00014249727117887425,
+      "loss": 0.0089,
+      "step": 1270
+    },
+    {
+      "epoch": 5.446808510638298,
+      "grad_norm": 0.09392493963241577,
+      "learning_rate": 0.00014168884544286053,
+      "loss": 0.0103,
+      "step": 1280
+    },
+    {
+      "epoch": 5.48936170212766,
+      "grad_norm": 0.18386998772621155,
+      "learning_rate": 0.0001408771084071012,
+      "loss": 0.0096,
+      "step": 1290
+    },
+    {
+      "epoch": 5.531914893617021,
+      "grad_norm": 0.07400283962488174,
+      "learning_rate": 0.00014006212454698797,
+      "loss": 0.0083,
+      "step": 1300
+    },
+    {
+      "epoch": 5.574468085106383,
+      "grad_norm": 0.06513144075870514,
+      "learning_rate": 0.00013924395859580432,
+      "loss": 0.0093,
+      "step": 1310
+    },
+    {
+      "epoch": 5.617021276595745,
+      "grad_norm": 0.6950928568840027,
+      "learning_rate": 0.00013842267553958371,
+      "loss": 0.0073,
+      "step": 1320
+    },
+    {
+      "epoch": 5.659574468085106,
+      "grad_norm": 0.05320321023464203,
+      "learning_rate": 0.00013759834061194794,
+      "loss": 0.0098,
+      "step": 1330
+    },
+    {
+      "epoch": 5.702127659574468,
+      "grad_norm": 0.17569933831691742,
+      "learning_rate": 0.00013677101928892554,
+      "loss": 0.0075,
+      "step": 1340
+    },
+    {
+      "epoch": 5.74468085106383,
+      "grad_norm": 0.052122730761766434,
+      "learning_rate": 0.00013594077728375128,
+      "loss": 0.0107,
+      "step": 1350
+    },
+    {
+      "epoch": 5.787234042553192,
+      "grad_norm": 0.2108752280473709,
+      "learning_rate": 0.00013510768054164653,
+      "loss": 0.0119,
+      "step": 1360
+    },
+    {
+      "epoch": 5.829787234042553,
+      "grad_norm": 0.047486983239650726,
+      "learning_rate": 0.00013427179523458127,
+      "loss": 0.0092,
+      "step": 1370
+    },
+    {
+      "epoch": 5.872340425531915,
+      "grad_norm": 0.043320391327142715,
+      "learning_rate": 0.0001334331877560182,
+      "loss": 0.0081,
+      "step": 1380
+    },
+    {
+      "epoch": 5.914893617021277,
+      "grad_norm": 0.09155077487230301,
+      "learning_rate": 0.00013259192471563912,
+      "loss": 0.0091,
+      "step": 1390
+    },
+    {
+      "epoch": 5.957446808510638,
+      "grad_norm": 0.049143675714731216,
+      "learning_rate": 0.00013174807293405428,
+      "loss": 0.0089,
+      "step": 1400
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.07365540415048599,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.0068,
+      "step": 1410
+    },
+    {
+      "epoch": 6.042553191489362,
+      "grad_norm": 0.11437718570232391,
+      "learning_rate": 0.00013005287145248878,
+      "loss": 0.0064,
+      "step": 1420
+    },
+    {
+      "epoch": 6.085106382978723,
+      "grad_norm": 0.06010650098323822,
+      "learning_rate": 0.0001292016564005219,
+      "loss": 0.0074,
+      "step": 1430
+    },
+    {
+      "epoch": 6.127659574468085,
+      "grad_norm": 0.04595167934894562,
+      "learning_rate": 0.0001283481218926818,
+      "loss": 0.0066,
+      "step": 1440
+    },
+    {
+      "epoch": 6.170212765957447,
+      "grad_norm": 0.05351310595870018,
+      "learning_rate": 0.00012749233572428804,
+      "loss": 0.0097,
+      "step": 1450
+    },
+    {
+      "epoch": 6.212765957446808,
+      "grad_norm": 0.13630953431129456,
+      "learning_rate": 0.00012663436586950714,
+      "loss": 0.0079,
+      "step": 1460
+    },
+    {
+      "epoch": 6.25531914893617,
+      "grad_norm": 0.17622588574886322,
+      "learning_rate": 0.00012577428047595344,
+      "loss": 0.0084,
+      "step": 1470
+    },
+    {
+      "epoch": 6.297872340425532,
+      "grad_norm": 0.050954531878232956,
+      "learning_rate": 0.0001249121478592762,
+      "loss": 0.0077,
+      "step": 1480
+    },
+    {
+      "epoch": 6.340425531914893,
+      "grad_norm": 0.3051726818084717,
+      "learning_rate": 0.0001240480364977335,
+      "loss": 0.0085,
+      "step": 1490
+    },
+    {
+      "epoch": 6.382978723404255,
+      "grad_norm": 0.30509302020072937,
+      "learning_rate": 0.00012318201502675285,
+      "loss": 0.0092,
+      "step": 1500
+    },
+    {
+      "epoch": 6.425531914893617,
+      "grad_norm": 0.09164142608642578,
+      "learning_rate": 0.00012231415223347972,
+      "loss": 0.008,
+      "step": 1510
+    },
+    {
+      "epoch": 6.468085106382979,
+      "grad_norm": 0.05406223237514496,
+      "learning_rate": 0.0001214445170513139,
+      "loss": 0.0078,
+      "step": 1520
+    },
+    {
+      "epoch": 6.51063829787234,
+      "grad_norm": 0.05845744535326958,
+      "learning_rate": 0.00012057317855443395,
+      "loss": 0.0092,
+      "step": 1530
+    },
+    {
+      "epoch": 6.553191489361702,
+      "grad_norm": 0.05021122843027115,
+      "learning_rate": 0.00011970020595231101,
+      "loss": 0.007,
+      "step": 1540
+    },
+    {
+      "epoch": 6.595744680851064,
+      "grad_norm": 0.10315235704183578,
+      "learning_rate": 0.00011882566858421135,
+      "loss": 0.0068,
+      "step": 1550
+    },
+    {
+      "epoch": 6.638297872340425,
+      "grad_norm": 0.08750782907009125,
+      "learning_rate": 0.00011794963591368893,
+      "loss": 0.009,
+      "step": 1560
+    },
+    {
+      "epoch": 6.680851063829787,
+      "grad_norm": 0.05412838235497475,
+      "learning_rate": 0.0001170721775230679,
+      "loss": 0.0071,
+      "step": 1570
+    },
+    {
+      "epoch": 6.723404255319149,
+      "grad_norm": 0.17292432487010956,
+      "learning_rate": 0.00011619336310791586,
+      "loss": 0.0091,
+      "step": 1580
+    },
+    {
+      "epoch": 6.76595744680851,
+      "grad_norm": 0.05503688380122185,
+      "learning_rate": 0.00011531326247150803,
+      "loss": 0.0069,
+      "step": 1590
+    },
+    {
+      "epoch": 6.808510638297872,
+      "grad_norm": 0.05121155083179474,
+      "learning_rate": 0.00011443194551928266,
+      "loss": 0.008,
+      "step": 1600
+    },
+    {
+      "epoch": 6.851063829787234,
+      "grad_norm": 0.0626005157828331,
+      "learning_rate": 0.00011354948225328877,
+      "loss": 0.0065,
+      "step": 1610
+    },
+    {
+      "epoch": 6.8936170212765955,
+      "grad_norm": 0.058921247720718384,
+      "learning_rate": 0.0001126659427666257,
+      "loss": 0.0078,
+      "step": 1620
+    },
+    {
+      "epoch": 6.9361702127659575,
+      "grad_norm": 0.058523211628198624,
+      "learning_rate": 0.00011178139723787597,
+      "loss": 0.008,
+      "step": 1630
+    },
+    {
+      "epoch": 6.9787234042553195,
+      "grad_norm": 0.18594586849212646,
+      "learning_rate": 0.00011089591592553082,
+      "loss": 0.0076,
+      "step": 1640
+    },
+    {
+      "epoch": 7.0212765957446805,
+      "grad_norm": 0.053747180849313736,
+      "learning_rate": 0.00011000956916240985,
+      "loss": 0.0074,
+      "step": 1650
+    },
+    {
+      "epoch": 7.0638297872340425,
+      "grad_norm": 0.03964696079492569,
+      "learning_rate": 0.00010912242735007441,
+      "loss": 0.0071,
+      "step": 1660
+    },
+    {
+      "epoch": 7.1063829787234045,
+      "grad_norm": 0.05952566862106323,
+      "learning_rate": 0.00010823456095323579,
+      "loss": 0.0065,
+      "step": 1670
+    },
+    {
+      "epoch": 7.148936170212766,
+      "grad_norm": 0.21424749493598938,
+      "learning_rate": 0.00010734604049415822,
+      "loss": 0.0075,
+      "step": 1680
+    },
+    {
+      "epoch": 7.191489361702128,
+      "grad_norm": 0.03922433406114578,
+      "learning_rate": 0.0001064569365470574,
+      "loss": 0.0071,
+      "step": 1690
+    },
+    {
+      "epoch": 7.23404255319149,
+      "grad_norm": 0.05505215749144554,
+      "learning_rate": 0.00010556731973249485,
+      "loss": 0.0061,
+      "step": 1700
+    },
+    {
+      "epoch": 7.276595744680851,
+      "grad_norm": 0.03699268028140068,
+      "learning_rate": 0.00010467726071176853,
+      "loss": 0.0075,
+      "step": 1710
+    },
+    {
+      "epoch": 7.319148936170213,
+      "grad_norm": 0.04546520113945007,
+      "learning_rate": 0.00010378683018130047,
+      "loss": 0.0072,
+      "step": 1720
+    },
+    {
+      "epoch": 7.361702127659575,
+      "grad_norm": 0.056984953582286835,
+      "learning_rate": 0.0001028960988670212,
+      "loss": 0.007,
+      "step": 1730
+    },
+    {
+      "epoch": 7.404255319148936,
+      "grad_norm": 0.24714773893356323,
+      "learning_rate": 0.00010200513751875227,
+      "loss": 0.0074,
+      "step": 1740
+    },
+    {
+      "epoch": 7.446808510638298,
+      "grad_norm": 0.06558862328529358,
+      "learning_rate": 0.00010111401690458654,
+      "loss": 0.0064,
+      "step": 1750
+    },
+    {
+      "epoch": 7.48936170212766,
+      "grad_norm": 0.06254340708255768,
+      "learning_rate": 0.00010022280780526725,
+      "loss": 0.0076,
+      "step": 1760
+    },
+    {
+      "epoch": 7.531914893617021,
+      "grad_norm": 0.08606445789337158,
+      "learning_rate": 9.93315810085658e-05,
+      "loss": 0.0076,
+      "step": 1770
+    },
+    {
+      "epoch": 7.574468085106383,
+      "grad_norm": 0.051956657320261,
+      "learning_rate": 9.844040730365936e-05,
+      "loss": 0.0073,
+      "step": 1780
+    },
+    {
+      "epoch": 7.617021276595745,
+      "grad_norm": 0.24819940328598022,
+      "learning_rate": 9.754935747550804e-05,
+      "loss": 0.0077,
+      "step": 1790
+    },
+    {
+      "epoch": 7.659574468085106,
+      "grad_norm": 0.039045918732881546,
+      "learning_rate": 9.665850229923258e-05,
+      "loss": 0.0071,
+      "step": 1800
+    },
+    {
+      "epoch": 7.702127659574468,
+      "grad_norm": 0.04438428208231926,
+      "learning_rate": 9.57679125344927e-05,
+      "loss": 0.0074,
+      "step": 1810
+    },
+    {
+      "epoch": 7.74468085106383,
+      "grad_norm": 0.31067439913749695,
+      "learning_rate": 9.487765891986682e-05,
+      "loss": 0.0087,
+      "step": 1820
+    },
+    {
+      "epoch": 7.787234042553192,
+      "grad_norm": 0.06251853704452515,
+      "learning_rate": 9.398781216723331e-05,
+      "loss": 0.0069,
+      "step": 1830
+    },
+    {
+      "epoch": 7.829787234042553,
+      "grad_norm": 0.04706185683608055,
+      "learning_rate": 9.309844295615389e-05,
+      "loss": 0.0072,
+      "step": 1840
+    },
+    {
+      "epoch": 7.872340425531915,
+      "grad_norm": 0.03839968144893646,
+      "learning_rate": 9.220962192825968e-05,
+      "loss": 0.0069,
+      "step": 1850
+    },
+    {
+      "epoch": 7.914893617021277,
+      "grad_norm": 0.03722318261861801,
+      "learning_rate": 9.132141968164026e-05,
+      "loss": 0.0069,
+      "step": 1860
+    },
+    {
+      "epoch": 7.957446808510638,
+      "grad_norm": 0.0543668158352375,
+      "learning_rate": 9.043390676523604e-05,
+      "loss": 0.0076,
+      "step": 1870
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.051696889102458954,
+      "learning_rate": 8.954715367323468e-05,
+      "loss": 0.0066,
+      "step": 1880
+    },
+    {
+      "epoch": 8.042553191489361,
+      "grad_norm": 0.08786992728710175,
+      "learning_rate": 8.866123083947182e-05,
+      "loss": 0.0062,
+      "step": 1890
+    },
+    {
+      "epoch": 8.085106382978724,
+      "grad_norm": 0.3556790351867676,
+      "learning_rate": 8.777620863183657e-05,
+      "loss": 0.0079,
+      "step": 1900
+    },
+    {
+      "epoch": 8.127659574468085,
+      "grad_norm": 0.08049122244119644,
+      "learning_rate": 8.689215734668232e-05,
+      "loss": 0.0064,
+      "step": 1910
+    },
+    {
+      "epoch": 8.170212765957446,
+      "grad_norm": 0.05408492311835289,
+      "learning_rate": 8.600914720324316e-05,
+      "loss": 0.0077,
+      "step": 1920
+    },
+    {
+      "epoch": 8.212765957446809,
+      "grad_norm": 0.034946855157613754,
+      "learning_rate": 8.512724833805634e-05,
+      "loss": 0.0067,
+      "step": 1930
+    },
+    {
+      "epoch": 8.25531914893617,
+      "grad_norm": 0.05422681197524071,
+      "learning_rate": 8.424653079939156e-05,
+      "loss": 0.0062,
+      "step": 1940
+    },
+    {
+      "epoch": 8.297872340425531,
+      "grad_norm": 0.06405791640281677,
+      "learning_rate": 8.336706454168701e-05,
+      "loss": 0.0064,
+      "step": 1950
+    },
+    {
+      "epoch": 8.340425531914894,
+      "grad_norm": 0.08694509416818619,
+      "learning_rate": 8.248891941999297e-05,
+      "loss": 0.006,
+      "step": 1960
+    },
+    {
+      "epoch": 8.382978723404255,
+      "grad_norm": 0.05589594319462776,
+      "learning_rate": 8.161216518442334e-05,
+      "loss": 0.0067,
+      "step": 1970
+    },
+    {
+      "epoch": 8.425531914893616,
+      "grad_norm": 0.05914654955267906,
+      "learning_rate": 8.073687147461547e-05,
+      "loss": 0.0065,
+      "step": 1980
+    },
+    {
+      "epoch": 8.46808510638298,
+      "grad_norm": 0.05237606540322304,
+      "learning_rate": 7.98631078141987e-05,
+      "loss": 0.0071,
+      "step": 1990
+    },
+    {
+      "epoch": 8.51063829787234,
+      "grad_norm": 0.056138359010219574,
+      "learning_rate": 7.89909436052722e-05,
+      "loss": 0.0079,
+      "step": 2000
+    },
+    {
+      "epoch": 8.553191489361701,
+      "grad_norm": 0.06285829097032547,
+      "learning_rate": 7.812044812289249e-05,
+      "loss": 0.0064,
+      "step": 2010
+    },
+    {
+      "epoch": 8.595744680851064,
+      "grad_norm": 0.04014954715967178,
+      "learning_rate": 7.72516905095709e-05,
+      "loss": 0.0072,
+      "step": 2020
+    },
+    {
+      "epoch": 8.638297872340425,
+      "grad_norm": 0.04423344135284424,
+      "learning_rate": 7.638473976978177e-05,
+      "loss": 0.0065,
+      "step": 2030
+    },
+    {
+      "epoch": 8.680851063829786,
+      "grad_norm": 0.05287999287247658,
+      "learning_rate": 7.55196647644814e-05,
+      "loss": 0.0081,
+      "step": 2040
+    },
+    {
+      "epoch": 8.72340425531915,
+      "grad_norm": 0.056493304669857025,
+      "learning_rate": 7.465653420563845e-05,
+      "loss": 0.0067,
+      "step": 2050
+    },
+    {
+      "epoch": 8.76595744680851,
+      "grad_norm": 0.057281821966171265,
+      "learning_rate": 7.379541665077643e-05,
+      "loss": 0.0078,
+      "step": 2060
+    },
+    {
+      "epoch": 8.808510638297872,
+      "grad_norm": 0.053731102496385574,
+      "learning_rate": 7.293638049752812e-05,
+      "loss": 0.0066,
+      "step": 2070
+    },
+    {
+      "epoch": 8.851063829787234,
+      "grad_norm": 0.18697482347488403,
+      "learning_rate": 7.207949397820278e-05,
+      "loss": 0.0069,
+      "step": 2080
+    },
+    {
+      "epoch": 8.893617021276595,
+      "grad_norm": 0.0387713797390461,
+      "learning_rate": 7.122482515436661e-05,
+      "loss": 0.007,
+      "step": 2090
+    },
+    {
+      "epoch": 8.936170212765958,
+      "grad_norm": 0.04441935196518898,
+      "learning_rate": 7.037244191143661e-05,
+      "loss": 0.0067,
+      "step": 2100
+    },
+    {
+      "epoch": 8.97872340425532,
+      "grad_norm": 0.05758345127105713,
+      "learning_rate": 6.952241195328868e-05,
+      "loss": 0.0065,
+      "step": 2110
+    },
+    {
+      "epoch": 9.02127659574468,
+      "grad_norm": 0.050706226378679276,
+      "learning_rate": 6.867480279687974e-05,
+      "loss": 0.0063,
+      "step": 2120
+    },
+    {
+      "epoch": 9.063829787234043,
+      "grad_norm": 0.05180887505412102,
+      "learning_rate": 6.782968176688514e-05,
+      "loss": 0.0062,
+      "step": 2130
+    },
+    {
+      "epoch": 9.106382978723405,
+      "grad_norm": 0.05492401868104935,
+      "learning_rate": 6.6987115990351e-05,
+      "loss": 0.006,
+      "step": 2140
+    },
+    {
+      "epoch": 9.148936170212766,
+      "grad_norm": 0.053439777344465256,
+      "learning_rate": 6.614717239136246e-05,
+      "loss": 0.0066,
+      "step": 2150
+    },
+    {
+      "epoch": 9.191489361702128,
+      "grad_norm": 0.15650008618831635,
+      "learning_rate": 6.530991768572794e-05,
+      "loss": 0.006,
+      "step": 2160
+    },
+    {
+      "epoch": 9.23404255319149,
+      "grad_norm": 0.04846300184726715,
+      "learning_rate": 6.447541837568e-05,
+      "loss": 0.0068,
+      "step": 2170
+    },
+    {
+      "epoch": 9.27659574468085,
+      "grad_norm": 0.047157011926174164,
+      "learning_rate": 6.364374074459307e-05,
+      "loss": 0.006,
+      "step": 2180
+    },
+    {
+      "epoch": 9.319148936170214,
+      "grad_norm": 0.047358132898807526,
+      "learning_rate": 6.281495085171869e-05,
+      "loss": 0.0058,
+      "step": 2190
+    },
+    {
+      "epoch": 9.361702127659575,
+      "grad_norm": 0.11200093477964401,
+      "learning_rate": 6.198911452693853e-05,
+      "loss": 0.007,
+      "step": 2200
+    },
+    {
+      "epoch": 9.404255319148936,
+      "grad_norm": 0.06481984257698059,
+      "learning_rate": 6.116629736553552e-05,
+      "loss": 0.0069,
+      "step": 2210
+    },
+    {
+      "epoch": 9.446808510638299,
+      "grad_norm": 0.04743931442499161,
+      "learning_rate": 6.0346564722983736e-05,
+      "loss": 0.0072,
+      "step": 2220
+    },
+    {
+      "epoch": 9.48936170212766,
+      "grad_norm": 0.07307924330234528,
+      "learning_rate": 5.952998170975724e-05,
+      "loss": 0.0062,
+      "step": 2230
+    },
+    {
+      "epoch": 9.53191489361702,
+      "grad_norm": 0.05650079995393753,
+      "learning_rate": 5.871661318615848e-05,
+      "loss": 0.0061,
+      "step": 2240
+    },
+    {
+      "epoch": 9.574468085106384,
+      "grad_norm": 0.057734813541173935,
+      "learning_rate": 5.790652375716652e-05,
+      "loss": 0.0068,
+      "step": 2250
+    },
+    {
+      "epoch": 9.617021276595745,
+      "grad_norm": 0.05608903244137764,
+      "learning_rate": 5.709977776730537e-05,
+      "loss": 0.0071,
+      "step": 2260
+    },
+    {
+      "epoch": 9.659574468085106,
+      "grad_norm": 0.07133158296346664,
+      "learning_rate": 5.62964392955335e-05,
+      "loss": 0.0067,
+      "step": 2270
+    },
+    {
+      "epoch": 9.702127659574469,
+      "grad_norm": 0.2167043685913086,
+      "learning_rate": 5.549657215015367e-05,
+      "loss": 0.0067,
+      "step": 2280
+    },
+    {
+      "epoch": 9.74468085106383,
+      "grad_norm": 0.06523007154464722,
+      "learning_rate": 5.470023986374516e-05,
+      "loss": 0.0068,
+      "step": 2290
+    },
+    {
+      "epoch": 9.787234042553191,
+      "grad_norm": 0.04895370826125145,
+      "learning_rate": 5.39075056881172e-05,
+      "loss": 0.0058,
+      "step": 2300
+    },
+    {
+      "epoch": 9.829787234042554,
+      "grad_norm": 0.056433409452438354,
+      "learning_rate": 5.31184325892849e-05,
+      "loss": 0.0067,
+      "step": 2310
+    },
+    {
+      "epoch": 9.872340425531915,
+      "grad_norm": 0.049970466643571854,
+      "learning_rate": 5.233308324246805e-05,
+      "loss": 0.0058,
+      "step": 2320
+    },
+    {
+      "epoch": 9.914893617021276,
+      "grad_norm": 0.08367093652486801,
+      "learning_rate": 5.155152002711285e-05,
+      "loss": 0.0063,
+      "step": 2330
+    },
+    {
+      "epoch": 9.957446808510639,
+      "grad_norm": 0.07821632921695709,
+      "learning_rate": 5.077380502193725e-05,
+      "loss": 0.0071,
+      "step": 2340
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.04344159737229347,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.0078,
+      "step": 2350
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 9.44675050315776e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null