Training in progress, step 189, checkpoint

Browse files

Files changed (7) hide show

last-checkpoint/adapter_config.json +4 -4
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1271 -74
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": "unsloth",
   "target_modules": [
-    "q_proj",
     "k_proj",
-    "o_proj",
-    "gate_proj",
     "v_proj",
     "down_proj",
-    "up_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": "unsloth",
   "target_modules": [
     "k_proj",
+    "q_proj",
     "v_proj",
+    "o_proj",
+    "up_proj",
     "down_proj",
+    "gate_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2c2c77e1faef5360edb86fdd675b75189f7974b272c230d2370205fe561086d7
 size 60010048

 version https://git-lfs.github.com/spec/v1
+oid sha256:e625bcbda58a2077a2f53862e647a4496991959b8e386df7a79f732236f18e9f
 size 60010048

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a19147b8c5bee718d325cc69e052a9d7b1c02ff81e02f720da3c271048875c83
 size 30427860

 version https://git-lfs.github.com/spec/v1
+oid sha256:03e4632d18d4e91d695ae7e07e8db236469a783bace61d269def1460bacaf75a
 size 30427860

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:584ad9e5089f3d6aa8e9e8b89c55f8d862ea0d388ff3164ca7b9ce670f8a69af
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:96200d4ca8dc27a2feac013eb62dd34ec99b5598bcdd66554ba3bfa0cfc0128f
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:975a394e3f3cbbd67dfc86dc023c0bf94ad366845e30e55347ecdc023963d2d7
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:5aa8b796ae61fd6c2a8cd520f49549ae764a41fd16083782a810533cbf63c112
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,142 +1,1339 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.03950617283950617,
   "eval_steps": 500,
-  "global_step": 18,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0021947873799725653,
-      "grad_norm": 0.5619140863418579,
-      "learning_rate": 5.000000000000001e-07,
-      "loss": 2.0945,
       "step": 1
     },
     {
       "epoch": 0.0043895747599451305,
-      "grad_norm": 0.5982444286346436,
       "learning_rate": 1.0000000000000002e-06,
-      "loss": 2.1405,
-      "step": 2
     },
     {
       "epoch": 0.006584362139917695,
-      "grad_norm": 0.5753356218338013,
-      "learning_rate": 1.5e-06,
-      "loss": 2.0972,
-      "step": 3
     },
     {
       "epoch": 0.008779149519890261,
-      "grad_norm": 0.6162204146385193,
-      "learning_rate": 2.0000000000000003e-06,
-      "loss": 2.1892,
-      "step": 4
     },
     {
       "epoch": 0.010973936899862825,
-      "grad_norm": 0.6353598833084106,
-      "learning_rate": 2.5e-06,
-      "loss": 2.1669,
-      "step": 5
     },
     {
       "epoch": 0.01316872427983539,
-      "grad_norm": 0.5805812478065491,
-      "learning_rate": 3e-06,
-      "loss": 2.1406,
-      "step": 6
     },
     {
       "epoch": 0.015363511659807956,
-      "grad_norm": 0.6394542455673218,
-      "learning_rate": 3.5000000000000004e-06,
-      "loss": 2.2199,
-      "step": 7
     },
     {
       "epoch": 0.017558299039780522,
-      "grad_norm": 0.566260039806366,
-      "learning_rate": 4.000000000000001e-06,
-      "loss": 2.0736,
-      "step": 8
     },
     {
       "epoch": 0.019753086419753086,
-      "grad_norm": 0.5819919109344482,
-      "learning_rate": 4.5e-06,
-      "loss": 2.1244,
-      "step": 9
     },
     {
       "epoch": 0.02194787379972565,
-      "grad_norm": 0.6622738242149353,
-      "learning_rate": 5e-06,
-      "loss": 2.165,
-      "step": 10
     },
     {
       "epoch": 0.024142661179698217,
-      "grad_norm": 0.5929241180419922,
-      "learning_rate": 5.500000000000001e-06,
-      "loss": 2.0738,
-      "step": 11
     },
     {
       "epoch": 0.02633744855967078,
-      "grad_norm": 0.6357274651527405,
-      "learning_rate": 6e-06,
-      "loss": 2.1277,
-      "step": 12
     },
     {
       "epoch": 0.02853223593964335,
-      "grad_norm": 0.612406313419342,
-      "learning_rate": 6.5000000000000004e-06,
-      "loss": 2.1812,
-      "step": 13
     },
     {
       "epoch": 0.030727023319615913,
-      "grad_norm": 0.5351918339729309,
-      "learning_rate": 7.000000000000001e-06,
-      "loss": 2.0624,
-      "step": 14
     },
     {
       "epoch": 0.03292181069958848,
-      "grad_norm": 0.5575801730155945,
-      "learning_rate": 7.5e-06,
-      "loss": 2.0872,
-      "step": 15
     },
     {
       "epoch": 0.035116598079561044,
-      "grad_norm": 0.5277993679046631,
-      "learning_rate": 8.000000000000001e-06,
-      "loss": 2.0443,
-      "step": 16
     },
     {
       "epoch": 0.03731138545953361,
-      "grad_norm": 0.5522028207778931,
-      "learning_rate": 8.500000000000002e-06,
-      "loss": 2.1187,
-      "step": 17
     },
     {
       "epoch": 0.03950617283950617,
-      "grad_norm": 0.5474849343299866,
       "learning_rate": 9e-06,
-      "loss": 2.1184,
-      "step": 18
     }
   ],
   "logging_steps": 1,
-  "max_steps": 455,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 1,
@@ -152,7 +1349,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.186071270264013e+16,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.2074074074074074,
   "eval_steps": 500,
+  "global_step": 189,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.0010973936899862826,
+      "grad_norm": 0.5590636134147644,
+      "learning_rate": 2.0000000000000002e-07,
+      "loss": 2.0815,
       "step": 1
     },
+    {
+      "epoch": 0.0021947873799725653,
+      "grad_norm": 0.5765010714530945,
+      "learning_rate": 4.0000000000000003e-07,
+      "loss": 2.1074,
+      "step": 2
+    },
+    {
+      "epoch": 0.0032921810699588477,
+      "grad_norm": 0.584500253200531,
+      "learning_rate": 6.000000000000001e-07,
+      "loss": 2.1012,
+      "step": 3
+    },
     {
       "epoch": 0.0043895747599451305,
+      "grad_norm": 0.6246315836906433,
+      "learning_rate": 8.000000000000001e-07,
+      "loss": 2.1795,
+      "step": 4
+    },
+    {
+      "epoch": 0.0054869684499314125,
+      "grad_norm": 0.5558974146842957,
       "learning_rate": 1.0000000000000002e-06,
+      "loss": 2.0729,
+      "step": 5
     },
     {
       "epoch": 0.006584362139917695,
+      "grad_norm": 0.6098957657814026,
+      "learning_rate": 1.2000000000000002e-06,
+      "loss": 2.1216,
+      "step": 6
+    },
+    {
+      "epoch": 0.007681755829903978,
+      "grad_norm": 0.5948249697685242,
+      "learning_rate": 1.4000000000000001e-06,
+      "loss": 2.1381,
+      "step": 7
     },
     {
       "epoch": 0.008779149519890261,
+      "grad_norm": 0.6547859311103821,
+      "learning_rate": 1.6000000000000001e-06,
+      "loss": 2.2398,
+      "step": 8
+    },
+    {
+      "epoch": 0.009876543209876543,
+      "grad_norm": 0.6762146353721619,
+      "learning_rate": 1.8000000000000001e-06,
+      "loss": 2.1872,
+      "step": 9
     },
     {
       "epoch": 0.010973936899862825,
+      "grad_norm": 0.6071422100067139,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 2.1451,
+      "step": 10
+    },
+    {
+      "epoch": 0.012071330589849109,
+      "grad_norm": 0.5874962210655212,
+      "learning_rate": 2.2e-06,
+      "loss": 2.1632,
+      "step": 11
     },
     {
       "epoch": 0.01316872427983539,
+      "grad_norm": 0.5862544775009155,
+      "learning_rate": 2.4000000000000003e-06,
+      "loss": 2.1152,
+      "step": 12
+    },
+    {
+      "epoch": 0.014266117969821674,
+      "grad_norm": 0.614499032497406,
+      "learning_rate": 2.6e-06,
+      "loss": 2.1906,
+      "step": 13
     },
     {
       "epoch": 0.015363511659807956,
+      "grad_norm": 0.6741944551467896,
+      "learning_rate": 2.8000000000000003e-06,
+      "loss": 2.2436,
+      "step": 14
+    },
+    {
+      "epoch": 0.01646090534979424,
+      "grad_norm": 0.5827784538269043,
+      "learning_rate": 3e-06,
+      "loss": 2.0787,
+      "step": 15
     },
     {
       "epoch": 0.017558299039780522,
+      "grad_norm": 0.5573399066925049,
+      "learning_rate": 3.2000000000000003e-06,
+      "loss": 2.0608,
+      "step": 16
+    },
+    {
+      "epoch": 0.018655692729766804,
+      "grad_norm": 0.5720248818397522,
+      "learning_rate": 3.4000000000000005e-06,
+      "loss": 2.1152,
+      "step": 17
     },
     {
       "epoch": 0.019753086419753086,
+      "grad_norm": 0.5938870310783386,
+      "learning_rate": 3.6000000000000003e-06,
+      "loss": 2.1217,
+      "step": 18
+    },
+    {
+      "epoch": 0.020850480109739368,
+      "grad_norm": 0.6471643447875977,
+      "learning_rate": 3.8000000000000005e-06,
+      "loss": 2.1649,
+      "step": 19
     },
     {
       "epoch": 0.02194787379972565,
+      "grad_norm": 0.6734815835952759,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 2.1458,
+      "step": 20
+    },
+    {
+      "epoch": 0.023045267489711935,
+      "grad_norm": 0.5911048054695129,
+      "learning_rate": 4.2000000000000004e-06,
+      "loss": 2.0573,
+      "step": 21
     },
     {
       "epoch": 0.024142661179698217,
+      "grad_norm": 0.5842518210411072,
+      "learning_rate": 4.4e-06,
+      "loss": 2.0675,
+      "step": 22
+    },
+    {
+      "epoch": 0.0252400548696845,
+      "grad_norm": 0.6548774242401123,
+      "learning_rate": 4.600000000000001e-06,
+      "loss": 2.1,
+      "step": 23
     },
     {
       "epoch": 0.02633744855967078,
+      "grad_norm": 0.5978567600250244,
+      "learning_rate": 4.800000000000001e-06,
+      "loss": 2.1234,
+      "step": 24
+    },
+    {
+      "epoch": 0.027434842249657063,
+      "grad_norm": 0.6166532635688782,
+      "learning_rate": 5e-06,
+      "loss": 2.1784,
+      "step": 25
     },
     {
       "epoch": 0.02853223593964335,
+      "grad_norm": 0.5846672058105469,
+      "learning_rate": 5.2e-06,
+      "loss": 2.1453,
+      "step": 26
+    },
+    {
+      "epoch": 0.02962962962962963,
+      "grad_norm": 0.5325105786323547,
+      "learning_rate": 5.400000000000001e-06,
+      "loss": 2.0348,
+      "step": 27
     },
     {
       "epoch": 0.030727023319615913,
+      "grad_norm": 0.5196258425712585,
+      "learning_rate": 5.600000000000001e-06,
+      "loss": 2.0484,
+      "step": 28
+    },
+    {
+      "epoch": 0.031824417009602195,
+      "grad_norm": 0.5424355268478394,
+      "learning_rate": 5.8e-06,
+      "loss": 2.0728,
+      "step": 29
     },
     {
       "epoch": 0.03292181069958848,
+      "grad_norm": 0.5482433438301086,
+      "learning_rate": 6e-06,
+      "loss": 2.051,
+      "step": 30
+    },
+    {
+      "epoch": 0.03401920438957476,
+      "grad_norm": 0.5127721428871155,
+      "learning_rate": 6.200000000000001e-06,
+      "loss": 2.0508,
+      "step": 31
     },
     {
       "epoch": 0.035116598079561044,
+      "grad_norm": 0.5115760564804077,
+      "learning_rate": 6.4000000000000006e-06,
+      "loss": 1.9822,
+      "step": 32
+    },
+    {
+      "epoch": 0.03621399176954732,
+      "grad_norm": 0.5571993589401245,
+      "learning_rate": 6.600000000000001e-06,
+      "loss": 2.0929,
+      "step": 33
     },
     {
       "epoch": 0.03731138545953361,
+      "grad_norm": 0.5175538063049316,
+      "learning_rate": 6.800000000000001e-06,
+      "loss": 2.0776,
+      "step": 34
+    },
+    {
+      "epoch": 0.038408779149519894,
+      "grad_norm": 0.5197016596794128,
+      "learning_rate": 7e-06,
+      "loss": 2.0747,
+      "step": 35
     },
     {
       "epoch": 0.03950617283950617,
+      "grad_norm": 0.5387422442436218,
+      "learning_rate": 7.2000000000000005e-06,
+      "loss": 2.0864,
+      "step": 36
+    },
+    {
+      "epoch": 0.04060356652949246,
+      "grad_norm": 0.5122085809707642,
+      "learning_rate": 7.4e-06,
+      "loss": 2.0737,
+      "step": 37
+    },
+    {
+      "epoch": 0.041700960219478736,
+      "grad_norm": 0.6023052334785461,
+      "learning_rate": 7.600000000000001e-06,
+      "loss": 2.0519,
+      "step": 38
+    },
+    {
+      "epoch": 0.04279835390946502,
+      "grad_norm": 0.5678107738494873,
+      "learning_rate": 7.800000000000002e-06,
+      "loss": 2.1175,
+      "step": 39
+    },
+    {
+      "epoch": 0.0438957475994513,
+      "grad_norm": 0.5373347401618958,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 2.0667,
+      "step": 40
+    },
+    {
+      "epoch": 0.044993141289437585,
+      "grad_norm": 0.5342459082603455,
+      "learning_rate": 8.2e-06,
+      "loss": 2.081,
+      "step": 41
+    },
+    {
+      "epoch": 0.04609053497942387,
+      "grad_norm": 0.6377962231636047,
+      "learning_rate": 8.400000000000001e-06,
+      "loss": 2.1576,
+      "step": 42
+    },
+    {
+      "epoch": 0.04718792866941015,
+      "grad_norm": 0.46719589829444885,
+      "learning_rate": 8.6e-06,
+      "loss": 1.9755,
+      "step": 43
+    },
+    {
+      "epoch": 0.048285322359396435,
+      "grad_norm": 0.4903099834918976,
+      "learning_rate": 8.8e-06,
+      "loss": 2.0195,
+      "step": 44
+    },
+    {
+      "epoch": 0.04938271604938271,
+      "grad_norm": 0.460610955953598,
       "learning_rate": 9e-06,
+      "loss": 1.9456,
+      "step": 45
+    },
+    {
+      "epoch": 0.050480109739369,
+      "grad_norm": 0.45545804500579834,
+      "learning_rate": 9.200000000000002e-06,
+      "loss": 2.0255,
+      "step": 46
+    },
+    {
+      "epoch": 0.051577503429355284,
+      "grad_norm": 0.4546717405319214,
+      "learning_rate": 9.4e-06,
+      "loss": 1.9741,
+      "step": 47
+    },
+    {
+      "epoch": 0.05267489711934156,
+      "grad_norm": 0.46261245012283325,
+      "learning_rate": 9.600000000000001e-06,
+      "loss": 2.0104,
+      "step": 48
+    },
+    {
+      "epoch": 0.05377229080932785,
+      "grad_norm": 0.43465015292167664,
+      "learning_rate": 9.800000000000001e-06,
+      "loss": 1.9356,
+      "step": 49
+    },
+    {
+      "epoch": 0.05486968449931413,
+      "grad_norm": 0.42450013756752014,
+      "learning_rate": 1e-05,
+      "loss": 1.9312,
+      "step": 50
+    },
+    {
+      "epoch": 0.05596707818930041,
+      "grad_norm": 0.4223531484603882,
+      "learning_rate": 1.02e-05,
+      "loss": 1.9022,
+      "step": 51
+    },
+    {
+      "epoch": 0.0570644718792867,
+      "grad_norm": 0.39140722155570984,
+      "learning_rate": 1.04e-05,
+      "loss": 1.9784,
+      "step": 52
+    },
+    {
+      "epoch": 0.058161865569272976,
+      "grad_norm": 0.4256257712841034,
+      "learning_rate": 1.0600000000000002e-05,
+      "loss": 1.9278,
+      "step": 53
+    },
+    {
+      "epoch": 0.05925925925925926,
+      "grad_norm": 0.45769235491752625,
+      "learning_rate": 1.0800000000000002e-05,
+      "loss": 2.031,
+      "step": 54
+    },
+    {
+      "epoch": 0.06035665294924554,
+      "grad_norm": 0.4826626181602478,
+      "learning_rate": 1.1000000000000001e-05,
+      "loss": 1.9647,
+      "step": 55
+    },
+    {
+      "epoch": 0.061454046639231825,
+      "grad_norm": 0.4414077699184418,
+      "learning_rate": 1.1200000000000001e-05,
+      "loss": 2.0007,
+      "step": 56
+    },
+    {
+      "epoch": 0.06255144032921811,
+      "grad_norm": 0.39386674761772156,
+      "learning_rate": 1.14e-05,
+      "loss": 1.8676,
+      "step": 57
+    },
+    {
+      "epoch": 0.06364883401920439,
+      "grad_norm": 0.386751264333725,
+      "learning_rate": 1.16e-05,
+      "loss": 1.9471,
+      "step": 58
+    },
+    {
+      "epoch": 0.06474622770919067,
+      "grad_norm": 0.3908196985721588,
+      "learning_rate": 1.18e-05,
+      "loss": 1.9722,
+      "step": 59
+    },
+    {
+      "epoch": 0.06584362139917696,
+      "grad_norm": 0.39014488458633423,
+      "learning_rate": 1.2e-05,
+      "loss": 1.9328,
+      "step": 60
+    },
+    {
+      "epoch": 0.06694101508916324,
+      "grad_norm": 0.3620125651359558,
+      "learning_rate": 1.22e-05,
+      "loss": 1.9918,
+      "step": 61
+    },
+    {
+      "epoch": 0.06803840877914952,
+      "grad_norm": 0.37926608324050903,
+      "learning_rate": 1.2400000000000002e-05,
+      "loss": 1.8233,
+      "step": 62
+    },
+    {
+      "epoch": 0.0691358024691358,
+      "grad_norm": 0.44235774874687195,
+      "learning_rate": 1.2600000000000001e-05,
+      "loss": 1.9558,
+      "step": 63
+    },
+    {
+      "epoch": 0.07023319615912209,
+      "grad_norm": 0.3922639489173889,
+      "learning_rate": 1.2800000000000001e-05,
+      "loss": 1.993,
+      "step": 64
+    },
+    {
+      "epoch": 0.07133058984910837,
+      "grad_norm": 0.4201815128326416,
+      "learning_rate": 1.3000000000000001e-05,
+      "loss": 1.873,
+      "step": 65
+    },
+    {
+      "epoch": 0.07242798353909465,
+      "grad_norm": 0.3698742389678955,
+      "learning_rate": 1.3200000000000002e-05,
+      "loss": 1.8728,
+      "step": 66
+    },
+    {
+      "epoch": 0.07352537722908094,
+      "grad_norm": 0.38322025537490845,
+      "learning_rate": 1.3400000000000002e-05,
+      "loss": 1.8322,
+      "step": 67
+    },
+    {
+      "epoch": 0.07462277091906722,
+      "grad_norm": 0.37198516726493835,
+      "learning_rate": 1.3600000000000002e-05,
+      "loss": 1.932,
+      "step": 68
+    },
+    {
+      "epoch": 0.0757201646090535,
+      "grad_norm": 0.36407792568206787,
+      "learning_rate": 1.38e-05,
+      "loss": 1.826,
+      "step": 69
+    },
+    {
+      "epoch": 0.07681755829903979,
+      "grad_norm": 0.3603726625442505,
+      "learning_rate": 1.4e-05,
+      "loss": 1.8001,
+      "step": 70
+    },
+    {
+      "epoch": 0.07791495198902607,
+      "grad_norm": 0.37135443091392517,
+      "learning_rate": 1.4200000000000001e-05,
+      "loss": 1.8165,
+      "step": 71
+    },
+    {
+      "epoch": 0.07901234567901234,
+      "grad_norm": 0.37606292963027954,
+      "learning_rate": 1.4400000000000001e-05,
+      "loss": 1.7207,
+      "step": 72
+    },
+    {
+      "epoch": 0.08010973936899862,
+      "grad_norm": 0.3831545114517212,
+      "learning_rate": 1.46e-05,
+      "loss": 1.8692,
+      "step": 73
+    },
+    {
+      "epoch": 0.08120713305898491,
+      "grad_norm": 0.3911626935005188,
+      "learning_rate": 1.48e-05,
+      "loss": 1.8687,
+      "step": 74
+    },
+    {
+      "epoch": 0.0823045267489712,
+      "grad_norm": 0.39615172147750854,
+      "learning_rate": 1.5000000000000002e-05,
+      "loss": 1.7774,
+      "step": 75
+    },
+    {
+      "epoch": 0.08340192043895747,
+      "grad_norm": 0.37161362171173096,
+      "learning_rate": 1.5200000000000002e-05,
+      "loss": 1.723,
+      "step": 76
+    },
+    {
+      "epoch": 0.08449931412894376,
+      "grad_norm": 0.3870552182197571,
+      "learning_rate": 1.54e-05,
+      "loss": 1.7314,
+      "step": 77
+    },
+    {
+      "epoch": 0.08559670781893004,
+      "grad_norm": 0.4143535792827606,
+      "learning_rate": 1.5600000000000003e-05,
+      "loss": 1.7912,
+      "step": 78
+    },
+    {
+      "epoch": 0.08669410150891632,
+      "grad_norm": 0.4270377457141876,
+      "learning_rate": 1.58e-05,
+      "loss": 1.78,
+      "step": 79
+    },
+    {
+      "epoch": 0.0877914951989026,
+      "grad_norm": 0.4058167636394501,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 1.7228,
+      "step": 80
+    },
+    {
+      "epoch": 0.08888888888888889,
+      "grad_norm": 0.42461684346199036,
+      "learning_rate": 1.62e-05,
+      "loss": 1.8259,
+      "step": 81
+    },
+    {
+      "epoch": 0.08998628257887517,
+      "grad_norm": 0.37542393803596497,
+      "learning_rate": 1.64e-05,
+      "loss": 1.7579,
+      "step": 82
+    },
+    {
+      "epoch": 0.09108367626886145,
+      "grad_norm": 0.4218761622905731,
+      "learning_rate": 1.66e-05,
+      "loss": 1.8108,
+      "step": 83
+    },
+    {
+      "epoch": 0.09218106995884774,
+      "grad_norm": 0.4039926528930664,
+      "learning_rate": 1.6800000000000002e-05,
+      "loss": 1.7625,
+      "step": 84
+    },
+    {
+      "epoch": 0.09327846364883402,
+      "grad_norm": 0.3840247690677643,
+      "learning_rate": 1.7e-05,
+      "loss": 1.7455,
+      "step": 85
+    },
+    {
+      "epoch": 0.0943758573388203,
+      "grad_norm": 0.41403114795684814,
+      "learning_rate": 1.72e-05,
+      "loss": 1.6384,
+      "step": 86
+    },
+    {
+      "epoch": 0.09547325102880659,
+      "grad_norm": 0.3860199451446533,
+      "learning_rate": 1.7400000000000003e-05,
+      "loss": 1.6876,
+      "step": 87
+    },
+    {
+      "epoch": 0.09657064471879287,
+      "grad_norm": 0.39576929807662964,
+      "learning_rate": 1.76e-05,
+      "loss": 1.7044,
+      "step": 88
+    },
+    {
+      "epoch": 0.09766803840877915,
+      "grad_norm": 0.4149666130542755,
+      "learning_rate": 1.7800000000000002e-05,
+      "loss": 1.657,
+      "step": 89
+    },
+    {
+      "epoch": 0.09876543209876543,
+      "grad_norm": 0.46741801500320435,
+      "learning_rate": 1.8e-05,
+      "loss": 1.6955,
+      "step": 90
+    },
+    {
+      "epoch": 0.09986282578875172,
+      "grad_norm": 0.41203179955482483,
+      "learning_rate": 1.8200000000000002e-05,
+      "loss": 1.6259,
+      "step": 91
+    },
+    {
+      "epoch": 0.100960219478738,
+      "grad_norm": 0.4069835841655731,
+      "learning_rate": 1.8400000000000003e-05,
+      "loss": 1.6547,
+      "step": 92
+    },
+    {
+      "epoch": 0.10205761316872428,
+      "grad_norm": 0.4733927845954895,
+      "learning_rate": 1.86e-05,
+      "loss": 1.6419,
+      "step": 93
+    },
+    {
+      "epoch": 0.10315500685871057,
+      "grad_norm": 0.4243912100791931,
+      "learning_rate": 1.88e-05,
+      "loss": 1.5943,
+      "step": 94
+    },
+    {
+      "epoch": 0.10425240054869685,
+      "grad_norm": 0.48016536235809326,
+      "learning_rate": 1.9e-05,
+      "loss": 1.5303,
+      "step": 95
+    },
+    {
+      "epoch": 0.10534979423868313,
+      "grad_norm": 0.44309377670288086,
+      "learning_rate": 1.9200000000000003e-05,
+      "loss": 1.5682,
+      "step": 96
+    },
+    {
+      "epoch": 0.1064471879286694,
+      "grad_norm": 0.45906150341033936,
+      "learning_rate": 1.94e-05,
+      "loss": 1.5747,
+      "step": 97
+    },
+    {
+      "epoch": 0.1075445816186557,
+      "grad_norm": 0.4476592540740967,
+      "learning_rate": 1.9600000000000002e-05,
+      "loss": 1.5563,
+      "step": 98
+    },
+    {
+      "epoch": 0.10864197530864197,
+      "grad_norm": 0.432974249124527,
+      "learning_rate": 1.98e-05,
+      "loss": 1.5782,
+      "step": 99
+    },
+    {
+      "epoch": 0.10973936899862825,
+      "grad_norm": 0.4596545398235321,
+      "learning_rate": 2e-05,
+      "loss": 1.5268,
+      "step": 100
+    },
+    {
+      "epoch": 0.11083676268861455,
+      "grad_norm": 0.49297234416007996,
+      "learning_rate": 1.997533908754624e-05,
+      "loss": 1.4867,
+      "step": 101
+    },
+    {
+      "epoch": 0.11193415637860082,
+      "grad_norm": 0.4385370910167694,
+      "learning_rate": 1.995067817509248e-05,
+      "loss": 1.5288,
+      "step": 102
+    },
+    {
+      "epoch": 0.1130315500685871,
+      "grad_norm": 0.4075908660888672,
+      "learning_rate": 1.992601726263872e-05,
+      "loss": 1.5342,
+      "step": 103
+    },
+    {
+      "epoch": 0.1141289437585734,
+      "grad_norm": 0.4361695647239685,
+      "learning_rate": 1.990135635018496e-05,
+      "loss": 1.4855,
+      "step": 104
+    },
+    {
+      "epoch": 0.11522633744855967,
+      "grad_norm": 0.3582554757595062,
+      "learning_rate": 1.9876695437731196e-05,
+      "loss": 1.5212,
+      "step": 105
+    },
+    {
+      "epoch": 0.11632373113854595,
+      "grad_norm": 0.39850226044654846,
+      "learning_rate": 1.985203452527744e-05,
+      "loss": 1.4019,
+      "step": 106
+    },
+    {
+      "epoch": 0.11742112482853223,
+      "grad_norm": 0.35860705375671387,
+      "learning_rate": 1.9827373612823677e-05,
+      "loss": 1.401,
+      "step": 107
+    },
+    {
+      "epoch": 0.11851851851851852,
+      "grad_norm": 0.34796416759490967,
+      "learning_rate": 1.9802712700369916e-05,
+      "loss": 1.3828,
+      "step": 108
+    },
+    {
+      "epoch": 0.1196159122085048,
+      "grad_norm": 0.3401014506816864,
+      "learning_rate": 1.9778051787916155e-05,
+      "loss": 1.3946,
+      "step": 109
+    },
+    {
+      "epoch": 0.12071330589849108,
+      "grad_norm": 0.36761683225631714,
+      "learning_rate": 1.9753390875462394e-05,
+      "loss": 1.3895,
+      "step": 110
+    },
+    {
+      "epoch": 0.12181069958847737,
+      "grad_norm": 0.3479093313217163,
+      "learning_rate": 1.9728729963008633e-05,
+      "loss": 1.3704,
+      "step": 111
+    },
+    {
+      "epoch": 0.12290809327846365,
+      "grad_norm": 0.3511699438095093,
+      "learning_rate": 1.9704069050554872e-05,
+      "loss": 1.4554,
+      "step": 112
+    },
+    {
+      "epoch": 0.12400548696844993,
+      "grad_norm": 0.3108881413936615,
+      "learning_rate": 1.967940813810111e-05,
+      "loss": 1.4446,
+      "step": 113
+    },
+    {
+      "epoch": 0.12510288065843622,
+      "grad_norm": 0.35024315118789673,
+      "learning_rate": 1.965474722564735e-05,
+      "loss": 1.3689,
+      "step": 114
+    },
+    {
+      "epoch": 0.1262002743484225,
+      "grad_norm": 0.3867047429084778,
+      "learning_rate": 1.9630086313193592e-05,
+      "loss": 1.323,
+      "step": 115
+    },
+    {
+      "epoch": 0.12729766803840878,
+      "grad_norm": 0.3047441244125366,
+      "learning_rate": 1.9605425400739828e-05,
+      "loss": 1.3858,
+      "step": 116
+    },
+    {
+      "epoch": 0.12839506172839507,
+      "grad_norm": 0.34278514981269836,
+      "learning_rate": 1.9580764488286066e-05,
+      "loss": 1.4294,
+      "step": 117
+    },
+    {
+      "epoch": 0.12949245541838134,
+      "grad_norm": 0.3425813317298889,
+      "learning_rate": 1.955610357583231e-05,
+      "loss": 1.3426,
+      "step": 118
+    },
+    {
+      "epoch": 0.13058984910836763,
+      "grad_norm": 0.38938474655151367,
+      "learning_rate": 1.9531442663378544e-05,
+      "loss": 1.3857,
+      "step": 119
+    },
+    {
+      "epoch": 0.13168724279835392,
+      "grad_norm": 0.323210746049881,
+      "learning_rate": 1.9506781750924787e-05,
+      "loss": 1.3922,
+      "step": 120
+    },
+    {
+      "epoch": 0.13278463648834019,
+      "grad_norm": 0.33866652846336365,
+      "learning_rate": 1.9482120838471025e-05,
+      "loss": 1.3969,
+      "step": 121
+    },
+    {
+      "epoch": 0.13388203017832648,
+      "grad_norm": 0.3292064368724823,
+      "learning_rate": 1.9457459926017264e-05,
+      "loss": 1.417,
+      "step": 122
+    },
+    {
+      "epoch": 0.13497942386831277,
+      "grad_norm": 0.3402523100376129,
+      "learning_rate": 1.9432799013563503e-05,
+      "loss": 1.3128,
+      "step": 123
+    },
+    {
+      "epoch": 0.13607681755829903,
+      "grad_norm": 0.3458561599254608,
+      "learning_rate": 1.9408138101109742e-05,
+      "loss": 1.3259,
+      "step": 124
+    },
+    {
+      "epoch": 0.13717421124828533,
+      "grad_norm": 0.3515847325325012,
+      "learning_rate": 1.938347718865598e-05,
+      "loss": 1.3399,
+      "step": 125
+    },
+    {
+      "epoch": 0.1382716049382716,
+      "grad_norm": 0.3250204622745514,
+      "learning_rate": 1.935881627620222e-05,
+      "loss": 1.325,
+      "step": 126
+    },
+    {
+      "epoch": 0.13936899862825788,
+      "grad_norm": 0.37784844636917114,
+      "learning_rate": 1.9334155363748462e-05,
+      "loss": 1.3475,
+      "step": 127
+    },
+    {
+      "epoch": 0.14046639231824418,
+      "grad_norm": 0.39530113339424133,
+      "learning_rate": 1.9309494451294698e-05,
+      "loss": 1.3755,
+      "step": 128
+    },
+    {
+      "epoch": 0.14156378600823044,
+      "grad_norm": 0.37158292531967163,
+      "learning_rate": 1.928483353884094e-05,
+      "loss": 1.3443,
+      "step": 129
+    },
+    {
+      "epoch": 0.14266117969821673,
+      "grad_norm": 0.43860819935798645,
+      "learning_rate": 1.9260172626387176e-05,
+      "loss": 1.3849,
+      "step": 130
+    },
+    {
+      "epoch": 0.14375857338820303,
+      "grad_norm": 0.43812137842178345,
+      "learning_rate": 1.9235511713933418e-05,
+      "loss": 1.335,
+      "step": 131
+    },
+    {
+      "epoch": 0.1448559670781893,
+      "grad_norm": 0.4332147538661957,
+      "learning_rate": 1.9210850801479657e-05,
+      "loss": 1.286,
+      "step": 132
+    },
+    {
+      "epoch": 0.14595336076817558,
+      "grad_norm": 0.3903179168701172,
+      "learning_rate": 1.9186189889025896e-05,
+      "loss": 1.3319,
+      "step": 133
+    },
+    {
+      "epoch": 0.14705075445816188,
+      "grad_norm": 0.4002543091773987,
+      "learning_rate": 1.9161528976572135e-05,
+      "loss": 1.384,
+      "step": 134
+    },
+    {
+      "epoch": 0.14814814814814814,
+      "grad_norm": 0.3899374008178711,
+      "learning_rate": 1.9136868064118374e-05,
+      "loss": 1.4108,
+      "step": 135
+    },
+    {
+      "epoch": 0.14924554183813443,
+      "grad_norm": 0.4650733172893524,
+      "learning_rate": 1.9112207151664612e-05,
+      "loss": 1.3531,
+      "step": 136
+    },
+    {
+      "epoch": 0.15034293552812072,
+      "grad_norm": 0.4140302538871765,
+      "learning_rate": 1.908754623921085e-05,
+      "loss": 1.3683,
+      "step": 137
+    },
+    {
+      "epoch": 0.151440329218107,
+      "grad_norm": 0.46601012349128723,
+      "learning_rate": 1.9062885326757094e-05,
+      "loss": 1.3669,
+      "step": 138
+    },
+    {
+      "epoch": 0.15253772290809328,
+      "grad_norm": 0.4634631276130676,
+      "learning_rate": 1.903822441430333e-05,
+      "loss": 1.3653,
+      "step": 139
+    },
+    {
+      "epoch": 0.15363511659807957,
+      "grad_norm": 0.49095258116722107,
+      "learning_rate": 1.901356350184957e-05,
+      "loss": 1.2959,
+      "step": 140
+    },
+    {
+      "epoch": 0.15473251028806584,
+      "grad_norm": 0.504693865776062,
+      "learning_rate": 1.898890258939581e-05,
+      "loss": 1.3398,
+      "step": 141
+    },
+    {
+      "epoch": 0.15582990397805213,
+      "grad_norm": 0.4257521629333496,
+      "learning_rate": 1.896424167694205e-05,
+      "loss": 1.4279,
+      "step": 142
+    },
+    {
+      "epoch": 0.1569272976680384,
+      "grad_norm": 0.4088304936885834,
+      "learning_rate": 1.8939580764488288e-05,
+      "loss": 1.349,
+      "step": 143
+    },
+    {
+      "epoch": 0.1580246913580247,
+      "grad_norm": 0.46413642168045044,
+      "learning_rate": 1.8914919852034527e-05,
+      "loss": 1.225,
+      "step": 144
+    },
+    {
+      "epoch": 0.15912208504801098,
+      "grad_norm": 0.42032694816589355,
+      "learning_rate": 1.8890258939580766e-05,
+      "loss": 1.3018,
+      "step": 145
+    },
+    {
+      "epoch": 0.16021947873799725,
+      "grad_norm": 0.3075321614742279,
+      "learning_rate": 1.8865598027127005e-05,
+      "loss": 1.3478,
+      "step": 146
+    },
+    {
+      "epoch": 0.16131687242798354,
+      "grad_norm": 0.2494765967130661,
+      "learning_rate": 1.8840937114673244e-05,
+      "loss": 1.2855,
+      "step": 147
+    },
+    {
+      "epoch": 0.16241426611796983,
+      "grad_norm": 0.22226963937282562,
+      "learning_rate": 1.8816276202219483e-05,
+      "loss": 1.2765,
+      "step": 148
+    },
+    {
+      "epoch": 0.1635116598079561,
+      "grad_norm": 0.19540823996067047,
+      "learning_rate": 1.879161528976572e-05,
+      "loss": 1.3324,
+      "step": 149
+    },
+    {
+      "epoch": 0.1646090534979424,
+      "grad_norm": 0.1837640106678009,
+      "learning_rate": 1.876695437731196e-05,
+      "loss": 1.357,
+      "step": 150
+    },
+    {
+      "epoch": 0.16570644718792868,
+      "grad_norm": 0.21854494512081146,
+      "learning_rate": 1.87422934648582e-05,
+      "loss": 1.3023,
+      "step": 151
+    },
+    {
+      "epoch": 0.16680384087791494,
+      "grad_norm": 0.1981099396944046,
+      "learning_rate": 1.8717632552404442e-05,
+      "loss": 1.2727,
+      "step": 152
+    },
+    {
+      "epoch": 0.16790123456790124,
+      "grad_norm": 0.1930709034204483,
+      "learning_rate": 1.8692971639950677e-05,
+      "loss": 1.309,
+      "step": 153
+    },
+    {
+      "epoch": 0.16899862825788753,
+      "grad_norm": 0.196756511926651,
+      "learning_rate": 1.866831072749692e-05,
+      "loss": 1.3215,
+      "step": 154
+    },
+    {
+      "epoch": 0.1700960219478738,
+      "grad_norm": 0.18768474459648132,
+      "learning_rate": 1.864364981504316e-05,
+      "loss": 1.3032,
+      "step": 155
+    },
+    {
+      "epoch": 0.17119341563786009,
+      "grad_norm": 0.22273507714271545,
+      "learning_rate": 1.8618988902589397e-05,
+      "loss": 1.312,
+      "step": 156
+    },
+    {
+      "epoch": 0.17229080932784638,
+      "grad_norm": 0.17846502363681793,
+      "learning_rate": 1.8594327990135636e-05,
+      "loss": 1.3079,
+      "step": 157
+    },
+    {
+      "epoch": 0.17338820301783264,
+      "grad_norm": 0.18322448432445526,
+      "learning_rate": 1.8569667077681875e-05,
+      "loss": 1.3537,
+      "step": 158
+    },
+    {
+      "epoch": 0.17448559670781894,
+      "grad_norm": 0.180659681558609,
+      "learning_rate": 1.8545006165228114e-05,
+      "loss": 1.32,
+      "step": 159
+    },
+    {
+      "epoch": 0.1755829903978052,
+      "grad_norm": 0.23424433171749115,
+      "learning_rate": 1.8520345252774353e-05,
+      "loss": 1.3525,
+      "step": 160
+    },
+    {
+      "epoch": 0.1766803840877915,
+      "grad_norm": 0.19839544594287872,
+      "learning_rate": 1.8495684340320595e-05,
+      "loss": 1.2567,
+      "step": 161
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 0.1809428632259369,
+      "learning_rate": 1.847102342786683e-05,
+      "loss": 1.4336,
+      "step": 162
+    },
+    {
+      "epoch": 0.17887517146776405,
+      "grad_norm": 0.18282510340213776,
+      "learning_rate": 1.8446362515413073e-05,
+      "loss": 1.3281,
+      "step": 163
+    },
+    {
+      "epoch": 0.17997256515775034,
+      "grad_norm": 0.18329757452011108,
+      "learning_rate": 1.8421701602959312e-05,
+      "loss": 1.2928,
+      "step": 164
+    },
+    {
+      "epoch": 0.18106995884773663,
+      "grad_norm": 0.1925593912601471,
+      "learning_rate": 1.839704069050555e-05,
+      "loss": 1.3299,
+      "step": 165
+    },
+    {
+      "epoch": 0.1821673525377229,
+      "grad_norm": 0.19288820028305054,
+      "learning_rate": 1.837237977805179e-05,
+      "loss": 1.3251,
+      "step": 166
+    },
+    {
+      "epoch": 0.1832647462277092,
+      "grad_norm": 0.18203546106815338,
+      "learning_rate": 1.834771886559803e-05,
+      "loss": 1.336,
+      "step": 167
+    },
+    {
+      "epoch": 0.18436213991769548,
+      "grad_norm": 0.20783405005931854,
+      "learning_rate": 1.8323057953144268e-05,
+      "loss": 1.3015,
+      "step": 168
+    },
+    {
+      "epoch": 0.18545953360768175,
+      "grad_norm": 0.16650988161563873,
+      "learning_rate": 1.8298397040690507e-05,
+      "loss": 1.3584,
+      "step": 169
+    },
+    {
+      "epoch": 0.18655692729766804,
+      "grad_norm": 0.19607752561569214,
+      "learning_rate": 1.8273736128236746e-05,
+      "loss": 1.3404,
+      "step": 170
+    },
+    {
+      "epoch": 0.18765432098765433,
+      "grad_norm": 0.1942966729402542,
+      "learning_rate": 1.8249075215782984e-05,
+      "loss": 1.3222,
+      "step": 171
+    },
+    {
+      "epoch": 0.1887517146776406,
+      "grad_norm": 0.23658064007759094,
+      "learning_rate": 1.8224414303329227e-05,
+      "loss": 1.3226,
+      "step": 172
+    },
+    {
+      "epoch": 0.1898491083676269,
+      "grad_norm": 0.1818239539861679,
+      "learning_rate": 1.8199753390875462e-05,
+      "loss": 1.378,
+      "step": 173
+    },
+    {
+      "epoch": 0.19094650205761318,
+      "grad_norm": 0.1911785900592804,
+      "learning_rate": 1.8175092478421705e-05,
+      "loss": 1.4018,
+      "step": 174
+    },
+    {
+      "epoch": 0.19204389574759945,
+      "grad_norm": 0.20029093325138092,
+      "learning_rate": 1.8150431565967943e-05,
+      "loss": 1.3754,
+      "step": 175
+    },
+    {
+      "epoch": 0.19314128943758574,
+      "grad_norm": 0.17715874314308167,
+      "learning_rate": 1.8125770653514182e-05,
+      "loss": 1.2913,
+      "step": 176
+    },
+    {
+      "epoch": 0.194238683127572,
+      "grad_norm": 0.19813261926174164,
+      "learning_rate": 1.810110974106042e-05,
+      "loss": 1.4297,
+      "step": 177
+    },
+    {
+      "epoch": 0.1953360768175583,
+      "grad_norm": 0.18769969046115875,
+      "learning_rate": 1.807644882860666e-05,
+      "loss": 1.3742,
+      "step": 178
+    },
+    {
+      "epoch": 0.1964334705075446,
+      "grad_norm": 0.19279271364212036,
+      "learning_rate": 1.80517879161529e-05,
+      "loss": 1.2959,
+      "step": 179
+    },
+    {
+      "epoch": 0.19753086419753085,
+      "grad_norm": 0.18196497857570648,
+      "learning_rate": 1.8027127003699138e-05,
+      "loss": 1.257,
+      "step": 180
+    },
+    {
+      "epoch": 0.19862825788751715,
+      "grad_norm": 0.18429674208164215,
+      "learning_rate": 1.8002466091245377e-05,
+      "loss": 1.3139,
+      "step": 181
+    },
+    {
+      "epoch": 0.19972565157750344,
+      "grad_norm": 0.17658454179763794,
+      "learning_rate": 1.7977805178791616e-05,
+      "loss": 1.2467,
+      "step": 182
+    },
+    {
+      "epoch": 0.2008230452674897,
+      "grad_norm": 0.18059629201889038,
+      "learning_rate": 1.7953144266337855e-05,
+      "loss": 1.3461,
+      "step": 183
+    },
+    {
+      "epoch": 0.201920438957476,
+      "grad_norm": 0.17834338545799255,
+      "learning_rate": 1.7928483353884094e-05,
+      "loss": 1.3542,
+      "step": 184
+    },
+    {
+      "epoch": 0.2030178326474623,
+      "grad_norm": 0.19244344532489777,
+      "learning_rate": 1.7903822441430333e-05,
+      "loss": 1.3086,
+      "step": 185
+    },
+    {
+      "epoch": 0.20411522633744855,
+      "grad_norm": 0.19486865401268005,
+      "learning_rate": 1.7879161528976575e-05,
+      "loss": 1.2946,
+      "step": 186
+    },
+    {
+      "epoch": 0.20521262002743484,
+      "grad_norm": 0.19326968491077423,
+      "learning_rate": 1.785450061652281e-05,
+      "loss": 1.2794,
+      "step": 187
+    },
+    {
+      "epoch": 0.20631001371742114,
+      "grad_norm": 0.2171463817358017,
+      "learning_rate": 1.7829839704069053e-05,
+      "loss": 1.3009,
+      "step": 188
+    },
+    {
+      "epoch": 0.2074074074074074,
+      "grad_norm": 0.2024574726819992,
+      "learning_rate": 1.780517879161529e-05,
+      "loss": 1.3694,
+      "step": 189
     }
   ],
   "logging_steps": 1,
+  "max_steps": 911,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 1.1524363938073805e+17,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:13a31ab6ba72d5f444e77e773341db29fe2e54fcefb9ddcac5ddba8e0711e92e
 size 5240

 version https://git-lfs.github.com/spec/v1
+oid sha256:3925e6f4fb74ea74c296487c97f50eee65db504b312a1431f8d935775889ba02
 size 5240