Training in progress, step 10800, checkpoint

Browse files

Files changed (7) hide show

last-checkpoint/adapter_config.json +3 -3
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/tokenizer_config.json +5 -1
last-checkpoint/trainer_state.json +361 -4
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -23,13 +23,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "q_proj",
     "k_proj",
-    "down_proj",
     "o_proj",
-    "up_proj",
     "v_proj",
-    "gate_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "down_proj",
     "q_proj",
     "k_proj",
+    "gate_proj",
     "o_proj",
     "v_proj",
+    "up_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8a53adb1ece7e14078c5cbcd5925b731e174893ac4f79b83e75a1d118a6a16ca
 size 1556140392

 version https://git-lfs.github.com/spec/v1
+oid sha256:69616034c7478cf2dfccee5c0270ffd5060d30e5019b1fedb3826dd864fdc449
 size 1556140392

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ecc0b6e807c2e801b966aac222c07c5a4d3aa838d101d41d5e11b3af8c1b26c4
 size 791682818

 version https://git-lfs.github.com/spec/v1
+oid sha256:32eee9332b07af811bf55ae350443625f7835082aa14f81cafe85b65410c568c
 size 791682818

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:133a27a96bf2028fd94eb62846a16114ede5a872ddea6198ae6b8df77a089e67
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:94ff2db7929a990beccc40066cd3722e70139073f804431d949279fb773ea474
 size 1064

last-checkpoint/tokenizer_config.json CHANGED Viewed

@@ -2055,6 +2055,7 @@
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|eot_id|>",
   "extra_special_tokens": {},
   "model_input_names": [
     "input_ids",
     "attention_mask"
@@ -2062,5 +2063,8 @@
   "model_max_length": 4096,
   "pad_token": "<|eot_id|>",
   "padding_side": "left",
-  "tokenizer_class": "PreTrainedTokenizerFast"
 }

   "clean_up_tokenization_spaces": true,
   "eos_token": "<|eot_id|>",
   "extra_special_tokens": {},
+  "max_length": 4096,
   "model_input_names": [
     "input_ids",
     "attention_mask"
   "model_max_length": 4096,
   "pad_token": "<|eot_id|>",
   "padding_side": "left",
+  "stride": 0,
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first"
 }

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.0008345782458834428,
   "eval_steps": 500,
-  "global_step": 600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -28,10 +28,367 @@
       "learning_rate": 1.9999991549580503e-05,
       "loss": 1.9425,
       "step": 600
     }
   ],
   "logging_steps": 200,
-  "max_steps": 1437852,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 200,
@@ -47,7 +404,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 6846848392034304.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.007511209436860982,
   "eval_steps": 500,
+  "global_step": 10800,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 1.9999991549580503e-05,
       "loss": 1.9425,
       "step": 600
+    },
+    {
+      "epoch": 0.0005563858842119246,
+      "grad_norm": 1.1521191596984863,
+      "learning_rate": 1.999999622846857e-05,
+      "loss": 1.8664,
+      "step": 800
+    },
+    {
+      "epoch": 0.0006954823552649058,
+      "grad_norm": 1.6058714389801025,
+      "learning_rate": 1.999999409214783e-05,
+      "loss": 1.7909,
+      "step": 1000
+    },
+    {
+      "epoch": 0.0008345788263178869,
+      "grad_norm": 1.3741093873977661,
+      "learning_rate": 1.9999991478437182e-05,
+      "loss": 1.7653,
+      "step": 1200
+    },
+    {
+      "epoch": 0.0009736752973708681,
+      "grad_norm": 1.7162392139434814,
+      "learning_rate": 1.9999988387336754e-05,
+      "loss": 1.9238,
+      "step": 1400
+    },
+    {
+      "epoch": 0.001112771768423849,
+      "grad_norm": 3.3224897384643555,
+      "learning_rate": 1.9999984818846697e-05,
+      "loss": 1.8869,
+      "step": 1600
+    },
+    {
+      "epoch": 0.0012518682394768305,
+      "grad_norm": 1.6145182847976685,
+      "learning_rate": 1.999998077296718e-05,
+      "loss": 1.8906,
+      "step": 1800
+    },
+    {
+      "epoch": 0.0013909647105298116,
+      "grad_norm": 0.9710016250610352,
+      "learning_rate": 1.9999976249698394e-05,
+      "loss": 1.8672,
+      "step": 2000
+    },
+    {
+      "epoch": 0.0015300611815827927,
+      "grad_norm": 1.0537340641021729,
+      "learning_rate": 1.9999971249040557e-05,
+      "loss": 1.9136,
+      "step": 2200
+    },
+    {
+      "epoch": 0.0016691576526357739,
+      "grad_norm": 1.443352222442627,
+      "learning_rate": 1.9999965770993904e-05,
+      "loss": 1.8307,
+      "step": 2400
+    },
+    {
+      "epoch": 0.001808254123688755,
+      "grad_norm": 3.0568041801452637,
+      "learning_rate": 1.9999959815558703e-05,
+      "loss": 1.8695,
+      "step": 2600
+    },
+    {
+      "epoch": 0.0019473505947417361,
+      "grad_norm": 0.9355903267860413,
+      "learning_rate": 1.9999953382735232e-05,
+      "loss": 1.8503,
+      "step": 2800
+    },
+    {
+      "epoch": 0.0020864470657947173,
+      "grad_norm": 0.7553691267967224,
+      "learning_rate": 1.9999946472523805e-05,
+      "loss": 1.9489,
+      "step": 3000
+    },
+    {
+      "epoch": 0.002225543536847698,
+      "grad_norm": 0.957789957523346,
+      "learning_rate": 1.9999939084924748e-05,
+      "loss": 1.854,
+      "step": 3200
+    },
+    {
+      "epoch": 0.0023646400079006796,
+      "grad_norm": 1.4734654426574707,
+      "learning_rate": 1.999993121993841e-05,
+      "loss": 1.82,
+      "step": 3400
+    },
+    {
+      "epoch": 0.002503736478953661,
+      "grad_norm": 1.5108826160430908,
+      "learning_rate": 1.9999922877565166e-05,
+      "loss": 1.8743,
+      "step": 3600
+    },
+    {
+      "epoch": 0.002642832950006642,
+      "grad_norm": 0.7057523131370544,
+      "learning_rate": 1.9999914057805428e-05,
+      "loss": 1.9118,
+      "step": 3800
+    },
+    {
+      "epoch": 0.002781929421059623,
+      "grad_norm": 1.4599885940551758,
+      "learning_rate": 1.99999047606596e-05,
+      "loss": 1.8735,
+      "step": 4000
+    },
+    {
+      "epoch": 0.002921025892112604,
+      "grad_norm": 1.1604583263397217,
+      "learning_rate": 1.9999894986128136e-05,
+      "loss": 1.8425,
+      "step": 4200
+    },
+    {
+      "epoch": 0.0030601223631655855,
+      "grad_norm": 0.7119138240814209,
+      "learning_rate": 1.99998847342115e-05,
+      "loss": 1.8803,
+      "step": 4400
+    },
+    {
+      "epoch": 0.0031992188342185664,
+      "grad_norm": 1.1162643432617188,
+      "learning_rate": 1.999987400491018e-05,
+      "loss": 1.8082,
+      "step": 4600
+    },
+    {
+      "epoch": 0.0033383153052715477,
+      "grad_norm": 0.9169935584068298,
+      "learning_rate": 1.999986279822469e-05,
+      "loss": 1.8502,
+      "step": 4800
+    },
+    {
+      "epoch": 0.0034774117763245287,
+      "grad_norm": 0.8055661916732788,
+      "learning_rate": 1.9999851114155563e-05,
+      "loss": 1.8665,
+      "step": 5000
+    },
+    {
+      "epoch": 0.00361650824737751,
+      "grad_norm": 0.7137724757194519,
+      "learning_rate": 1.9999838952703362e-05,
+      "loss": 1.8552,
+      "step": 5200
+    },
+    {
+      "epoch": 0.003755604718430491,
+      "grad_norm": 0.6924293637275696,
+      "learning_rate": 1.9999826313868657e-05,
+      "loss": 1.8663,
+      "step": 5400
+    },
+    {
+      "epoch": 0.0038947011894834723,
+      "grad_norm": 0.9479967951774597,
+      "learning_rate": 1.9999813197652065e-05,
+      "loss": 1.8452,
+      "step": 5600
+    },
+    {
+      "epoch": 0.004033797660536453,
+      "grad_norm": 0.8143343329429626,
+      "learning_rate": 1.99997996040542e-05,
+      "loss": 1.8846,
+      "step": 5800
+    },
+    {
+      "epoch": 0.004172894131589435,
+      "grad_norm": 1.111542820930481,
+      "learning_rate": 1.999978553307572e-05,
+      "loss": 1.9686,
+      "step": 6000
+    },
+    {
+      "epoch": 0.004311990602642416,
+      "grad_norm": 0.7616419792175293,
+      "learning_rate": 1.999977098471729e-05,
+      "loss": 1.8488,
+      "step": 6200
+    },
+    {
+      "epoch": 0.004451087073695396,
+      "grad_norm": 1.4848086833953857,
+      "learning_rate": 1.999975595897961e-05,
+      "loss": 1.8701,
+      "step": 6400
+    },
+    {
+      "epoch": 0.004590183544748378,
+      "grad_norm": 1.7007120847702026,
+      "learning_rate": 1.9999740455863392e-05,
+      "loss": 1.8166,
+      "step": 6600
+    },
+    {
+      "epoch": 0.004729280015801359,
+      "grad_norm": 0.7468813061714172,
+      "learning_rate": 1.999972447536938e-05,
+      "loss": 1.8487,
+      "step": 6800
+    },
+    {
+      "epoch": 0.0048683764868543405,
+      "grad_norm": 1.2272229194641113,
+      "learning_rate": 1.9999708017498335e-05,
+      "loss": 1.8977,
+      "step": 7000
+    },
+    {
+      "epoch": 0.005007472957907322,
+      "grad_norm": 0.889249861240387,
+      "learning_rate": 1.9999691082251046e-05,
+      "loss": 1.796,
+      "step": 7200
+    },
+    {
+      "epoch": 0.005146569428960302,
+      "grad_norm": 0.8674280643463135,
+      "learning_rate": 1.9999673669628317e-05,
+      "loss": 1.8211,
+      "step": 7400
+    },
+    {
+      "epoch": 0.005285665900013284,
+      "grad_norm": 0.9621294736862183,
+      "learning_rate": 1.9999655779630983e-05,
+      "loss": 1.8682,
+      "step": 7600
+    },
+    {
+      "epoch": 0.005424762371066265,
+      "grad_norm": 0.6938799619674683,
+      "learning_rate": 1.9999637412259892e-05,
+      "loss": 1.826,
+      "step": 7800
+    },
+    {
+      "epoch": 0.005563858842119246,
+      "grad_norm": 0.9145010709762573,
+      "learning_rate": 1.9999618567515927e-05,
+      "loss": 1.8725,
+      "step": 8000
+    },
+    {
+      "epoch": 0.005702955313172227,
+      "grad_norm": 0.8468737006187439,
+      "learning_rate": 1.999959924539999e-05,
+      "loss": 1.7994,
+      "step": 8200
+    },
+    {
+      "epoch": 0.005842051784225208,
+      "grad_norm": 2.022569179534912,
+      "learning_rate": 1.9999579445912994e-05,
+      "loss": 1.8348,
+      "step": 8400
+    },
+    {
+      "epoch": 0.00598114825527819,
+      "grad_norm": 0.7492377161979675,
+      "learning_rate": 1.9999559169055893e-05,
+      "loss": 1.8122,
+      "step": 8600
+    },
+    {
+      "epoch": 0.006120244726331171,
+      "grad_norm": 1.3001569509506226,
+      "learning_rate": 1.999953841482965e-05,
+      "loss": 1.7776,
+      "step": 8800
+    },
+    {
+      "epoch": 0.006259341197384151,
+      "grad_norm": 1.238887071609497,
+      "learning_rate": 1.9999517183235256e-05,
+      "loss": 1.7954,
+      "step": 9000
+    },
+    {
+      "epoch": 0.006398437668437133,
+      "grad_norm": 0.9216433763504028,
+      "learning_rate": 1.9999495474273724e-05,
+      "loss": 1.8619,
+      "step": 9200
+    },
+    {
+      "epoch": 0.006537534139490114,
+      "grad_norm": 1.2052515745162964,
+      "learning_rate": 1.9999473287946092e-05,
+      "loss": 1.8636,
+      "step": 9400
+    },
+    {
+      "epoch": 0.0066766306105430955,
+      "grad_norm": 1.1095556020736694,
+      "learning_rate": 1.9999450624253423e-05,
+      "loss": 1.8283,
+      "step": 9600
+    },
+    {
+      "epoch": 0.006815727081596076,
+      "grad_norm": 0.681342363357544,
+      "learning_rate": 1.9999427483196793e-05,
+      "loss": 1.868,
+      "step": 9800
+    },
+    {
+      "epoch": 0.006954823552649057,
+      "grad_norm": 1.1534216403961182,
+      "learning_rate": 1.999940386477731e-05,
+      "loss": 1.8575,
+      "step": 10000
+    },
+    {
+      "epoch": 0.007093920023702039,
+      "grad_norm": 0.7867778539657593,
+      "learning_rate": 1.99993797689961e-05,
+      "loss": 1.8735,
+      "step": 10200
+    },
+    {
+      "epoch": 0.00723301649475502,
+      "grad_norm": 0.9070300459861755,
+      "learning_rate": 1.999935519585431e-05,
+      "loss": 1.817,
+      "step": 10400
+    },
+    {
+      "epoch": 0.007372112965808001,
+      "grad_norm": 0.830132782459259,
+      "learning_rate": 1.9999330145353123e-05,
+      "loss": 1.8442,
+      "step": 10600
+    },
+    {
+      "epoch": 0.007511209436860982,
+      "grad_norm": 0.6420764923095703,
+      "learning_rate": 1.9999304617493725e-05,
+      "loss": 1.8847,
+      "step": 10800
     }
   ],
   "logging_steps": 200,
+  "max_steps": 2875702,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 2,
   "save_steps": 200,
       "attributes": {}
     }
   },
+  "total_flos": 6.271390014938726e+16,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aa5aa241dd55111be21c66e31c3a9c312c22de9c6ecf5bc3d18a21ae67e9aeea
 size 6776

 version https://git-lfs.github.com/spec/v1
+oid sha256:a9904b88a607b9ec7e7c8f68db2631b21b2e901d7835fe093b513a58fc6cba17
 size 6776