MohamedAhmedAE committed on
Commit
83820be
·
verified ·
1 Parent(s): f7231df

Training in progress, step 10800, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
26
  "q_proj",
27
  "k_proj",
28
- "down_proj",
29
  "o_proj",
30
- "up_proj",
31
  "v_proj",
32
- "gate_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "down_proj",
27
  "q_proj",
28
  "k_proj",
29
+ "gate_proj",
30
  "o_proj",
 
31
  "v_proj",
32
+ "up_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a53adb1ece7e14078c5cbcd5925b731e174893ac4f79b83e75a1d118a6a16ca
3
  size 1556140392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69616034c7478cf2dfccee5c0270ffd5060d30e5019b1fedb3826dd864fdc449
3
  size 1556140392
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ecc0b6e807c2e801b966aac222c07c5a4d3aa838d101d41d5e11b3af8c1b26c4
3
  size 791682818
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32eee9332b07af811bf55ae350443625f7835082aa14f81cafe85b65410c568c
3
  size 791682818
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:133a27a96bf2028fd94eb62846a16114ede5a872ddea6198ae6b8df77a089e67
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94ff2db7929a990beccc40066cd3722e70139073f804431d949279fb773ea474
3
  size 1064
last-checkpoint/tokenizer_config.json CHANGED
@@ -2055,6 +2055,7 @@
2055
  "clean_up_tokenization_spaces": true,
2056
  "eos_token": "<|eot_id|>",
2057
  "extra_special_tokens": {},
 
2058
  "model_input_names": [
2059
  "input_ids",
2060
  "attention_mask"
@@ -2062,5 +2063,8 @@
2062
  "model_max_length": 4096,
2063
  "pad_token": "<|eot_id|>",
2064
  "padding_side": "left",
2065
- "tokenizer_class": "PreTrainedTokenizerFast"
 
 
 
2066
  }
 
2055
  "clean_up_tokenization_spaces": true,
2056
  "eos_token": "<|eot_id|>",
2057
  "extra_special_tokens": {},
2058
+ "max_length": 4096,
2059
  "model_input_names": [
2060
  "input_ids",
2061
  "attention_mask"
 
2063
  "model_max_length": 4096,
2064
  "pad_token": "<|eot_id|>",
2065
  "padding_side": "left",
2066
+ "stride": 0,
2067
+ "tokenizer_class": "PreTrainedTokenizerFast",
2068
+ "truncation_side": "right",
2069
+ "truncation_strategy": "longest_first"
2070
  }
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.0008345782458834428,
5
  "eval_steps": 500,
6
- "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -28,10 +28,367 @@
28
  "learning_rate": 1.9999991549580503e-05,
29
  "loss": 1.9425,
30
  "step": 600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  }
32
  ],
33
  "logging_steps": 200,
34
- "max_steps": 1437852,
35
  "num_input_tokens_seen": 0,
36
  "num_train_epochs": 2,
37
  "save_steps": 200,
@@ -47,7 +404,7 @@
47
  "attributes": {}
48
  }
49
  },
50
- "total_flos": 6846848392034304.0,
51
  "train_batch_size": 1,
52
  "trial_name": null,
53
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.007511209436860982,
5
  "eval_steps": 500,
6
+ "global_step": 10800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
28
  "learning_rate": 1.9999991549580503e-05,
29
  "loss": 1.9425,
30
  "step": 600
31
+ },
32
+ {
33
+ "epoch": 0.0005563858842119246,
34
+ "grad_norm": 1.1521191596984863,
35
+ "learning_rate": 1.999999622846857e-05,
36
+ "loss": 1.8664,
37
+ "step": 800
38
+ },
39
+ {
40
+ "epoch": 0.0006954823552649058,
41
+ "grad_norm": 1.6058714389801025,
42
+ "learning_rate": 1.999999409214783e-05,
43
+ "loss": 1.7909,
44
+ "step": 1000
45
+ },
46
+ {
47
+ "epoch": 0.0008345788263178869,
48
+ "grad_norm": 1.3741093873977661,
49
+ "learning_rate": 1.9999991478437182e-05,
50
+ "loss": 1.7653,
51
+ "step": 1200
52
+ },
53
+ {
54
+ "epoch": 0.0009736752973708681,
55
+ "grad_norm": 1.7162392139434814,
56
+ "learning_rate": 1.9999988387336754e-05,
57
+ "loss": 1.9238,
58
+ "step": 1400
59
+ },
60
+ {
61
+ "epoch": 0.001112771768423849,
62
+ "grad_norm": 3.3224897384643555,
63
+ "learning_rate": 1.9999984818846697e-05,
64
+ "loss": 1.8869,
65
+ "step": 1600
66
+ },
67
+ {
68
+ "epoch": 0.0012518682394768305,
69
+ "grad_norm": 1.6145182847976685,
70
+ "learning_rate": 1.999998077296718e-05,
71
+ "loss": 1.8906,
72
+ "step": 1800
73
+ },
74
+ {
75
+ "epoch": 0.0013909647105298116,
76
+ "grad_norm": 0.9710016250610352,
77
+ "learning_rate": 1.9999976249698394e-05,
78
+ "loss": 1.8672,
79
+ "step": 2000
80
+ },
81
+ {
82
+ "epoch": 0.0015300611815827927,
83
+ "grad_norm": 1.0537340641021729,
84
+ "learning_rate": 1.9999971249040557e-05,
85
+ "loss": 1.9136,
86
+ "step": 2200
87
+ },
88
+ {
89
+ "epoch": 0.0016691576526357739,
90
+ "grad_norm": 1.443352222442627,
91
+ "learning_rate": 1.9999965770993904e-05,
92
+ "loss": 1.8307,
93
+ "step": 2400
94
+ },
95
+ {
96
+ "epoch": 0.001808254123688755,
97
+ "grad_norm": 3.0568041801452637,
98
+ "learning_rate": 1.9999959815558703e-05,
99
+ "loss": 1.8695,
100
+ "step": 2600
101
+ },
102
+ {
103
+ "epoch": 0.0019473505947417361,
104
+ "grad_norm": 0.9355903267860413,
105
+ "learning_rate": 1.9999953382735232e-05,
106
+ "loss": 1.8503,
107
+ "step": 2800
108
+ },
109
+ {
110
+ "epoch": 0.0020864470657947173,
111
+ "grad_norm": 0.7553691267967224,
112
+ "learning_rate": 1.9999946472523805e-05,
113
+ "loss": 1.9489,
114
+ "step": 3000
115
+ },
116
+ {
117
+ "epoch": 0.002225543536847698,
118
+ "grad_norm": 0.957789957523346,
119
+ "learning_rate": 1.9999939084924748e-05,
120
+ "loss": 1.854,
121
+ "step": 3200
122
+ },
123
+ {
124
+ "epoch": 0.0023646400079006796,
125
+ "grad_norm": 1.4734654426574707,
126
+ "learning_rate": 1.999993121993841e-05,
127
+ "loss": 1.82,
128
+ "step": 3400
129
+ },
130
+ {
131
+ "epoch": 0.002503736478953661,
132
+ "grad_norm": 1.5108826160430908,
133
+ "learning_rate": 1.9999922877565166e-05,
134
+ "loss": 1.8743,
135
+ "step": 3600
136
+ },
137
+ {
138
+ "epoch": 0.002642832950006642,
139
+ "grad_norm": 0.7057523131370544,
140
+ "learning_rate": 1.9999914057805428e-05,
141
+ "loss": 1.9118,
142
+ "step": 3800
143
+ },
144
+ {
145
+ "epoch": 0.002781929421059623,
146
+ "grad_norm": 1.4599885940551758,
147
+ "learning_rate": 1.99999047606596e-05,
148
+ "loss": 1.8735,
149
+ "step": 4000
150
+ },
151
+ {
152
+ "epoch": 0.002921025892112604,
153
+ "grad_norm": 1.1604583263397217,
154
+ "learning_rate": 1.9999894986128136e-05,
155
+ "loss": 1.8425,
156
+ "step": 4200
157
+ },
158
+ {
159
+ "epoch": 0.0030601223631655855,
160
+ "grad_norm": 0.7119138240814209,
161
+ "learning_rate": 1.99998847342115e-05,
162
+ "loss": 1.8803,
163
+ "step": 4400
164
+ },
165
+ {
166
+ "epoch": 0.0031992188342185664,
167
+ "grad_norm": 1.1162643432617188,
168
+ "learning_rate": 1.999987400491018e-05,
169
+ "loss": 1.8082,
170
+ "step": 4600
171
+ },
172
+ {
173
+ "epoch": 0.0033383153052715477,
174
+ "grad_norm": 0.9169935584068298,
175
+ "learning_rate": 1.999986279822469e-05,
176
+ "loss": 1.8502,
177
+ "step": 4800
178
+ },
179
+ {
180
+ "epoch": 0.0034774117763245287,
181
+ "grad_norm": 0.8055661916732788,
182
+ "learning_rate": 1.9999851114155563e-05,
183
+ "loss": 1.8665,
184
+ "step": 5000
185
+ },
186
+ {
187
+ "epoch": 0.00361650824737751,
188
+ "grad_norm": 0.7137724757194519,
189
+ "learning_rate": 1.9999838952703362e-05,
190
+ "loss": 1.8552,
191
+ "step": 5200
192
+ },
193
+ {
194
+ "epoch": 0.003755604718430491,
195
+ "grad_norm": 0.6924293637275696,
196
+ "learning_rate": 1.9999826313868657e-05,
197
+ "loss": 1.8663,
198
+ "step": 5400
199
+ },
200
+ {
201
+ "epoch": 0.0038947011894834723,
202
+ "grad_norm": 0.9479967951774597,
203
+ "learning_rate": 1.9999813197652065e-05,
204
+ "loss": 1.8452,
205
+ "step": 5600
206
+ },
207
+ {
208
+ "epoch": 0.004033797660536453,
209
+ "grad_norm": 0.8143343329429626,
210
+ "learning_rate": 1.99997996040542e-05,
211
+ "loss": 1.8846,
212
+ "step": 5800
213
+ },
214
+ {
215
+ "epoch": 0.004172894131589435,
216
+ "grad_norm": 1.111542820930481,
217
+ "learning_rate": 1.999978553307572e-05,
218
+ "loss": 1.9686,
219
+ "step": 6000
220
+ },
221
+ {
222
+ "epoch": 0.004311990602642416,
223
+ "grad_norm": 0.7616419792175293,
224
+ "learning_rate": 1.999977098471729e-05,
225
+ "loss": 1.8488,
226
+ "step": 6200
227
+ },
228
+ {
229
+ "epoch": 0.004451087073695396,
230
+ "grad_norm": 1.4848086833953857,
231
+ "learning_rate": 1.999975595897961e-05,
232
+ "loss": 1.8701,
233
+ "step": 6400
234
+ },
235
+ {
236
+ "epoch": 0.004590183544748378,
237
+ "grad_norm": 1.7007120847702026,
238
+ "learning_rate": 1.9999740455863392e-05,
239
+ "loss": 1.8166,
240
+ "step": 6600
241
+ },
242
+ {
243
+ "epoch": 0.004729280015801359,
244
+ "grad_norm": 0.7468813061714172,
245
+ "learning_rate": 1.999972447536938e-05,
246
+ "loss": 1.8487,
247
+ "step": 6800
248
+ },
249
+ {
250
+ "epoch": 0.0048683764868543405,
251
+ "grad_norm": 1.2272229194641113,
252
+ "learning_rate": 1.9999708017498335e-05,
253
+ "loss": 1.8977,
254
+ "step": 7000
255
+ },
256
+ {
257
+ "epoch": 0.005007472957907322,
258
+ "grad_norm": 0.889249861240387,
259
+ "learning_rate": 1.9999691082251046e-05,
260
+ "loss": 1.796,
261
+ "step": 7200
262
+ },
263
+ {
264
+ "epoch": 0.005146569428960302,
265
+ "grad_norm": 0.8674280643463135,
266
+ "learning_rate": 1.9999673669628317e-05,
267
+ "loss": 1.8211,
268
+ "step": 7400
269
+ },
270
+ {
271
+ "epoch": 0.005285665900013284,
272
+ "grad_norm": 0.9621294736862183,
273
+ "learning_rate": 1.9999655779630983e-05,
274
+ "loss": 1.8682,
275
+ "step": 7600
276
+ },
277
+ {
278
+ "epoch": 0.005424762371066265,
279
+ "grad_norm": 0.6938799619674683,
280
+ "learning_rate": 1.9999637412259892e-05,
281
+ "loss": 1.826,
282
+ "step": 7800
283
+ },
284
+ {
285
+ "epoch": 0.005563858842119246,
286
+ "grad_norm": 0.9145010709762573,
287
+ "learning_rate": 1.9999618567515927e-05,
288
+ "loss": 1.8725,
289
+ "step": 8000
290
+ },
291
+ {
292
+ "epoch": 0.005702955313172227,
293
+ "grad_norm": 0.8468737006187439,
294
+ "learning_rate": 1.999959924539999e-05,
295
+ "loss": 1.7994,
296
+ "step": 8200
297
+ },
298
+ {
299
+ "epoch": 0.005842051784225208,
300
+ "grad_norm": 2.022569179534912,
301
+ "learning_rate": 1.9999579445912994e-05,
302
+ "loss": 1.8348,
303
+ "step": 8400
304
+ },
305
+ {
306
+ "epoch": 0.00598114825527819,
307
+ "grad_norm": 0.7492377161979675,
308
+ "learning_rate": 1.9999559169055893e-05,
309
+ "loss": 1.8122,
310
+ "step": 8600
311
+ },
312
+ {
313
+ "epoch": 0.006120244726331171,
314
+ "grad_norm": 1.3001569509506226,
315
+ "learning_rate": 1.999953841482965e-05,
316
+ "loss": 1.7776,
317
+ "step": 8800
318
+ },
319
+ {
320
+ "epoch": 0.006259341197384151,
321
+ "grad_norm": 1.238887071609497,
322
+ "learning_rate": 1.9999517183235256e-05,
323
+ "loss": 1.7954,
324
+ "step": 9000
325
+ },
326
+ {
327
+ "epoch": 0.006398437668437133,
328
+ "grad_norm": 0.9216433763504028,
329
+ "learning_rate": 1.9999495474273724e-05,
330
+ "loss": 1.8619,
331
+ "step": 9200
332
+ },
333
+ {
334
+ "epoch": 0.006537534139490114,
335
+ "grad_norm": 1.2052515745162964,
336
+ "learning_rate": 1.9999473287946092e-05,
337
+ "loss": 1.8636,
338
+ "step": 9400
339
+ },
340
+ {
341
+ "epoch": 0.0066766306105430955,
342
+ "grad_norm": 1.1095556020736694,
343
+ "learning_rate": 1.9999450624253423e-05,
344
+ "loss": 1.8283,
345
+ "step": 9600
346
+ },
347
+ {
348
+ "epoch": 0.006815727081596076,
349
+ "grad_norm": 0.681342363357544,
350
+ "learning_rate": 1.9999427483196793e-05,
351
+ "loss": 1.868,
352
+ "step": 9800
353
+ },
354
+ {
355
+ "epoch": 0.006954823552649057,
356
+ "grad_norm": 1.1534216403961182,
357
+ "learning_rate": 1.999940386477731e-05,
358
+ "loss": 1.8575,
359
+ "step": 10000
360
+ },
361
+ {
362
+ "epoch": 0.007093920023702039,
363
+ "grad_norm": 0.7867778539657593,
364
+ "learning_rate": 1.99993797689961e-05,
365
+ "loss": 1.8735,
366
+ "step": 10200
367
+ },
368
+ {
369
+ "epoch": 0.00723301649475502,
370
+ "grad_norm": 0.9070300459861755,
371
+ "learning_rate": 1.999935519585431e-05,
372
+ "loss": 1.817,
373
+ "step": 10400
374
+ },
375
+ {
376
+ "epoch": 0.007372112965808001,
377
+ "grad_norm": 0.830132782459259,
378
+ "learning_rate": 1.9999330145353123e-05,
379
+ "loss": 1.8442,
380
+ "step": 10600
381
+ },
382
+ {
383
+ "epoch": 0.007511209436860982,
384
+ "grad_norm": 0.6420764923095703,
385
+ "learning_rate": 1.9999304617493725e-05,
386
+ "loss": 1.8847,
387
+ "step": 10800
388
  }
389
  ],
390
  "logging_steps": 200,
391
+ "max_steps": 2875702,
392
  "num_input_tokens_seen": 0,
393
  "num_train_epochs": 2,
394
  "save_steps": 200,
 
404
  "attributes": {}
405
  }
406
  },
407
+ "total_flos": 6.271390014938726e+16,
408
  "train_batch_size": 1,
409
  "trial_name": null,
410
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa5aa241dd55111be21c66e31c3a9c312c22de9c6ecf5bc3d18a21ae67e9aeea
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9904b88a607b9ec7e7c8f68db2631b21b2e901d7835fe093b513a58fc6cba17
3
  size 6776