SystemAdmin123 commited on
Commit
66506dc
·
verified ·
1 Parent(s): 8399f15

Training in progress, step 100, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:463e50976fdd4523d72dfa1f9dde00e7d2fb94044b71ff95fa42ebac380383f9
3
  size 136062744
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08823db6e7d3f7ecebdb39fa2abc5545b00f017474fd1d822068d0885a26f95d
3
  size 136062744
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0622f360da035ed6f191ceef1a70160b2b4f027c1bb245098847e56aae470e88
3
  size 138234066
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e7d7dc966642f5cf110823004ec8a086c5b5536e976261bf7da9bb4f9cbd57d
3
  size 138234066
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da2a7e5837e8166f34d69c47cbcb9c5f754b8d9aaee67ec69e6f2fe466d72356
3
- size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4651e32e118f1ea1a8e26dfbbe64298593e12e6a71bcd36cb77f04f86d3f86d
3
+ size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd3708ae31f1069fc45ac5848423f03f6df0cd0e16b6138df98e3ba73d336476
3
- size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0c1eba909fbb51daca773a25c075f182b4096aff21c9b4ff19dbada2080ac99
3
+ size 15024
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28adb9f06e220aefdc723ea4380a84d42b8bfb87cc53ce65859d55ce1876f51c
3
+ size 15024
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:423c49ed521f6986d20d8b29112b383f4b0f3f2e228084ef82c2ad7dcd5d1de8
3
+ size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ad54995b081fae25638228c5d9c8f38ca277e5c5ad00bc3e49897b543f84405
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a60c7d771c1fd156acee762fba03c724cb41829a3f71df370ecd1d20b134982
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,543 +1,97 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 46.15384615384615,
5
- "eval_steps": 50,
6
- "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.07692307692307693,
13
- "eval_loss": 3.9168343544006348,
14
- "eval_runtime": 5.6922,
15
- "eval_samples_per_second": 263.695,
16
- "eval_steps_per_second": 4.216,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.7692307692307693,
21
- "grad_norm": 1.3671875,
22
- "learning_rate": 6.666666666666667e-05,
23
- "loss": 3.3633,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 1.5384615384615383,
28
- "grad_norm": 1.09375,
29
- "learning_rate": 0.00013333333333333334,
30
- "loss": 3.1311,
31
  "step": 20
32
  },
33
  {
34
- "epoch": 2.3076923076923075,
35
- "grad_norm": 0.8203125,
36
- "learning_rate": 0.0002,
37
- "loss": 2.9153,
38
  "step": 30
39
  },
40
  {
41
- "epoch": 3.076923076923077,
42
- "grad_norm": 0.7265625,
43
- "learning_rate": 0.00019984815164333163,
44
- "loss": 2.7539,
45
  "step": 40
46
  },
47
  {
48
- "epoch": 3.8461538461538463,
49
- "grad_norm": 0.71875,
50
- "learning_rate": 0.00019939306773179497,
51
- "loss": 2.5978,
52
- "step": 50
53
- },
54
- {
55
- "epoch": 3.8461538461538463,
56
- "eval_loss": 2.814912796020508,
57
- "eval_runtime": 5.762,
58
- "eval_samples_per_second": 260.499,
59
- "eval_steps_per_second": 4.165,
60
  "step": 50
61
  },
62
- {
63
- "epoch": 4.615384615384615,
64
- "grad_norm": 0.71484375,
65
- "learning_rate": 0.00019863613034027224,
66
- "loss": 2.4698,
67
- "step": 60
68
- },
69
- {
70
- "epoch": 5.384615384615385,
71
- "grad_norm": 0.7734375,
72
- "learning_rate": 0.00019757963826274357,
73
- "loss": 2.3909,
74
- "step": 70
75
- },
76
- {
77
- "epoch": 6.153846153846154,
78
- "grad_norm": 0.765625,
79
- "learning_rate": 0.00019622680003092503,
80
- "loss": 2.2723,
81
- "step": 80
82
- },
83
- {
84
- "epoch": 6.923076923076923,
85
- "grad_norm": 0.796875,
86
- "learning_rate": 0.00019458172417006347,
87
- "loss": 2.1816,
88
- "step": 90
89
- },
90
- {
91
- "epoch": 7.6923076923076925,
92
- "grad_norm": 0.8203125,
93
- "learning_rate": 0.00019264940672148018,
94
- "loss": 2.0808,
95
- "step": 100
96
- },
97
- {
98
- "epoch": 7.6923076923076925,
99
- "eval_loss": 2.966378927230835,
100
- "eval_runtime": 5.6204,
101
- "eval_samples_per_second": 267.064,
102
- "eval_steps_per_second": 4.27,
103
- "step": 100
104
- },
105
- {
106
- "epoch": 8.461538461538462,
107
- "grad_norm": 0.83984375,
108
- "learning_rate": 0.00019043571606975777,
109
- "loss": 1.9902,
110
- "step": 110
111
- },
112
- {
113
- "epoch": 9.23076923076923,
114
- "grad_norm": 0.99609375,
115
- "learning_rate": 0.0001879473751206489,
116
- "loss": 1.9033,
117
- "step": 120
118
- },
119
  {
120
  "epoch": 10.0,
121
- "grad_norm": 0.83984375,
122
- "learning_rate": 0.00018519194088383273,
123
- "loss": 1.8173,
124
- "step": 130
125
- },
126
- {
127
- "epoch": 10.76923076923077,
128
- "grad_norm": 0.87890625,
129
- "learning_rate": 0.0001821777815225245,
130
- "loss": 1.7058,
131
- "step": 140
132
- },
133
- {
134
- "epoch": 11.538461538461538,
135
- "grad_norm": 0.9296875,
136
- "learning_rate": 0.00017891405093963938,
137
- "loss": 1.6294,
138
- "step": 150
139
- },
140
- {
141
- "epoch": 11.538461538461538,
142
- "eval_loss": 3.233675003051758,
143
- "eval_runtime": 5.5692,
144
- "eval_samples_per_second": 269.517,
145
- "eval_steps_per_second": 4.309,
146
- "step": 150
147
- },
148
- {
149
- "epoch": 12.307692307692308,
150
- "grad_norm": 0.9921875,
151
- "learning_rate": 0.00017541066097768963,
152
- "loss": 1.5563,
153
- "step": 160
154
- },
155
- {
156
- "epoch": 13.076923076923077,
157
- "grad_norm": 1.0546875,
158
- "learning_rate": 0.00017167825131684513,
159
- "loss": 1.4865,
160
- "step": 170
161
- },
162
- {
163
- "epoch": 13.846153846153847,
164
- "grad_norm": 0.94921875,
165
- "learning_rate": 0.00016772815716257412,
166
- "loss": 1.3938,
167
- "step": 180
168
- },
169
- {
170
- "epoch": 14.615384615384615,
171
- "grad_norm": 0.90625,
172
- "learning_rate": 0.00016357237482099684,
173
- "loss": 1.3247,
174
- "step": 190
175
- },
176
- {
177
- "epoch": 15.384615384615385,
178
- "grad_norm": 0.99609375,
179
- "learning_rate": 0.00015922352526649803,
180
- "loss": 1.2699,
181
- "step": 200
182
- },
183
- {
184
- "epoch": 15.384615384615385,
185
- "eval_loss": 3.521737813949585,
186
- "eval_runtime": 5.6468,
187
- "eval_samples_per_second": 265.815,
188
- "eval_steps_per_second": 4.25,
189
- "step": 200
190
- },
191
- {
192
- "epoch": 16.153846153846153,
193
- "grad_norm": 0.95703125,
194
- "learning_rate": 0.00015469481581224272,
195
- "loss": 1.2117,
196
- "step": 210
197
- },
198
- {
199
- "epoch": 16.923076923076923,
200
- "grad_norm": 1.0390625,
201
- "learning_rate": 0.00015000000000000001,
202
- "loss": 1.1498,
203
- "step": 220
204
- },
205
- {
206
- "epoch": 17.692307692307693,
207
- "grad_norm": 1.0078125,
208
- "learning_rate": 0.00014515333583108896,
209
- "loss": 1.0864,
210
- "step": 230
211
- },
212
- {
213
- "epoch": 18.46153846153846,
214
- "grad_norm": 0.81640625,
215
- "learning_rate": 0.00014016954246529696,
216
- "loss": 1.0441,
217
- "step": 240
218
- },
219
- {
220
- "epoch": 19.23076923076923,
221
- "grad_norm": 0.98828125,
222
- "learning_rate": 0.00013506375551927547,
223
- "loss": 1.0092,
224
- "step": 250
225
- },
226
- {
227
- "epoch": 19.23076923076923,
228
- "eval_loss": 3.726165533065796,
229
- "eval_runtime": 5.8207,
230
- "eval_samples_per_second": 257.872,
231
- "eval_steps_per_second": 4.123,
232
- "step": 250
233
- },
234
- {
235
- "epoch": 20.0,
236
- "grad_norm": 0.8359375,
237
- "learning_rate": 0.00012985148110016947,
238
- "loss": 0.9671,
239
- "step": 260
240
- },
241
- {
242
- "epoch": 20.76923076923077,
243
- "grad_norm": 0.8203125,
244
- "learning_rate": 0.00012454854871407994,
245
- "loss": 0.9182,
246
- "step": 270
247
- },
248
- {
249
- "epoch": 21.53846153846154,
250
- "grad_norm": 0.796875,
251
- "learning_rate": 0.00011917106319237386,
252
- "loss": 0.8857,
253
- "step": 280
254
- },
255
- {
256
- "epoch": 22.307692307692307,
257
- "grad_norm": 0.79296875,
258
- "learning_rate": 0.00011373535578184082,
259
- "loss": 0.8623,
260
- "step": 290
261
- },
262
- {
263
- "epoch": 23.076923076923077,
264
- "grad_norm": 0.7421875,
265
- "learning_rate": 0.00010825793454723325,
266
- "loss": 0.8392,
267
- "step": 300
268
- },
269
- {
270
- "epoch": 23.076923076923077,
271
- "eval_loss": 3.868284225463867,
272
- "eval_runtime": 5.7191,
273
- "eval_samples_per_second": 262.455,
274
- "eval_steps_per_second": 4.196,
275
- "step": 300
276
- },
277
- {
278
- "epoch": 23.846153846153847,
279
- "grad_norm": 0.75,
280
- "learning_rate": 0.00010275543423681621,
281
- "loss": 0.8103,
282
- "step": 310
283
- },
284
- {
285
- "epoch": 24.615384615384617,
286
- "grad_norm": 0.75390625,
287
- "learning_rate": 9.724456576318381e-05,
288
- "loss": 0.7828,
289
- "step": 320
290
- },
291
- {
292
- "epoch": 25.384615384615383,
293
- "grad_norm": 0.7265625,
294
- "learning_rate": 9.174206545276677e-05,
295
- "loss": 0.7749,
296
- "step": 330
297
- },
298
- {
299
- "epoch": 26.153846153846153,
300
- "grad_norm": 0.74609375,
301
- "learning_rate": 8.626464421815919e-05,
302
- "loss": 0.7574,
303
- "step": 340
304
- },
305
- {
306
- "epoch": 26.923076923076923,
307
- "grad_norm": 0.6875,
308
- "learning_rate": 8.082893680762619e-05,
309
- "loss": 0.7428,
310
- "step": 350
311
- },
312
- {
313
- "epoch": 26.923076923076923,
314
- "eval_loss": 3.9434773921966553,
315
- "eval_runtime": 5.6325,
316
- "eval_samples_per_second": 266.49,
317
- "eval_steps_per_second": 4.261,
318
- "step": 350
319
- },
320
- {
321
- "epoch": 27.692307692307693,
322
- "grad_norm": 0.6953125,
323
  "learning_rate": 7.54514512859201e-05,
324
- "loss": 0.7298,
325
- "step": 360
326
- },
327
- {
328
- "epoch": 28.46153846153846,
329
- "grad_norm": 0.66015625,
330
- "learning_rate": 7.014851889983057e-05,
331
- "loss": 0.7167,
332
- "step": 370
333
- },
334
- {
335
- "epoch": 29.23076923076923,
336
- "grad_norm": 0.65625,
337
- "learning_rate": 6.493624448072457e-05,
338
- "loss": 0.7147,
339
- "step": 380
340
- },
341
- {
342
- "epoch": 30.0,
343
- "grad_norm": 0.69140625,
344
- "learning_rate": 5.983045753470308e-05,
345
- "loss": 0.7019,
346
- "step": 390
347
- },
348
- {
349
- "epoch": 30.76923076923077,
350
- "grad_norm": 0.66796875,
351
- "learning_rate": 5.484666416891109e-05,
352
- "loss": 0.6952,
353
- "step": 400
354
- },
355
- {
356
- "epoch": 30.76923076923077,
357
- "eval_loss": 3.985978841781616,
358
- "eval_runtime": 5.6438,
359
- "eval_samples_per_second": 265.955,
360
- "eval_steps_per_second": 4.252,
361
- "step": 400
362
- },
363
- {
364
- "epoch": 31.53846153846154,
365
- "grad_norm": 0.6484375,
366
- "learning_rate": 5.000000000000002e-05,
367
- "loss": 0.6901,
368
- "step": 410
369
  },
370
  {
371
- "epoch": 32.30769230769231,
372
- "grad_norm": 0.640625,
373
  "learning_rate": 4.530518418775733e-05,
374
- "loss": 0.685,
375
- "step": 420
376
- },
377
- {
378
- "epoch": 33.07692307692308,
379
- "grad_norm": 0.62890625,
380
- "learning_rate": 4.077647473350201e-05,
381
- "loss": 0.6851,
382
- "step": 430
383
- },
384
- {
385
- "epoch": 33.84615384615385,
386
- "grad_norm": 0.62109375,
387
- "learning_rate": 3.642762517900322e-05,
388
- "loss": 0.6782,
389
- "step": 440
390
- },
391
- {
392
- "epoch": 34.61538461538461,
393
- "grad_norm": 0.6171875,
394
- "learning_rate": 3.227184283742591e-05,
395
- "loss": 0.6762,
396
- "step": 450
397
- },
398
- {
399
- "epoch": 34.61538461538461,
400
- "eval_loss": 3.998966932296753,
401
- "eval_runtime": 5.7156,
402
- "eval_samples_per_second": 262.613,
403
- "eval_steps_per_second": 4.199,
404
- "step": 450
405
- },
406
- {
407
- "epoch": 35.38461538461539,
408
- "grad_norm": 0.62109375,
409
- "learning_rate": 2.8321748683154893e-05,
410
- "loss": 0.6742,
411
- "step": 460
412
- },
413
- {
414
- "epoch": 36.15384615384615,
415
- "grad_norm": 0.6171875,
416
- "learning_rate": 2.4589339022310386e-05,
417
- "loss": 0.674,
418
- "step": 470
419
  },
420
  {
421
- "epoch": 36.92307692307692,
422
- "grad_norm": 0.62109375,
423
  "learning_rate": 2.1085949060360654e-05,
424
- "loss": 0.6728,
425
- "step": 480
426
- },
427
- {
428
- "epoch": 37.69230769230769,
429
- "grad_norm": 0.6171875,
430
- "learning_rate": 1.7822218477475494e-05,
431
- "loss": 0.6681,
432
- "step": 490
433
- },
434
- {
435
- "epoch": 38.46153846153846,
436
- "grad_norm": 0.62109375,
437
- "learning_rate": 1.4808059116167305e-05,
438
- "loss": 0.6739,
439
- "step": 500
440
- },
441
- {
442
- "epoch": 38.46153846153846,
443
- "eval_loss": 4.016704559326172,
444
- "eval_runtime": 5.9697,
445
- "eval_samples_per_second": 251.437,
446
- "eval_steps_per_second": 4.02,
447
- "step": 500
448
- },
449
- {
450
- "epoch": 39.23076923076923,
451
- "grad_norm": 0.61328125,
452
- "learning_rate": 1.2052624879351104e-05,
453
- "loss": 0.6685,
454
- "step": 510
455
- },
456
- {
457
- "epoch": 40.0,
458
- "grad_norm": 0.625,
459
- "learning_rate": 9.564283930242257e-06,
460
- "loss": 0.6697,
461
- "step": 520
462
- },
463
- {
464
- "epoch": 40.76923076923077,
465
- "grad_norm": 0.60546875,
466
- "learning_rate": 7.350593278519824e-06,
467
- "loss": 0.6691,
468
- "step": 530
469
  },
470
  {
471
- "epoch": 41.53846153846154,
472
- "grad_norm": 0.609375,
473
  "learning_rate": 5.418275829936537e-06,
474
- "loss": 0.6709,
475
- "step": 540
476
- },
477
- {
478
- "epoch": 42.30769230769231,
479
- "grad_norm": 0.62109375,
480
- "learning_rate": 3.7731999690749585e-06,
481
- "loss": 0.6691,
482
- "step": 550
483
- },
484
- {
485
- "epoch": 42.30769230769231,
486
- "eval_loss": 4.020811080932617,
487
- "eval_runtime": 5.6558,
488
- "eval_samples_per_second": 265.392,
489
- "eval_steps_per_second": 4.243,
490
- "step": 550
491
- },
492
- {
493
- "epoch": 43.07692307692308,
494
- "grad_norm": 0.61328125,
495
- "learning_rate": 2.420361737256438e-06,
496
- "loss": 0.6671,
497
- "step": 560
498
- },
499
- {
500
- "epoch": 43.84615384615385,
501
- "grad_norm": 0.62890625,
502
- "learning_rate": 1.3638696597277679e-06,
503
- "loss": 0.6683,
504
- "step": 570
505
- },
506
- {
507
- "epoch": 44.61538461538461,
508
- "grad_norm": 0.59375,
509
- "learning_rate": 6.069322682050516e-07,
510
- "loss": 0.6714,
511
- "step": 580
512
- },
513
- {
514
- "epoch": 45.38461538461539,
515
- "grad_norm": 0.6015625,
516
- "learning_rate": 1.518483566683826e-07,
517
- "loss": 0.6695,
518
- "step": 590
519
  },
520
  {
521
- "epoch": 46.15384615384615,
522
- "grad_norm": 0.62109375,
523
  "learning_rate": 0.0,
524
- "loss": 0.6667,
525
- "step": 600
526
- },
527
- {
528
- "epoch": 46.15384615384615,
529
- "eval_loss": 4.010259628295898,
530
- "eval_runtime": 5.6787,
531
- "eval_samples_per_second": 264.32,
532
- "eval_steps_per_second": 4.226,
533
- "step": 600
534
  }
535
  ],
536
  "logging_steps": 10,
537
- "max_steps": 600,
538
  "num_input_tokens_seen": 0,
539
- "num_train_epochs": 47,
540
- "save_steps": 50,
541
  "stateful_callbacks": {
542
  "TrainerControl": {
543
  "args": {
@@ -550,7 +104,7 @@
550
  "attributes": {}
551
  }
552
  },
553
- "total_flos": 2.05042678235136e+16,
554
  "train_batch_size": 32,
555
  "trial_name": null,
556
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 16.666666666666668,
5
+ "eval_steps": 200,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.16666666666666666,
13
+ "eval_loss": 3.9322755336761475,
14
+ "eval_runtime": 2.5681,
15
+ "eval_samples_per_second": 584.476,
16
+ "eval_steps_per_second": 4.673,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 1.6666666666666665,
21
+ "grad_norm": 1.1328125,
22
+ "learning_rate": 0.00019863613034027224,
23
+ "loss": 3.2122,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 3.3333333333333335,
28
+ "grad_norm": 0.671875,
29
+ "learning_rate": 0.0001879473751206489,
30
+ "loss": 2.8046,
31
  "step": 20
32
  },
33
  {
34
+ "epoch": 5.0,
35
+ "grad_norm": 0.5078125,
36
+ "learning_rate": 0.00016772815716257412,
37
+ "loss": 2.6022,
38
  "step": 30
39
  },
40
  {
41
+ "epoch": 6.666666666666667,
42
+ "grad_norm": 0.482421875,
43
+ "learning_rate": 0.00014016954246529696,
44
+ "loss": 2.4584,
45
  "step": 40
46
  },
47
  {
48
+ "epoch": 8.333333333333334,
49
+ "grad_norm": 0.439453125,
50
+ "learning_rate": 0.00010825793454723325,
51
+ "loss": 2.3679,
 
 
 
 
 
 
 
 
52
  "step": 50
53
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  {
55
  "epoch": 10.0,
56
+ "grad_norm": 0.423828125,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  "learning_rate": 7.54514512859201e-05,
58
+ "loss": 2.3013,
59
+ "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  },
61
  {
62
+ "epoch": 11.666666666666666,
63
+ "grad_norm": 0.40625,
64
  "learning_rate": 4.530518418775733e-05,
65
+ "loss": 2.2634,
66
+ "step": 70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  },
68
  {
69
+ "epoch": 13.333333333333334,
70
+ "grad_norm": 0.392578125,
71
  "learning_rate": 2.1085949060360654e-05,
72
+ "loss": 2.252,
73
+ "step": 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  },
75
  {
76
+ "epoch": 15.0,
77
+ "grad_norm": 0.392578125,
78
  "learning_rate": 5.418275829936537e-06,
79
+ "loss": 2.2463,
80
+ "step": 90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  },
82
  {
83
+ "epoch": 16.666666666666668,
84
+ "grad_norm": 0.396484375,
85
  "learning_rate": 0.0,
86
+ "loss": 2.2461,
87
+ "step": 100
 
 
 
 
 
 
 
 
88
  }
89
  ],
90
  "logging_steps": 10,
91
+ "max_steps": 100,
92
  "num_input_tokens_seen": 0,
93
+ "num_train_epochs": 17,
94
+ "save_steps": 200,
95
  "stateful_callbacks": {
96
  "TrainerControl": {
97
  "args": {
 
104
  "attributes": {}
105
  }
106
  },
107
+ "total_flos": 6834755941171200.0,
108
  "train_batch_size": 32,
109
  "trial_name": null,
110
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5307f347a34c2795320714166ddd7cfa8a75c74197145a984dc8fe0048273f0e
3
  size 6840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c99f76655201d3a5e646acd9bbe970d07860ce4d230a73ef72b05ccbd756ed49
3
  size 6840