SystemAdmin123 commited on
Commit
7baed21
·
verified ·
1 Parent(s): aaa577d

Training in progress, step 100, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da6a17dd4ebabed24d872d4606102003c0e657825096a008c1ebd2fc857ec3e7
3
  size 2066752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25a257c5b2e34eda80e3d84f1a8cc4247ba163d63b057609915a20d5c03487fe
3
  size 2066752
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:272e87f837b0a6554425cf17e2bd0c25c2fd014cc81e381c4c2ca0d5dbb67931
3
  size 2162798
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:413c14a29fcea5318ab66a5306fbe153286fc64053bc776aa693b9d28710605a
3
  size 2162798
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da2a7e5837e8166f34d69c47cbcb9c5f754b8d9aaee67ec69e6f2fe466d72356
3
- size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4651e32e118f1ea1a8e26dfbbe64298593e12e6a71bcd36cb77f04f86d3f86d
3
+ size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd3708ae31f1069fc45ac5848423f03f6df0cd0e16b6138df98e3ba73d336476
3
- size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0c1eba909fbb51daca773a25c075f182b4096aff21c9b4ff19dbada2080ac99
3
+ size 15024
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28adb9f06e220aefdc723ea4380a84d42b8bfb87cc53ce65859d55ce1876f51c
3
+ size 15024
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:423c49ed521f6986d20d8b29112b383f4b0f3f2e228084ef82c2ad7dcd5d1de8
3
+ size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ad54995b081fae25638228c5d9c8f38ca277e5c5ad00bc3e49897b543f84405
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a60c7d771c1fd156acee762fba03c724cb41829a3f71df370ecd1d20b134982
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,543 +1,97 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 46.15384615384615,
5
- "eval_steps": 50,
6
- "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.07692307692307693,
13
- "eval_loss": 10.376392364501953,
14
- "eval_runtime": 5.3202,
15
- "eval_samples_per_second": 282.131,
16
- "eval_steps_per_second": 4.511,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.7692307692307693,
21
- "grad_norm": 0.09619140625,
22
- "learning_rate": 6.666666666666667e-05,
23
- "loss": 10.378,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 1.5384615384615383,
28
- "grad_norm": 0.09716796875,
29
- "learning_rate": 0.00013333333333333334,
30
- "loss": 10.3754,
31
  "step": 20
32
  },
33
  {
34
- "epoch": 2.3076923076923075,
35
- "grad_norm": 0.11279296875,
36
- "learning_rate": 0.0002,
37
- "loss": 10.3683,
38
  "step": 30
39
  },
40
  {
41
- "epoch": 3.076923076923077,
42
- "grad_norm": 0.201171875,
43
- "learning_rate": 0.00019984815164333163,
44
- "loss": 10.3522,
45
- "step": 40
46
- },
47
- {
48
- "epoch": 3.8461538461538463,
49
  "grad_norm": 0.3203125,
50
- "learning_rate": 0.00019939306773179497,
51
- "loss": 10.3159,
52
- "step": 50
53
- },
54
- {
55
- "epoch": 3.8461538461538463,
56
- "eval_loss": 10.285228729248047,
57
- "eval_runtime": 5.2726,
58
- "eval_samples_per_second": 284.681,
59
- "eval_steps_per_second": 4.552,
60
- "step": 50
61
- },
62
- {
63
- "epoch": 4.615384615384615,
64
- "grad_norm": 0.32421875,
65
- "learning_rate": 0.00019863613034027224,
66
- "loss": 10.2518,
67
- "step": 60
68
- },
69
- {
70
- "epoch": 5.384615384615385,
71
- "grad_norm": 0.326171875,
72
- "learning_rate": 0.00019757963826274357,
73
- "loss": 10.1828,
74
- "step": 70
75
- },
76
- {
77
- "epoch": 6.153846153846154,
78
- "grad_norm": 0.330078125,
79
- "learning_rate": 0.00019622680003092503,
80
- "loss": 10.1177,
81
- "step": 80
82
- },
83
- {
84
- "epoch": 6.923076923076923,
85
- "grad_norm": 0.33203125,
86
- "learning_rate": 0.00019458172417006347,
87
- "loss": 10.0565,
88
- "step": 90
89
- },
90
- {
91
- "epoch": 7.6923076923076925,
92
- "grad_norm": 0.341796875,
93
- "learning_rate": 0.00019264940672148018,
94
- "loss": 9.998,
95
- "step": 100
96
- },
97
- {
98
- "epoch": 7.6923076923076925,
99
- "eval_loss": 9.973793029785156,
100
- "eval_runtime": 5.2822,
101
- "eval_samples_per_second": 284.164,
102
- "eval_steps_per_second": 4.544,
103
- "step": 100
104
- },
105
- {
106
- "epoch": 8.461538461538462,
107
- "grad_norm": 0.345703125,
108
- "learning_rate": 0.00019043571606975777,
109
- "loss": 9.942,
110
- "step": 110
111
- },
112
- {
113
- "epoch": 9.23076923076923,
114
- "grad_norm": 0.34765625,
115
- "learning_rate": 0.0001879473751206489,
116
- "loss": 9.8887,
117
- "step": 120
118
- },
119
- {
120
- "epoch": 10.0,
121
- "grad_norm": 0.349609375,
122
- "learning_rate": 0.00018519194088383273,
123
- "loss": 9.836,
124
- "step": 130
125
- },
126
- {
127
- "epoch": 10.76923076923077,
128
- "grad_norm": 0.357421875,
129
- "learning_rate": 0.0001821777815225245,
130
- "loss": 9.7845,
131
- "step": 140
132
- },
133
- {
134
- "epoch": 11.538461538461538,
135
- "grad_norm": 0.361328125,
136
- "learning_rate": 0.00017891405093963938,
137
- "loss": 9.7359,
138
- "step": 150
139
- },
140
- {
141
- "epoch": 11.538461538461538,
142
- "eval_loss": 9.719038963317871,
143
- "eval_runtime": 5.2736,
144
- "eval_samples_per_second": 284.626,
145
- "eval_steps_per_second": 4.551,
146
- "step": 150
147
- },
148
- {
149
- "epoch": 12.307692307692308,
150
- "grad_norm": 0.369140625,
151
- "learning_rate": 0.00017541066097768963,
152
- "loss": 9.688,
153
- "step": 160
154
- },
155
- {
156
- "epoch": 13.076923076923077,
157
- "grad_norm": 0.3671875,
158
- "learning_rate": 0.00017167825131684513,
159
- "loss": 9.6416,
160
- "step": 170
161
- },
162
- {
163
- "epoch": 13.846153846153847,
164
- "grad_norm": 0.375,
165
- "learning_rate": 0.00016772815716257412,
166
- "loss": 9.597,
167
- "step": 180
168
- },
169
- {
170
- "epoch": 14.615384615384615,
171
- "grad_norm": 0.380859375,
172
- "learning_rate": 0.00016357237482099684,
173
- "loss": 9.5534,
174
- "step": 190
175
- },
176
- {
177
- "epoch": 15.384615384615385,
178
- "grad_norm": 0.37890625,
179
- "learning_rate": 0.00015922352526649803,
180
- "loss": 9.5151,
181
- "step": 200
182
- },
183
- {
184
- "epoch": 15.384615384615385,
185
- "eval_loss": 9.504176139831543,
186
- "eval_runtime": 5.2799,
187
- "eval_samples_per_second": 284.283,
188
- "eval_steps_per_second": 4.545,
189
- "step": 200
190
- },
191
- {
192
- "epoch": 16.153846153846153,
193
- "grad_norm": 0.3828125,
194
- "learning_rate": 0.00015469481581224272,
195
- "loss": 9.4734,
196
- "step": 210
197
- },
198
- {
199
- "epoch": 16.923076923076923,
200
- "grad_norm": 0.388671875,
201
- "learning_rate": 0.00015000000000000001,
202
- "loss": 9.4381,
203
- "step": 220
204
- },
205
- {
206
- "epoch": 17.692307692307693,
207
- "grad_norm": 0.39453125,
208
- "learning_rate": 0.00014515333583108896,
209
- "loss": 9.4021,
210
- "step": 230
211
- },
212
- {
213
- "epoch": 18.46153846153846,
214
- "grad_norm": 0.396484375,
215
  "learning_rate": 0.00014016954246529696,
216
- "loss": 9.3725,
217
- "step": 240
218
- },
219
- {
220
- "epoch": 19.23076923076923,
221
- "grad_norm": 0.396484375,
222
- "learning_rate": 0.00013506375551927547,
223
- "loss": 9.3407,
224
- "step": 250
225
- },
226
- {
227
- "epoch": 19.23076923076923,
228
- "eval_loss": 9.341134071350098,
229
- "eval_runtime": 5.9586,
230
- "eval_samples_per_second": 251.904,
231
- "eval_steps_per_second": 4.028,
232
- "step": 250
233
- },
234
- {
235
- "epoch": 20.0,
236
- "grad_norm": 0.400390625,
237
- "learning_rate": 0.00012985148110016947,
238
- "loss": 9.3153,
239
- "step": 260
240
- },
241
- {
242
- "epoch": 20.76923076923077,
243
- "grad_norm": 0.40234375,
244
- "learning_rate": 0.00012454854871407994,
245
- "loss": 9.2905,
246
- "step": 270
247
- },
248
- {
249
- "epoch": 21.53846153846154,
250
- "grad_norm": 0.408203125,
251
- "learning_rate": 0.00011917106319237386,
252
- "loss": 9.2678,
253
- "step": 280
254
- },
255
- {
256
- "epoch": 22.307692307692307,
257
- "grad_norm": 0.404296875,
258
- "learning_rate": 0.00011373535578184082,
259
- "loss": 9.2496,
260
- "step": 290
261
  },
262
  {
263
- "epoch": 23.076923076923077,
264
- "grad_norm": 0.40625,
265
  "learning_rate": 0.00010825793454723325,
266
- "loss": 9.2338,
267
- "step": 300
268
- },
269
- {
270
- "epoch": 23.076923076923077,
271
- "eval_loss": 9.241479873657227,
272
- "eval_runtime": 5.2815,
273
- "eval_samples_per_second": 284.199,
274
- "eval_steps_per_second": 4.544,
275
- "step": 300
276
- },
277
- {
278
- "epoch": 23.846153846153847,
279
- "grad_norm": 0.412109375,
280
- "learning_rate": 0.00010275543423681621,
281
- "loss": 9.2185,
282
- "step": 310
283
- },
284
- {
285
- "epoch": 24.615384615384617,
286
- "grad_norm": 0.4140625,
287
- "learning_rate": 9.724456576318381e-05,
288
- "loss": 9.2101,
289
- "step": 320
290
- },
291
- {
292
- "epoch": 25.384615384615383,
293
- "grad_norm": 0.412109375,
294
- "learning_rate": 9.174206545276677e-05,
295
- "loss": 9.2002,
296
- "step": 330
297
- },
298
- {
299
- "epoch": 26.153846153846153,
300
- "grad_norm": 0.416015625,
301
- "learning_rate": 8.626464421815919e-05,
302
- "loss": 9.193,
303
- "step": 340
304
- },
305
- {
306
- "epoch": 26.923076923076923,
307
- "grad_norm": 0.41015625,
308
- "learning_rate": 8.082893680762619e-05,
309
- "loss": 9.1896,
310
- "step": 350
311
- },
312
- {
313
- "epoch": 26.923076923076923,
314
- "eval_loss": 9.20389461517334,
315
- "eval_runtime": 5.2643,
316
- "eval_samples_per_second": 285.129,
317
- "eval_steps_per_second": 4.559,
318
- "step": 350
319
  },
320
  {
321
- "epoch": 27.692307692307693,
322
- "grad_norm": 0.416015625,
323
  "learning_rate": 7.54514512859201e-05,
324
- "loss": 9.186,
325
- "step": 360
326
- },
327
- {
328
- "epoch": 28.46153846153846,
329
- "grad_norm": 0.412109375,
330
- "learning_rate": 7.014851889983057e-05,
331
- "loss": 9.182,
332
- "step": 370
333
- },
334
- {
335
- "epoch": 29.23076923076923,
336
- "grad_norm": 0.416015625,
337
- "learning_rate": 6.493624448072457e-05,
338
- "loss": 9.1823,
339
- "step": 380
340
- },
341
- {
342
- "epoch": 30.0,
343
- "grad_norm": 0.4140625,
344
- "learning_rate": 5.983045753470308e-05,
345
- "loss": 9.1806,
346
- "step": 390
347
- },
348
- {
349
- "epoch": 30.76923076923077,
350
- "grad_norm": 0.419921875,
351
- "learning_rate": 5.484666416891109e-05,
352
- "loss": 9.18,
353
- "step": 400
354
- },
355
- {
356
- "epoch": 30.76923076923077,
357
- "eval_loss": 9.195965766906738,
358
- "eval_runtime": 5.2708,
359
- "eval_samples_per_second": 284.775,
360
- "eval_steps_per_second": 4.553,
361
- "step": 400
362
- },
363
- {
364
- "epoch": 31.53846153846154,
365
- "grad_norm": 0.4140625,
366
- "learning_rate": 5.000000000000002e-05,
367
- "loss": 9.1787,
368
- "step": 410
369
  },
370
  {
371
- "epoch": 32.30769230769231,
372
- "grad_norm": 0.419921875,
373
  "learning_rate": 4.530518418775733e-05,
374
- "loss": 9.1782,
375
- "step": 420
376
- },
377
- {
378
- "epoch": 33.07692307692308,
379
- "grad_norm": 0.41796875,
380
- "learning_rate": 4.077647473350201e-05,
381
- "loss": 9.1776,
382
- "step": 430
383
- },
384
- {
385
- "epoch": 33.84615384615385,
386
- "grad_norm": 0.416015625,
387
- "learning_rate": 3.642762517900322e-05,
388
- "loss": 9.1787,
389
- "step": 440
390
- },
391
- {
392
- "epoch": 34.61538461538461,
393
- "grad_norm": 0.416015625,
394
- "learning_rate": 3.227184283742591e-05,
395
- "loss": 9.1777,
396
- "step": 450
397
- },
398
- {
399
- "epoch": 34.61538461538461,
400
- "eval_loss": 9.195714950561523,
401
- "eval_runtime": 5.2816,
402
- "eval_samples_per_second": 284.194,
403
- "eval_steps_per_second": 4.544,
404
- "step": 450
405
- },
406
- {
407
- "epoch": 35.38461538461539,
408
- "grad_norm": 0.4140625,
409
- "learning_rate": 2.8321748683154893e-05,
410
- "loss": 9.1782,
411
- "step": 460
412
- },
413
- {
414
- "epoch": 36.15384615384615,
415
- "grad_norm": 0.419921875,
416
- "learning_rate": 2.4589339022310386e-05,
417
- "loss": 9.1783,
418
- "step": 470
419
  },
420
  {
421
- "epoch": 36.92307692307692,
422
- "grad_norm": 0.416015625,
423
  "learning_rate": 2.1085949060360654e-05,
424
- "loss": 9.1769,
425
- "step": 480
426
- },
427
- {
428
- "epoch": 37.69230769230769,
429
- "grad_norm": 0.4140625,
430
- "learning_rate": 1.7822218477475494e-05,
431
- "loss": 9.1779,
432
- "step": 490
433
- },
434
- {
435
- "epoch": 38.46153846153846,
436
- "grad_norm": 0.419921875,
437
- "learning_rate": 1.4808059116167305e-05,
438
- "loss": 9.1781,
439
- "step": 500
440
- },
441
- {
442
- "epoch": 38.46153846153846,
443
- "eval_loss": 9.193068504333496,
444
- "eval_runtime": 5.6598,
445
- "eval_samples_per_second": 265.206,
446
- "eval_steps_per_second": 4.24,
447
- "step": 500
448
- },
449
- {
450
- "epoch": 39.23076923076923,
451
- "grad_norm": 0.419921875,
452
- "learning_rate": 1.2052624879351104e-05,
453
- "loss": 9.1771,
454
- "step": 510
455
- },
456
- {
457
- "epoch": 40.0,
458
- "grad_norm": 0.41796875,
459
- "learning_rate": 9.564283930242257e-06,
460
- "loss": 9.1775,
461
- "step": 520
462
- },
463
- {
464
- "epoch": 40.76923076923077,
465
- "grad_norm": 0.416015625,
466
- "learning_rate": 7.350593278519824e-06,
467
- "loss": 9.1773,
468
- "step": 530
469
  },
470
  {
471
- "epoch": 41.53846153846154,
472
- "grad_norm": 0.41796875,
473
  "learning_rate": 5.418275829936537e-06,
474
- "loss": 9.1787,
475
- "step": 540
476
- },
477
- {
478
- "epoch": 42.30769230769231,
479
- "grad_norm": 0.41796875,
480
- "learning_rate": 3.7731999690749585e-06,
481
- "loss": 9.1761,
482
- "step": 550
483
- },
484
- {
485
- "epoch": 42.30769230769231,
486
- "eval_loss": 9.19363784790039,
487
- "eval_runtime": 5.2646,
488
- "eval_samples_per_second": 285.113,
489
- "eval_steps_per_second": 4.559,
490
- "step": 550
491
- },
492
- {
493
- "epoch": 43.07692307692308,
494
- "grad_norm": 0.4140625,
495
- "learning_rate": 2.420361737256438e-06,
496
- "loss": 9.1784,
497
- "step": 560
498
- },
499
- {
500
- "epoch": 43.84615384615385,
501
- "grad_norm": 0.41796875,
502
- "learning_rate": 1.3638696597277679e-06,
503
- "loss": 9.1777,
504
- "step": 570
505
- },
506
- {
507
- "epoch": 44.61538461538461,
508
- "grad_norm": 0.419921875,
509
- "learning_rate": 6.069322682050516e-07,
510
- "loss": 9.1766,
511
- "step": 580
512
- },
513
- {
514
- "epoch": 45.38461538461539,
515
- "grad_norm": 0.412109375,
516
- "learning_rate": 1.518483566683826e-07,
517
- "loss": 9.1785,
518
- "step": 590
519
  },
520
  {
521
- "epoch": 46.15384615384615,
522
- "grad_norm": 0.419921875,
523
  "learning_rate": 0.0,
524
- "loss": 9.1762,
525
- "step": 600
526
- },
527
- {
528
- "epoch": 46.15384615384615,
529
- "eval_loss": 9.194318771362305,
530
- "eval_runtime": 5.2604,
531
- "eval_samples_per_second": 285.34,
532
- "eval_steps_per_second": 4.562,
533
- "step": 600
534
  }
535
  ],
536
  "logging_steps": 10,
537
- "max_steps": 600,
538
  "num_input_tokens_seen": 0,
539
- "num_train_epochs": 47,
540
- "save_steps": 50,
541
  "stateful_callbacks": {
542
  "TrainerControl": {
543
  "args": {
@@ -550,7 +104,7 @@
550
  "attributes": {}
551
  }
552
  },
553
- "total_flos": 245495129702400.0,
554
  "train_batch_size": 32,
555
  "trial_name": null,
556
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 16.666666666666668,
5
+ "eval_steps": 200,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.16666666666666666,
13
+ "eval_loss": 10.376375198364258,
14
+ "eval_runtime": 2.371,
15
+ "eval_samples_per_second": 633.075,
16
+ "eval_steps_per_second": 5.061,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 1.6666666666666665,
21
+ "grad_norm": 0.09375,
22
+ "learning_rate": 0.00019863613034027224,
23
+ "loss": 10.3756,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 3.3333333333333335,
28
+ "grad_norm": 0.12353515625,
29
+ "learning_rate": 0.0001879473751206489,
30
+ "loss": 10.3632,
31
  "step": 20
32
  },
33
  {
34
+ "epoch": 5.0,
35
+ "grad_norm": 0.2236328125,
36
+ "learning_rate": 0.00016772815716257412,
37
+ "loss": 10.3433,
38
  "step": 30
39
  },
40
  {
41
+ "epoch": 6.666666666666667,
 
 
 
 
 
 
 
42
  "grad_norm": 0.3203125,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  "learning_rate": 0.00014016954246529696,
44
+ "loss": 10.3073,
45
+ "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  },
47
  {
48
+ "epoch": 8.333333333333334,
49
+ "grad_norm": 0.330078125,
50
  "learning_rate": 0.00010825793454723325,
51
+ "loss": 10.2602,
52
+ "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  },
54
  {
55
+ "epoch": 10.0,
56
+ "grad_norm": 0.326171875,
57
  "learning_rate": 7.54514512859201e-05,
58
+ "loss": 10.2203,
59
+ "step": 60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  },
61
  {
62
+ "epoch": 11.666666666666666,
63
+ "grad_norm": 0.326171875,
64
  "learning_rate": 4.530518418775733e-05,
65
+ "loss": 10.1945,
66
+ "step": 70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  },
68
  {
69
+ "epoch": 13.333333333333334,
70
+ "grad_norm": 0.328125,
71
  "learning_rate": 2.1085949060360654e-05,
72
+ "loss": 10.1812,
73
+ "step": 80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  },
75
  {
76
+ "epoch": 15.0,
77
+ "grad_norm": 0.328125,
78
  "learning_rate": 5.418275829936537e-06,
79
+ "loss": 10.1773,
80
+ "step": 90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  },
82
  {
83
+ "epoch": 16.666666666666668,
84
+ "grad_norm": 0.328125,
85
  "learning_rate": 0.0,
86
+ "loss": 10.1767,
87
+ "step": 100
 
 
 
 
 
 
 
 
88
  }
89
  ],
90
  "logging_steps": 10,
91
+ "max_steps": 100,
92
  "num_input_tokens_seen": 0,
93
+ "num_train_epochs": 17,
94
+ "save_steps": 200,
95
  "stateful_callbacks": {
96
  "TrainerControl": {
97
  "args": {
 
104
  "attributes": {}
105
  }
106
  },
107
+ "total_flos": 81831709900800.0,
108
  "train_batch_size": 32,
109
  "trial_name": null,
110
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:120614a5953861571b441322c20a1add8c543820ada9a66731f41e9cf2b7b9de
3
  size 6904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cfc960a203b27f090e467325bfefda4ba62fdf00b36c1ffe04aa88d3518354f
3
  size 6904