SystemAdmin123 commited on
Commit
15010b5
·
verified ·
1 Parent(s): a1cab8d

Training in progress, step 20, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4156994b0d2538aa2d37af1880314c638cc4d24a008e661dbcf7a289e3478dca
3
  size 2066752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9b841b09c9dd8f6d4d92bae6a6575aadd48707d9b68b71831107d9f2bc76adb
3
  size 2066752
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fba8dbfefb423350e220e5a13f8bccbed1d56b9ffd0aa6cc568c8e5687f191cf
3
  size 2162798
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fd23b2c7ff318c50bf906b4c5c4326c4f1fbed5724d8cbe6c56c8b59db03399
3
  size 2162798
last-checkpoint/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ac85e2287c8599899f85951b17c5e4c4f197b5b4d1cc47f24edbb3e1bc0f460
3
+ size 14512
last-checkpoint/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c80ddb4d12adee2e9c1a55ece2b5416bdcce910d1d208bbc2d42ed23c071a2b
3
+ size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b204b20cdc92a140e2e21e015bdaa04af008c00e0bde30e59edf0f23817a338
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3d7496cd5cad5fcd343f51f06f864ca525a833da3ba71e9c4d90915510423ac
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1873 +1,41 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7398638650488311,
5
- "eval_steps": 200,
6
- "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0002959455460195324,
13
- "eval_loss": 10.376261711120605,
14
- "eval_runtime": 10.819,
15
- "eval_samples_per_second": 138.829,
16
- "eval_steps_per_second": 34.754,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.002959455460195324,
21
- "grad_norm": 0.298828125,
22
- "learning_rate": 1.6000000000000003e-05,
23
- "loss": 10.3804,
24
  "step": 10
25
  },
26
  {
27
- "epoch": 0.005918910920390648,
28
- "grad_norm": 0.357421875,
29
- "learning_rate": 3.2000000000000005e-05,
30
- "loss": 10.3767,
31
- "step": 20
32
- },
33
- {
34
- "epoch": 0.008878366380585973,
35
- "grad_norm": 0.443359375,
36
- "learning_rate": 4.8e-05,
37
  "loss": 10.3754,
38
- "step": 30
39
- },
40
- {
41
- "epoch": 0.011837821840781295,
42
- "grad_norm": 0.5625,
43
- "learning_rate": 6.400000000000001e-05,
44
- "loss": 10.3767,
45
- "step": 40
46
- },
47
- {
48
- "epoch": 0.01479727730097662,
49
- "grad_norm": 1.109375,
50
- "learning_rate": 8e-05,
51
- "loss": 10.3722,
52
- "step": 50
53
- },
54
- {
55
- "epoch": 0.017756732761171946,
56
- "grad_norm": 0.294921875,
57
- "learning_rate": 9.6e-05,
58
- "loss": 10.3804,
59
- "step": 60
60
- },
61
- {
62
- "epoch": 0.020716188221367268,
63
- "grad_norm": 0.373046875,
64
- "learning_rate": 0.00011200000000000001,
65
- "loss": 10.3739,
66
- "step": 70
67
- },
68
- {
69
- "epoch": 0.02367564368156259,
70
- "grad_norm": 0.42578125,
71
- "learning_rate": 0.00012800000000000002,
72
- "loss": 10.3736,
73
- "step": 80
74
- },
75
- {
76
- "epoch": 0.026635099141757917,
77
- "grad_norm": 0.70703125,
78
- "learning_rate": 0.000144,
79
- "loss": 10.3643,
80
- "step": 90
81
- },
82
- {
83
- "epoch": 0.02959455460195324,
84
- "grad_norm": 1.59375,
85
- "learning_rate": 0.00016,
86
- "loss": 10.364,
87
- "step": 100
88
- },
89
- {
90
- "epoch": 0.032554010062148565,
91
- "grad_norm": 0.478515625,
92
- "learning_rate": 0.00017600000000000002,
93
- "loss": 10.3561,
94
- "step": 110
95
- },
96
- {
97
- "epoch": 0.03551346552234389,
98
- "grad_norm": 0.73046875,
99
- "learning_rate": 0.000192,
100
- "loss": 10.3211,
101
- "step": 120
102
- },
103
- {
104
- "epoch": 0.03847292098253921,
105
- "grad_norm": 0.84375,
106
- "learning_rate": 0.0001999978128380225,
107
- "loss": 10.2582,
108
- "step": 130
109
- },
110
- {
111
- "epoch": 0.041432376442734536,
112
- "grad_norm": 0.72265625,
113
- "learning_rate": 0.0001999803161162393,
114
- "loss": 10.172,
115
- "step": 140
116
- },
117
- {
118
- "epoch": 0.04439183190292986,
119
- "grad_norm": 1.2734375,
120
- "learning_rate": 0.00019994532573409262,
121
- "loss": 10.1033,
122
- "step": 150
123
- },
124
- {
125
- "epoch": 0.04735128736312518,
126
- "grad_norm": 0.43359375,
127
- "learning_rate": 0.00019989284781388617,
128
- "loss": 10.0041,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.05031074282332051,
133
- "grad_norm": 0.41796875,
134
- "learning_rate": 0.00019982289153773646,
135
- "loss": 9.9331,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 0.053270198283515834,
140
- "grad_norm": 0.46875,
141
- "learning_rate": 0.00019973546914596623,
142
- "loss": 9.8548,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 0.05622965374371116,
147
- "grad_norm": 0.64453125,
148
- "learning_rate": 0.00019963059593496268,
149
- "loss": 9.7692,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 0.05918910920390648,
154
- "grad_norm": 1.140625,
155
- "learning_rate": 0.00019950829025450114,
156
- "loss": 9.7054,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 0.05918910920390648,
161
- "eval_loss": 9.686193466186523,
162
- "eval_runtime": 20.1405,
163
- "eval_samples_per_second": 74.576,
164
- "eval_steps_per_second": 18.669,
165
- "step": 200
166
- },
167
- {
168
- "epoch": 0.062148564664101805,
169
- "grad_norm": 0.46484375,
170
- "learning_rate": 0.0001993685735045343,
171
- "loss": 9.6486,
172
- "step": 210
173
- },
174
- {
175
- "epoch": 0.06510802012429713,
176
- "grad_norm": 0.51171875,
177
- "learning_rate": 0.0001992114701314478,
178
- "loss": 9.6029,
179
- "step": 220
180
- },
181
- {
182
- "epoch": 0.06806747558449246,
183
- "grad_norm": 0.5078125,
184
- "learning_rate": 0.000199037007623783,
185
- "loss": 9.5554,
186
- "step": 230
187
- },
188
- {
189
- "epoch": 0.07102693104468778,
190
- "grad_norm": 0.609375,
191
- "learning_rate": 0.00019884521650742715,
192
- "loss": 9.4941,
193
- "step": 240
194
- },
195
- {
196
- "epoch": 0.0739863865048831,
197
- "grad_norm": 1.78125,
198
- "learning_rate": 0.00019863613034027224,
199
- "loss": 9.508,
200
- "step": 250
201
- },
202
- {
203
- "epoch": 0.07694584196507842,
204
- "grad_norm": 0.5078125,
205
- "learning_rate": 0.0001984097857063434,
206
- "loss": 9.3502,
207
- "step": 260
208
- },
209
- {
210
- "epoch": 0.07990529742527375,
211
- "grad_norm": 0.55859375,
212
- "learning_rate": 0.0001981662222093976,
213
- "loss": 9.3473,
214
- "step": 270
215
- },
216
- {
217
- "epoch": 0.08286475288546907,
218
- "grad_norm": 0.5234375,
219
- "learning_rate": 0.00019790548246599447,
220
- "loss": 9.2955,
221
- "step": 280
222
- },
223
- {
224
- "epoch": 0.0858242083456644,
225
- "grad_norm": 0.625,
226
- "learning_rate": 0.00019762761209803927,
227
- "loss": 9.2712,
228
- "step": 290
229
- },
230
- {
231
- "epoch": 0.08878366380585972,
232
- "grad_norm": 1.140625,
233
- "learning_rate": 0.0001973326597248006,
234
- "loss": 9.2969,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 0.09174311926605505,
239
- "grad_norm": 0.455078125,
240
- "learning_rate": 0.00019702067695440332,
241
- "loss": 9.1616,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 0.09470257472625036,
246
- "grad_norm": 0.4609375,
247
- "learning_rate": 0.00019669171837479873,
248
- "loss": 9.1605,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 0.09766203018644569,
253
- "grad_norm": 0.474609375,
254
- "learning_rate": 0.00019634584154421317,
255
- "loss": 9.1402,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 0.10062148564664102,
260
- "grad_norm": 0.578125,
261
- "learning_rate": 0.00019598310698107702,
262
- "loss": 9.0839,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 0.10358094110683634,
267
- "grad_norm": 1.296875,
268
- "learning_rate": 0.00019560357815343577,
269
- "loss": 9.0709,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 0.10654039656703167,
274
- "grad_norm": 0.57421875,
275
- "learning_rate": 0.00019520732146784491,
276
- "loss": 9.0372,
277
- "step": 360
278
- },
279
- {
280
- "epoch": 0.109499852027227,
281
- "grad_norm": 0.76953125,
282
- "learning_rate": 0.0001947944062577507,
283
- "loss": 9.0209,
284
- "step": 370
285
- },
286
- {
287
- "epoch": 0.11245930748742232,
288
- "grad_norm": 0.5390625,
289
- "learning_rate": 0.00019436490477135878,
290
- "loss": 8.9724,
291
- "step": 380
292
- },
293
- {
294
- "epoch": 0.11541876294761765,
295
- "grad_norm": 0.6171875,
296
- "learning_rate": 0.00019391889215899299,
297
- "loss": 9.0212,
298
- "step": 390
299
- },
300
- {
301
- "epoch": 0.11837821840781296,
302
- "grad_norm": 1.421875,
303
- "learning_rate": 0.0001934564464599461,
304
- "loss": 8.9091,
305
- "step": 400
306
- },
307
- {
308
- "epoch": 0.11837821840781296,
309
- "eval_loss": 8.961220741271973,
310
- "eval_runtime": 13.0065,
311
- "eval_samples_per_second": 115.48,
312
- "eval_steps_per_second": 28.909,
313
- "step": 400
314
- },
315
- {
316
- "epoch": 0.12133767386800828,
317
- "grad_norm": 0.443359375,
318
- "learning_rate": 0.00019297764858882514,
319
- "loss": 8.9547,
320
- "step": 410
321
- },
322
- {
323
- "epoch": 0.12429712932820361,
324
- "grad_norm": 0.466796875,
325
- "learning_rate": 0.00019248258232139388,
326
- "loss": 8.9394,
327
- "step": 420
328
- },
329
- {
330
- "epoch": 0.12725658478839894,
331
- "grad_norm": 0.61328125,
332
- "learning_rate": 0.00019197133427991436,
333
- "loss": 8.9748,
334
- "step": 430
335
- },
336
- {
337
- "epoch": 0.13021604024859426,
338
- "grad_norm": 0.73046875,
339
- "learning_rate": 0.00019144399391799043,
340
- "loss": 8.9198,
341
- "step": 440
342
- },
343
- {
344
- "epoch": 0.1331754957087896,
345
- "grad_norm": 1.203125,
346
- "learning_rate": 0.00019090065350491626,
347
- "loss": 8.8904,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 0.1361349511689849,
352
- "grad_norm": 0.494140625,
353
- "learning_rate": 0.0001903414081095315,
354
- "loss": 8.8971,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 0.13909440662918024,
359
- "grad_norm": 0.48046875,
360
- "learning_rate": 0.00018976635558358722,
361
- "loss": 8.84,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 0.14205386208937557,
366
- "grad_norm": 0.55859375,
367
- "learning_rate": 0.00018917559654462474,
368
- "loss": 8.838,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 0.1450133175495709,
373
- "grad_norm": 0.5703125,
374
- "learning_rate": 0.00018856923435837022,
375
- "loss": 8.7761,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 0.1479727730097662,
380
- "grad_norm": 0.96875,
381
- "learning_rate": 0.0001879473751206489,
382
- "loss": 8.8421,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 0.15093222846996152,
387
- "grad_norm": 0.478515625,
388
- "learning_rate": 0.00018731012763882133,
389
- "loss": 8.7691,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 0.15389168393015684,
394
- "grad_norm": 0.4921875,
395
- "learning_rate": 0.00018665760341274505,
396
- "loss": 8.7749,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 0.15685113939035217,
401
- "grad_norm": 0.51171875,
402
- "learning_rate": 0.00018598991661526572,
403
- "loss": 8.79,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 0.1598105948505475,
408
- "grad_norm": 0.58203125,
409
- "learning_rate": 0.00018530718407223974,
410
- "loss": 8.8742,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 0.16277005031074282,
415
- "grad_norm": 1.234375,
416
- "learning_rate": 0.00018460952524209355,
417
- "loss": 8.7845,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 0.16572950577093815,
422
- "grad_norm": 0.470703125,
423
- "learning_rate": 0.00018389706219492147,
424
- "loss": 8.8165,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 0.16868896123113347,
429
- "grad_norm": 0.486328125,
430
- "learning_rate": 0.00018316991959112716,
431
- "loss": 8.7024,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 0.1716484166913288,
436
- "grad_norm": 0.53515625,
437
- "learning_rate": 0.00018242822465961176,
438
- "loss": 8.7764,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 0.17460787215152412,
443
- "grad_norm": 0.58984375,
444
- "learning_rate": 0.00018167210717551224,
445
- "loss": 8.7501,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 0.17756732761171945,
450
- "grad_norm": 1.28125,
451
- "learning_rate": 0.00018090169943749476,
452
- "loss": 8.7257,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 0.17756732761171945,
457
- "eval_loss": 8.762685775756836,
458
- "eval_runtime": 18.9408,
459
- "eval_samples_per_second": 79.3,
460
- "eval_steps_per_second": 19.851,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 0.18052678307191478,
465
- "grad_norm": 0.54296875,
466
- "learning_rate": 0.00018011713624460608,
467
- "loss": 8.7709,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 0.1834862385321101,
472
- "grad_norm": 0.53515625,
473
- "learning_rate": 0.00017931855487268782,
474
- "loss": 8.7334,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 0.18644569399230543,
479
- "grad_norm": 0.56640625,
480
- "learning_rate": 0.0001785060950503568,
481
- "loss": 8.824,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 0.18940514945250073,
486
- "grad_norm": 0.69921875,
487
- "learning_rate": 0.00017767989893455698,
488
- "loss": 8.6731,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 0.19236460491269605,
493
- "grad_norm": 0.90625,
494
- "learning_rate": 0.00017684011108568592,
495
- "loss": 8.7669,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 0.19532406037289138,
500
- "grad_norm": 0.49609375,
501
- "learning_rate": 0.00017598687844230088,
502
- "loss": 8.6911,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 0.1982835158330867,
507
- "grad_norm": 0.44140625,
508
- "learning_rate": 0.00017512035029540885,
509
- "loss": 8.6932,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 0.20124297129328203,
514
- "grad_norm": 0.52734375,
515
- "learning_rate": 0.000174240678262345,
516
- "loss": 8.71,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 0.20420242675347736,
521
- "grad_norm": 0.59375,
522
- "learning_rate": 0.000173348016260244,
523
- "loss": 8.7219,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 0.20716188221367268,
528
- "grad_norm": 1.3515625,
529
- "learning_rate": 0.00017244252047910892,
530
- "loss": 8.6973,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 0.210121337673868,
535
- "grad_norm": 0.462890625,
536
- "learning_rate": 0.00017152434935448256,
537
- "loss": 8.6743,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 0.21308079313406333,
542
- "grad_norm": 0.451171875,
543
- "learning_rate": 0.0001705936635397259,
544
- "loss": 8.7094,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 0.21604024859425866,
549
- "grad_norm": 0.57421875,
550
- "learning_rate": 0.00016965062587790823,
551
- "loss": 8.7353,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 0.218999704054454,
556
- "grad_norm": 0.5546875,
557
- "learning_rate": 0.00016869540137331445,
558
- "loss": 8.6939,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 0.2219591595146493,
563
- "grad_norm": 1.0703125,
564
- "learning_rate": 0.00016772815716257412,
565
- "loss": 8.7202,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 0.22491861497484464,
570
- "grad_norm": 0.51171875,
571
- "learning_rate": 0.00016674906248541726,
572
- "loss": 8.6779,
573
- "step": 760
574
- },
575
- {
576
- "epoch": 0.22787807043503996,
577
- "grad_norm": 0.671875,
578
- "learning_rate": 0.00016575828865506245,
579
- "loss": 8.6627,
580
- "step": 770
581
- },
582
- {
583
- "epoch": 0.2308375258952353,
584
- "grad_norm": 0.4375,
585
- "learning_rate": 0.0001647560090282419,
586
- "loss": 8.7348,
587
- "step": 780
588
- },
589
- {
590
- "epoch": 0.2337969813554306,
591
- "grad_norm": 0.6875,
592
- "learning_rate": 0.000163742398974869,
593
- "loss": 8.7236,
594
- "step": 790
595
- },
596
- {
597
- "epoch": 0.23675643681562591,
598
- "grad_norm": 1.4140625,
599
- "learning_rate": 0.0001627176358473537,
600
- "loss": 8.7416,
601
- "step": 800
602
- },
603
- {
604
- "epoch": 0.23675643681562591,
605
- "eval_loss": 8.710856437683105,
606
- "eval_runtime": 16.7859,
607
- "eval_samples_per_second": 89.48,
608
- "eval_steps_per_second": 22.4,
609
- "step": 800
610
- },
611
- {
612
- "epoch": 0.23971589227582124,
613
- "grad_norm": 0.47265625,
614
- "learning_rate": 0.0001616818989495711,
615
- "loss": 8.7235,
616
- "step": 810
617
- },
618
- {
619
- "epoch": 0.24267534773601657,
620
- "grad_norm": 0.447265625,
621
- "learning_rate": 0.00016063536950548826,
622
- "loss": 8.7121,
623
- "step": 820
624
- },
625
- {
626
- "epoch": 0.2456348031962119,
627
- "grad_norm": 0.50390625,
628
- "learning_rate": 0.0001595782306274553,
629
- "loss": 8.741,
630
- "step": 830
631
- },
632
- {
633
- "epoch": 0.24859425865640722,
634
- "grad_norm": 0.58203125,
635
- "learning_rate": 0.00015851066728416618,
636
- "loss": 8.6978,
637
- "step": 840
638
- },
639
- {
640
- "epoch": 0.25155371411660254,
641
- "grad_norm": 1.296875,
642
- "learning_rate": 0.00015743286626829437,
643
- "loss": 8.7496,
644
- "step": 850
645
- },
646
- {
647
- "epoch": 0.25451316957679787,
648
- "grad_norm": 0.455078125,
649
- "learning_rate": 0.00015634501616380967,
650
- "loss": 8.6913,
651
- "step": 860
652
- },
653
- {
654
- "epoch": 0.2574726250369932,
655
- "grad_norm": 0.5078125,
656
- "learning_rate": 0.00015524730731298134,
657
- "loss": 8.6728,
658
- "step": 870
659
- },
660
- {
661
- "epoch": 0.2604320804971885,
662
- "grad_norm": 0.5,
663
- "learning_rate": 0.0001541399317830738,
664
- "loss": 8.6724,
665
- "step": 880
666
- },
667
- {
668
- "epoch": 0.26339153595738385,
669
- "grad_norm": 0.8359375,
670
- "learning_rate": 0.0001530230833327405,
671
- "loss": 8.763,
672
- "step": 890
673
- },
674
- {
675
- "epoch": 0.2663509914175792,
676
- "grad_norm": 0.953125,
677
- "learning_rate": 0.00015189695737812152,
678
- "loss": 8.6008,
679
- "step": 900
680
- },
681
- {
682
- "epoch": 0.2693104468777745,
683
- "grad_norm": 0.455078125,
684
- "learning_rate": 0.0001507617509586517,
685
- "loss": 8.7395,
686
- "step": 910
687
- },
688
- {
689
- "epoch": 0.2722699023379698,
690
- "grad_norm": 0.47265625,
691
- "learning_rate": 0.00014961766270258422,
692
- "loss": 8.6413,
693
- "step": 920
694
- },
695
- {
696
- "epoch": 0.27522935779816515,
697
- "grad_norm": 0.5078125,
698
- "learning_rate": 0.00014846489279223652,
699
- "loss": 8.7083,
700
- "step": 930
701
- },
702
- {
703
- "epoch": 0.2781888132583605,
704
- "grad_norm": 0.609375,
705
- "learning_rate": 0.0001473036429289641,
706
- "loss": 8.6829,
707
- "step": 940
708
- },
709
- {
710
- "epoch": 0.2811482687185558,
711
- "grad_norm": 1.34375,
712
- "learning_rate": 0.0001461341162978688,
713
- "loss": 8.6955,
714
- "step": 950
715
- },
716
- {
717
- "epoch": 0.28410772417875113,
718
- "grad_norm": 0.419921875,
719
- "learning_rate": 0.00014495651753224705,
720
- "loss": 8.6962,
721
- "step": 960
722
- },
723
- {
724
- "epoch": 0.28706717963894646,
725
- "grad_norm": 0.478515625,
726
- "learning_rate": 0.00014377105267778518,
727
- "loss": 8.7166,
728
- "step": 970
729
- },
730
- {
731
- "epoch": 0.2900266350991418,
732
- "grad_norm": 0.55859375,
733
- "learning_rate": 0.00014257792915650728,
734
- "loss": 8.6469,
735
- "step": 980
736
- },
737
- {
738
- "epoch": 0.2929860905593371,
739
- "grad_norm": 0.6640625,
740
- "learning_rate": 0.00014137735573048233,
741
- "loss": 8.6999,
742
- "step": 990
743
- },
744
- {
745
- "epoch": 0.2959455460195324,
746
- "grad_norm": 1.1171875,
747
- "learning_rate": 0.00014016954246529696,
748
- "loss": 8.5944,
749
- "step": 1000
750
- },
751
- {
752
- "epoch": 0.2959455460195324,
753
- "eval_loss": 8.698212623596191,
754
- "eval_runtime": 13.7844,
755
- "eval_samples_per_second": 108.964,
756
- "eval_steps_per_second": 27.277,
757
- "step": 1000
758
- },
759
- {
760
- "epoch": 0.2989050014797277,
761
- "grad_norm": 0.5546875,
762
- "learning_rate": 0.00013895470069330004,
763
- "loss": 8.7432,
764
- "step": 1010
765
- },
766
- {
767
- "epoch": 0.30186445693992303,
768
- "grad_norm": 0.498046875,
769
- "learning_rate": 0.00013773304297662559,
770
- "loss": 8.6772,
771
- "step": 1020
772
- },
773
- {
774
- "epoch": 0.30482391240011836,
775
- "grad_norm": 0.671875,
776
- "learning_rate": 0.00013650478307000057,
777
- "loss": 8.73,
778
- "step": 1030
779
- },
780
- {
781
- "epoch": 0.3077833678603137,
782
- "grad_norm": 0.77734375,
783
- "learning_rate": 0.00013527013588334415,
784
- "loss": 8.7362,
785
- "step": 1040
786
- },
787
- {
788
- "epoch": 0.310742823320509,
789
- "grad_norm": 1.296875,
790
- "learning_rate": 0.00013402931744416433,
791
- "loss": 8.6947,
792
- "step": 1050
793
- },
794
- {
795
- "epoch": 0.31370227878070434,
796
- "grad_norm": 0.451171875,
797
- "learning_rate": 0.00013278254485975976,
798
- "loss": 8.6919,
799
- "step": 1060
800
- },
801
- {
802
- "epoch": 0.31666173424089966,
803
- "grad_norm": 0.59375,
804
- "learning_rate": 0.00013153003627923218,
805
- "loss": 8.7202,
806
- "step": 1070
807
- },
808
- {
809
- "epoch": 0.319621189701095,
810
- "grad_norm": 0.58203125,
811
- "learning_rate": 0.00013027201085531634,
812
- "loss": 8.7236,
813
- "step": 1080
814
- },
815
- {
816
- "epoch": 0.3225806451612903,
817
- "grad_norm": 0.640625,
818
- "learning_rate": 0.00012900868870603503,
819
- "loss": 8.7817,
820
- "step": 1090
821
- },
822
- {
823
- "epoch": 0.32554010062148564,
824
- "grad_norm": 1.1015625,
825
- "learning_rate": 0.00012774029087618446,
826
- "loss": 8.8011,
827
- "step": 1100
828
- },
829
- {
830
- "epoch": 0.32849955608168097,
831
- "grad_norm": 0.462890625,
832
- "learning_rate": 0.00012646703929865817,
833
- "loss": 8.687,
834
- "step": 1110
835
- },
836
- {
837
- "epoch": 0.3314590115418763,
838
- "grad_norm": 0.47265625,
839
- "learning_rate": 0.00012518915675561483,
840
- "loss": 8.6354,
841
- "step": 1120
842
- },
843
- {
844
- "epoch": 0.3344184670020716,
845
- "grad_norm": 0.578125,
846
- "learning_rate": 0.00012390686683949798,
847
- "loss": 8.6407,
848
- "step": 1130
849
- },
850
- {
851
- "epoch": 0.33737792246226694,
852
- "grad_norm": 0.71875,
853
- "learning_rate": 0.00012262039391391404,
854
- "loss": 8.6823,
855
- "step": 1140
856
- },
857
- {
858
- "epoch": 0.34033737792246227,
859
- "grad_norm": 1.6484375,
860
- "learning_rate": 0.0001213299630743747,
861
- "loss": 8.7369,
862
- "step": 1150
863
- },
864
- {
865
- "epoch": 0.3432968333826576,
866
- "grad_norm": 0.48046875,
867
- "learning_rate": 0.00012003580010891213,
868
- "loss": 8.6849,
869
- "step": 1160
870
- },
871
- {
872
- "epoch": 0.3462562888428529,
873
- "grad_norm": 0.55078125,
874
- "learning_rate": 0.00011873813145857249,
875
- "loss": 8.6571,
876
- "step": 1170
877
- },
878
- {
879
- "epoch": 0.34921574430304825,
880
- "grad_norm": 0.51171875,
881
- "learning_rate": 0.00011743718417779517,
882
- "loss": 8.7425,
883
- "step": 1180
884
- },
885
- {
886
- "epoch": 0.3521751997632436,
887
- "grad_norm": 0.9453125,
888
- "learning_rate": 0.00011613318589468511,
889
- "loss": 8.6455,
890
- "step": 1190
891
- },
892
- {
893
- "epoch": 0.3551346552234389,
894
- "grad_norm": 0.8203125,
895
- "learning_rate": 0.0001148263647711842,
896
- "loss": 8.673,
897
- "step": 1200
898
- },
899
- {
900
- "epoch": 0.3551346552234389,
901
- "eval_loss": 8.696282386779785,
902
- "eval_runtime": 20.1067,
903
- "eval_samples_per_second": 74.702,
904
- "eval_steps_per_second": 18.7,
905
- "step": 1200
906
- },
907
- {
908
- "epoch": 0.3580941106836342,
909
- "grad_norm": 0.466796875,
910
- "learning_rate": 0.0001135169494631497,
911
- "loss": 8.6666,
912
- "step": 1210
913
- },
914
- {
915
- "epoch": 0.36105356614382955,
916
- "grad_norm": 0.484375,
917
- "learning_rate": 0.00011220516908034601,
918
- "loss": 8.6954,
919
- "step": 1220
920
- },
921
- {
922
- "epoch": 0.3640130216040249,
923
- "grad_norm": 0.5,
924
- "learning_rate": 0.00011089125314635726,
925
- "loss": 8.7236,
926
- "step": 1230
927
- },
928
- {
929
- "epoch": 0.3669724770642202,
930
- "grad_norm": 0.62109375,
931
- "learning_rate": 0.00010957543155842702,
932
- "loss": 8.7772,
933
- "step": 1240
934
- },
935
- {
936
- "epoch": 0.36993193252441553,
937
- "grad_norm": 0.859375,
938
- "learning_rate": 0.00010825793454723325,
939
- "loss": 8.7222,
940
- "step": 1250
941
- },
942
- {
943
- "epoch": 0.37289138798461086,
944
- "grad_norm": 0.51171875,
945
- "learning_rate": 0.00010693899263660441,
946
- "loss": 8.6544,
947
- "step": 1260
948
- },
949
- {
950
- "epoch": 0.3758508434448062,
951
- "grad_norm": 0.498046875,
952
- "learning_rate": 0.00010561883660318455,
953
- "loss": 8.6639,
954
- "step": 1270
955
- },
956
- {
957
- "epoch": 0.37881029890500145,
958
- "grad_norm": 0.53515625,
959
- "learning_rate": 0.00010429769743605407,
960
- "loss": 8.6819,
961
- "step": 1280
962
- },
963
- {
964
- "epoch": 0.3817697543651968,
965
- "grad_norm": 0.68359375,
966
- "learning_rate": 0.00010297580629631325,
967
- "loss": 8.6511,
968
- "step": 1290
969
- },
970
- {
971
- "epoch": 0.3847292098253921,
972
- "grad_norm": 1.03125,
973
- "learning_rate": 0.00010165339447663587,
974
- "loss": 8.6257,
975
- "step": 1300
976
- },
977
- {
978
- "epoch": 0.38768866528558743,
979
- "grad_norm": 0.43359375,
980
- "learning_rate": 0.00010033069336079952,
981
- "loss": 8.7457,
982
- "step": 1310
983
- },
984
- {
985
- "epoch": 0.39064812074578276,
986
- "grad_norm": 0.482421875,
987
- "learning_rate": 9.900793438320037e-05,
988
- "loss": 8.6771,
989
- "step": 1320
990
- },
991
- {
992
- "epoch": 0.3936075762059781,
993
- "grad_norm": 0.50390625,
994
- "learning_rate": 9.768534898835862e-05,
995
- "loss": 8.6776,
996
- "step": 1330
997
- },
998
- {
999
- "epoch": 0.3965670316661734,
1000
- "grad_norm": 0.6171875,
1001
- "learning_rate": 9.636316859042259e-05,
1002
- "loss": 8.6742,
1003
- "step": 1340
1004
- },
1005
- {
1006
- "epoch": 0.39952648712636873,
1007
- "grad_norm": 0.859375,
1008
- "learning_rate": 9.504162453267777e-05,
1009
- "loss": 8.6419,
1010
- "step": 1350
1011
- },
1012
- {
1013
- "epoch": 0.40248594258656406,
1014
- "grad_norm": 0.451171875,
1015
- "learning_rate": 9.372094804706867e-05,
1016
- "loss": 8.7098,
1017
- "step": 1360
1018
- },
1019
- {
1020
- "epoch": 0.4054453980467594,
1021
- "grad_norm": 0.546875,
1022
- "learning_rate": 9.24013702137397e-05,
1023
- "loss": 8.6633,
1024
- "step": 1370
1025
- },
1026
- {
1027
- "epoch": 0.4084048535069547,
1028
- "grad_norm": 0.5859375,
1029
- "learning_rate": 9.108312192060298e-05,
1030
- "loss": 8.687,
1031
- "step": 1380
1032
- },
1033
- {
1034
- "epoch": 0.41136430896715004,
1035
- "grad_norm": 0.890625,
1036
- "learning_rate": 8.97664338229395e-05,
1037
- "loss": 8.713,
1038
- "step": 1390
1039
- },
1040
- {
1041
- "epoch": 0.41432376442734536,
1042
- "grad_norm": 1.6015625,
1043
- "learning_rate": 8.845153630304139e-05,
1044
- "loss": 8.7511,
1045
- "step": 1400
1046
- },
1047
- {
1048
- "epoch": 0.41432376442734536,
1049
- "eval_loss": 8.69721794128418,
1050
- "eval_runtime": 8.7762,
1051
- "eval_samples_per_second": 171.145,
1052
- "eval_steps_per_second": 42.843,
1053
- "step": 1400
1054
- },
1055
- {
1056
- "epoch": 0.4172832198875407,
1057
- "grad_norm": 0.46484375,
1058
- "learning_rate": 8.713865942990141e-05,
1059
- "loss": 8.6655,
1060
- "step": 1410
1061
- },
1062
- {
1063
- "epoch": 0.420242675347736,
1064
- "grad_norm": 0.55078125,
1065
- "learning_rate": 8.582803291895758e-05,
1066
- "loss": 8.6978,
1067
- "step": 1420
1068
- },
1069
- {
1070
- "epoch": 0.42320213080793134,
1071
- "grad_norm": 0.515625,
1072
- "learning_rate": 8.451988609189987e-05,
1073
- "loss": 8.7573,
1074
- "step": 1430
1075
- },
1076
- {
1077
- "epoch": 0.42616158626812667,
1078
- "grad_norm": 0.70703125,
1079
- "learning_rate": 8.321444783654524e-05,
1080
- "loss": 8.6963,
1081
- "step": 1440
1082
- },
1083
- {
1084
- "epoch": 0.429121041728322,
1085
- "grad_norm": 1.1953125,
1086
- "learning_rate": 8.191194656678904e-05,
1087
- "loss": 8.7627,
1088
- "step": 1450
1089
- },
1090
- {
1091
- "epoch": 0.4320804971885173,
1092
- "grad_norm": 0.53125,
1093
- "learning_rate": 8.061261018263919e-05,
1094
- "loss": 8.6564,
1095
- "step": 1460
1096
- },
1097
- {
1098
- "epoch": 0.43503995264871265,
1099
- "grad_norm": 0.5546875,
1100
- "learning_rate": 7.931666603034033e-05,
1101
- "loss": 8.6641,
1102
- "step": 1470
1103
- },
1104
- {
1105
- "epoch": 0.437999408108908,
1106
- "grad_norm": 0.83203125,
1107
- "learning_rate": 7.80243408625947e-05,
1108
- "loss": 8.6646,
1109
- "step": 1480
1110
- },
1111
- {
1112
- "epoch": 0.4409588635691033,
1113
- "grad_norm": 0.71875,
1114
- "learning_rate": 7.673586079888698e-05,
1115
- "loss": 8.7323,
1116
- "step": 1490
1117
- },
1118
- {
1119
- "epoch": 0.4439183190292986,
1120
- "grad_norm": 1.0078125,
1121
- "learning_rate": 7.54514512859201e-05,
1122
- "loss": 8.6167,
1123
- "step": 1500
1124
- },
1125
- {
1126
- "epoch": 0.44687777448949395,
1127
- "grad_norm": 0.453125,
1128
- "learning_rate": 7.417133705816837e-05,
1129
- "loss": 8.6929,
1130
- "step": 1510
1131
- },
1132
- {
1133
- "epoch": 0.4498372299496893,
1134
- "grad_norm": 0.482421875,
1135
- "learning_rate": 7.289574209855559e-05,
1136
- "loss": 8.6871,
1137
- "step": 1520
1138
- },
1139
- {
1140
- "epoch": 0.4527966854098846,
1141
- "grad_norm": 0.484375,
1142
- "learning_rate": 7.16248895992645e-05,
1143
- "loss": 8.6881,
1144
- "step": 1530
1145
- },
1146
- {
1147
- "epoch": 0.45575614087007993,
1148
- "grad_norm": 0.7578125,
1149
- "learning_rate": 7.035900192268464e-05,
1150
- "loss": 8.6753,
1151
- "step": 1540
1152
- },
1153
- {
1154
- "epoch": 0.45871559633027525,
1155
- "grad_norm": 0.83203125,
1156
- "learning_rate": 6.909830056250527e-05,
1157
- "loss": 8.7056,
1158
- "step": 1550
1159
- },
1160
- {
1161
- "epoch": 0.4616750517904706,
1162
- "grad_norm": 0.431640625,
1163
- "learning_rate": 6.784300610496048e-05,
1164
- "loss": 8.706,
1165
- "step": 1560
1166
- },
1167
- {
1168
- "epoch": 0.46463450725066585,
1169
- "grad_norm": 0.482421875,
1170
- "learning_rate": 6.65933381902329e-05,
1171
- "loss": 8.6888,
1172
- "step": 1570
1173
- },
1174
- {
1175
- "epoch": 0.4675939627108612,
1176
- "grad_norm": 0.67578125,
1177
- "learning_rate": 6.534951547402322e-05,
1178
- "loss": 8.7158,
1179
- "step": 1580
1180
- },
1181
- {
1182
- "epoch": 0.4705534181710565,
1183
- "grad_norm": 0.63671875,
1184
- "learning_rate": 6.411175558929152e-05,
1185
- "loss": 8.728,
1186
- "step": 1590
1187
- },
1188
- {
1189
- "epoch": 0.47351287363125183,
1190
- "grad_norm": 0.890625,
1191
- "learning_rate": 6.28802751081779e-05,
1192
- "loss": 8.729,
1193
- "step": 1600
1194
- },
1195
- {
1196
- "epoch": 0.47351287363125183,
1197
- "eval_loss": 8.696118354797363,
1198
- "eval_runtime": 19.8643,
1199
- "eval_samples_per_second": 75.613,
1200
- "eval_steps_per_second": 18.928,
1201
- "step": 1600
1202
- },
1203
- {
1204
- "epoch": 0.47647232909144716,
1205
- "grad_norm": 0.52734375,
1206
- "learning_rate": 6.165528950410884e-05,
1207
- "loss": 8.6937,
1208
- "step": 1610
1209
- },
1210
- {
1211
- "epoch": 0.4794317845516425,
1212
- "grad_norm": 0.486328125,
1213
- "learning_rate": 6.0437013114095195e-05,
1214
- "loss": 8.6631,
1215
- "step": 1620
1216
- },
1217
- {
1218
- "epoch": 0.4823912400118378,
1219
- "grad_norm": 0.51953125,
1220
- "learning_rate": 5.922565910122967e-05,
1221
- "loss": 8.696,
1222
- "step": 1630
1223
- },
1224
- {
1225
- "epoch": 0.48535069547203313,
1226
- "grad_norm": 0.75,
1227
- "learning_rate": 5.8021439417389444e-05,
1228
- "loss": 8.6176,
1229
- "step": 1640
1230
- },
1231
- {
1232
- "epoch": 0.48831015093222846,
1233
- "grad_norm": 0.984375,
1234
- "learning_rate": 5.6824564766150726e-05,
1235
- "loss": 8.7082,
1236
- "step": 1650
1237
- },
1238
- {
1239
- "epoch": 0.4912696063924238,
1240
- "grad_norm": 0.470703125,
1241
- "learning_rate": 5.563524456592163e-05,
1242
- "loss": 8.6952,
1243
- "step": 1660
1244
- },
1245
- {
1246
- "epoch": 0.4942290618526191,
1247
- "grad_norm": 0.5,
1248
- "learning_rate": 5.4453686913300074e-05,
1249
- "loss": 8.678,
1250
- "step": 1670
1251
- },
1252
- {
1253
- "epoch": 0.49718851731281444,
1254
- "grad_norm": 0.5625,
1255
- "learning_rate": 5.328009854666303e-05,
1256
- "loss": 8.6815,
1257
- "step": 1680
1258
- },
1259
- {
1260
- "epoch": 0.5001479727730098,
1261
- "grad_norm": 0.703125,
1262
- "learning_rate": 5.2114684809993044e-05,
1263
- "loss": 8.6626,
1264
- "step": 1690
1265
- },
1266
- {
1267
- "epoch": 0.5031074282332051,
1268
- "grad_norm": 1.0390625,
1269
- "learning_rate": 5.095764961694922e-05,
1270
- "loss": 8.7641,
1271
- "step": 1700
1272
- },
1273
- {
1274
- "epoch": 0.5060668836934004,
1275
- "grad_norm": 0.515625,
1276
- "learning_rate": 4.980919541518796e-05,
1277
- "loss": 8.6364,
1278
- "step": 1710
1279
- },
1280
- {
1281
- "epoch": 0.5090263391535957,
1282
- "grad_norm": 0.482421875,
1283
- "learning_rate": 4.866952315094088e-05,
1284
- "loss": 8.689,
1285
- "step": 1720
1286
- },
1287
- {
1288
- "epoch": 0.511985794613791,
1289
- "grad_norm": 0.51953125,
1290
- "learning_rate": 4.753883223385467e-05,
1291
- "loss": 8.7382,
1292
- "step": 1730
1293
- },
1294
- {
1295
- "epoch": 0.5149452500739864,
1296
- "grad_norm": 0.62109375,
1297
- "learning_rate": 4.6417320502100316e-05,
1298
- "loss": 8.6902,
1299
- "step": 1740
1300
- },
1301
- {
1302
- "epoch": 0.5179047055341817,
1303
- "grad_norm": 1.0859375,
1304
- "learning_rate": 4.530518418775733e-05,
1305
- "loss": 8.6841,
1306
- "step": 1750
1307
- },
1308
- {
1309
- "epoch": 0.520864160994377,
1310
- "grad_norm": 0.478515625,
1311
- "learning_rate": 4.4202617882478405e-05,
1312
- "loss": 8.708,
1313
- "step": 1760
1314
- },
1315
- {
1316
- "epoch": 0.5238236164545723,
1317
- "grad_norm": 0.486328125,
1318
- "learning_rate": 4.310981450344189e-05,
1319
- "loss": 8.6534,
1320
- "step": 1770
1321
- },
1322
- {
1323
- "epoch": 0.5267830719147677,
1324
- "grad_norm": 0.5234375,
1325
- "learning_rate": 4.2026965259596666e-05,
1326
- "loss": 8.6607,
1327
- "step": 1780
1328
- },
1329
- {
1330
- "epoch": 0.529742527374963,
1331
- "grad_norm": 0.8515625,
1332
- "learning_rate": 4.0954259618206295e-05,
1333
- "loss": 8.6611,
1334
- "step": 1790
1335
- },
1336
- {
1337
- "epoch": 0.5327019828351583,
1338
- "grad_norm": 1.8984375,
1339
- "learning_rate": 3.9891885271697496e-05,
1340
- "loss": 8.6325,
1341
- "step": 1800
1342
- },
1343
- {
1344
- "epoch": 0.5327019828351583,
1345
- "eval_loss": 8.694791793823242,
1346
- "eval_runtime": 14.7074,
1347
- "eval_samples_per_second": 102.126,
1348
- "eval_steps_per_second": 25.565,
1349
- "step": 1800
1350
- },
1351
- {
1352
- "epoch": 0.5356614382953536,
1353
- "grad_norm": 0.50390625,
1354
- "learning_rate": 3.884002810481958e-05,
1355
- "loss": 8.6837,
1356
- "step": 1810
1357
- },
1358
- {
1359
- "epoch": 0.538620893755549,
1360
- "grad_norm": 0.490234375,
1361
- "learning_rate": 3.779887216211995e-05,
1362
- "loss": 8.6631,
1363
- "step": 1820
1364
- },
1365
- {
1366
- "epoch": 0.5415803492157443,
1367
- "grad_norm": 0.5546875,
1368
- "learning_rate": 3.676859961574162e-05,
1369
- "loss": 8.6576,
1370
- "step": 1830
1371
- },
1372
- {
1373
- "epoch": 0.5445398046759397,
1374
- "grad_norm": 0.69921875,
1375
- "learning_rate": 3.574939073354838e-05,
1376
- "loss": 8.7047,
1377
- "step": 1840
1378
- },
1379
- {
1380
- "epoch": 0.5474992601361349,
1381
- "grad_norm": 1.0546875,
1382
- "learning_rate": 3.4741423847583134e-05,
1383
- "loss": 8.7234,
1384
- "step": 1850
1385
- },
1386
- {
1387
- "epoch": 0.5504587155963303,
1388
- "grad_norm": 0.48046875,
1389
- "learning_rate": 3.3744875322865034e-05,
1390
- "loss": 8.7565,
1391
- "step": 1860
1392
- },
1393
- {
1394
- "epoch": 0.5534181710565256,
1395
- "grad_norm": 0.4609375,
1396
- "learning_rate": 3.275991952653054e-05,
1397
- "loss": 8.6812,
1398
- "step": 1870
1399
- },
1400
- {
1401
- "epoch": 0.556377626516721,
1402
- "grad_norm": 0.54296875,
1403
- "learning_rate": 3.178672879732435e-05,
1404
- "loss": 8.7074,
1405
- "step": 1880
1406
- },
1407
- {
1408
- "epoch": 0.5593370819769162,
1409
- "grad_norm": 0.6875,
1410
- "learning_rate": 3.0825473415445074e-05,
1411
- "loss": 8.6826,
1412
- "step": 1890
1413
- },
1414
- {
1415
- "epoch": 0.5622965374371116,
1416
- "grad_norm": 1.3359375,
1417
- "learning_rate": 2.9876321572751144e-05,
1418
- "loss": 8.7359,
1419
- "step": 1900
1420
- },
1421
- {
1422
- "epoch": 0.5652559928973069,
1423
- "grad_norm": 0.49609375,
1424
- "learning_rate": 2.8939439343332086e-05,
1425
- "loss": 8.6599,
1426
- "step": 1910
1427
- },
1428
- {
1429
- "epoch": 0.5682154483575023,
1430
- "grad_norm": 0.53515625,
1431
- "learning_rate": 2.8014990654450325e-05,
1432
- "loss": 8.631,
1433
- "step": 1920
1434
- },
1435
- {
1436
- "epoch": 0.5711749038176975,
1437
- "grad_norm": 0.55859375,
1438
- "learning_rate": 2.7103137257858868e-05,
1439
- "loss": 8.6579,
1440
- "step": 1930
1441
- },
1442
- {
1443
- "epoch": 0.5741343592778929,
1444
- "grad_norm": 0.59765625,
1445
- "learning_rate": 2.6204038701499056e-05,
1446
- "loss": 8.7039,
1447
- "step": 1940
1448
- },
1449
- {
1450
- "epoch": 0.5770938147380882,
1451
- "grad_norm": 1.2109375,
1452
- "learning_rate": 2.5317852301584643e-05,
1453
- "loss": 8.6511,
1454
- "step": 1950
1455
- },
1456
- {
1457
- "epoch": 0.5800532701982836,
1458
- "grad_norm": 0.4921875,
1459
- "learning_rate": 2.4444733115075823e-05,
1460
- "loss": 8.6733,
1461
- "step": 1960
1462
- },
1463
- {
1464
- "epoch": 0.5830127256584788,
1465
- "grad_norm": 0.48828125,
1466
- "learning_rate": 2.3584833912548888e-05,
1467
- "loss": 8.6404,
1468
- "step": 1970
1469
- },
1470
- {
1471
- "epoch": 0.5859721811186742,
1472
- "grad_norm": 0.53515625,
1473
- "learning_rate": 2.2738305151465645e-05,
1474
- "loss": 8.6784,
1475
- "step": 1980
1476
- },
1477
- {
1478
- "epoch": 0.5889316365788695,
1479
- "grad_norm": 0.67578125,
1480
- "learning_rate": 2.190529494984782e-05,
1481
- "loss": 8.6897,
1482
- "step": 1990
1483
- },
1484
- {
1485
- "epoch": 0.5918910920390648,
1486
- "grad_norm": 1.3046875,
1487
- "learning_rate": 2.1085949060360654e-05,
1488
- "loss": 8.6338,
1489
- "step": 2000
1490
- },
1491
- {
1492
- "epoch": 0.5918910920390648,
1493
- "eval_loss": 8.694610595703125,
1494
- "eval_runtime": 15.9292,
1495
- "eval_samples_per_second": 94.292,
1496
- "eval_steps_per_second": 23.604,
1497
- "step": 2000
1498
- },
1499
- {
1500
- "epoch": 0.5948505474992601,
1501
- "grad_norm": 0.55078125,
1502
- "learning_rate": 2.0280410844810428e-05,
1503
- "loss": 8.6746,
1504
- "step": 2010
1505
- },
1506
- {
1507
- "epoch": 0.5978100029594554,
1508
- "grad_norm": 0.490234375,
1509
- "learning_rate": 1.9488821249060297e-05,
1510
- "loss": 8.7101,
1511
- "step": 2020
1512
- },
1513
- {
1514
- "epoch": 0.6007694584196508,
1515
- "grad_norm": 0.546875,
1516
- "learning_rate": 1.871131877836879e-05,
1517
- "loss": 8.6891,
1518
- "step": 2030
1519
- },
1520
- {
1521
- "epoch": 0.6037289138798461,
1522
- "grad_norm": 0.60546875,
1523
- "learning_rate": 1.7948039473155554e-05,
1524
- "loss": 8.6504,
1525
- "step": 2040
1526
- },
1527
- {
1528
- "epoch": 0.6066883693400414,
1529
- "grad_norm": 1.0390625,
1530
- "learning_rate": 1.7199116885197995e-05,
1531
- "loss": 8.7523,
1532
- "step": 2050
1533
- },
1534
- {
1535
- "epoch": 0.6096478248002367,
1536
- "grad_norm": 0.4921875,
1537
- "learning_rate": 1.646468205426377e-05,
1538
- "loss": 8.6832,
1539
- "step": 2060
1540
- },
1541
- {
1542
- "epoch": 0.6126072802604321,
1543
- "grad_norm": 0.466796875,
1544
- "learning_rate": 1.5744863485182537e-05,
1545
- "loss": 8.7104,
1546
- "step": 2070
1547
- },
1548
- {
1549
- "epoch": 0.6155667357206274,
1550
- "grad_norm": 0.55859375,
1551
- "learning_rate": 1.5039787125361326e-05,
1552
- "loss": 8.6838,
1553
- "step": 2080
1554
- },
1555
- {
1556
- "epoch": 0.6185261911808227,
1557
- "grad_norm": 0.7421875,
1558
- "learning_rate": 1.4349576342747462e-05,
1559
- "loss": 8.7315,
1560
- "step": 2090
1561
- },
1562
- {
1563
- "epoch": 0.621485646641018,
1564
- "grad_norm": 1.2109375,
1565
- "learning_rate": 1.3674351904242611e-05,
1566
- "loss": 8.622,
1567
- "step": 2100
1568
- },
1569
- {
1570
- "epoch": 0.6244451021012134,
1571
- "grad_norm": 0.4765625,
1572
- "learning_rate": 1.3014231954572287e-05,
1573
- "loss": 8.6708,
1574
- "step": 2110
1575
- },
1576
- {
1577
- "epoch": 0.6274045575614087,
1578
- "grad_norm": 0.6171875,
1579
- "learning_rate": 1.2369331995613665e-05,
1580
- "loss": 8.6699,
1581
- "step": 2120
1582
- },
1583
- {
1584
- "epoch": 0.630364013021604,
1585
- "grad_norm": 0.56640625,
1586
- "learning_rate": 1.173976486618631e-05,
1587
- "loss": 8.6815,
1588
- "step": 2130
1589
- },
1590
- {
1591
- "epoch": 0.6333234684817993,
1592
- "grad_norm": 0.69921875,
1593
- "learning_rate": 1.1125640722308628e-05,
1594
- "loss": 8.7564,
1595
- "step": 2140
1596
- },
1597
- {
1598
- "epoch": 0.6362829239419947,
1599
- "grad_norm": 1.1328125,
1600
- "learning_rate": 1.0527067017923654e-05,
1601
- "loss": 8.6414,
1602
- "step": 2150
1603
- },
1604
- {
1605
- "epoch": 0.63924237940219,
1606
- "grad_norm": 0.4609375,
1607
- "learning_rate": 9.944148486097793e-06,
1608
- "loss": 8.7073,
1609
- "step": 2160
1610
- },
1611
- {
1612
- "epoch": 0.6422018348623854,
1613
- "grad_norm": 0.45703125,
1614
- "learning_rate": 9.376987120695545e-06,
1615
- "loss": 8.6823,
1616
- "step": 2170
1617
- },
1618
- {
1619
- "epoch": 0.6451612903225806,
1620
- "grad_norm": 0.56640625,
1621
- "learning_rate": 8.825682158533554e-06,
1622
- "loss": 8.6332,
1623
- "step": 2180
1624
- },
1625
- {
1626
- "epoch": 0.648120745782776,
1627
- "grad_norm": 0.6953125,
1628
- "learning_rate": 8.290330062017016e-06,
1629
- "loss": 8.6951,
1630
- "step": 2190
1631
- },
1632
- {
1633
- "epoch": 0.6510802012429713,
1634
- "grad_norm": 0.95703125,
1635
- "learning_rate": 7.771024502261526e-06,
1636
- "loss": 8.7376,
1637
- "step": 2200
1638
- },
1639
- {
1640
- "epoch": 0.6510802012429713,
1641
- "eval_loss": 8.695413589477539,
1642
- "eval_runtime": 18.9893,
1643
- "eval_samples_per_second": 79.097,
1644
- "eval_steps_per_second": 19.801,
1645
- "step": 2200
1646
- },
1647
- {
1648
- "epoch": 0.6540396567031667,
1649
- "grad_norm": 0.455078125,
1650
- "learning_rate": 7.267856342703461e-06,
1651
- "loss": 8.714,
1652
- "step": 2210
1653
- },
1654
- {
1655
- "epoch": 0.6569991121633619,
1656
- "grad_norm": 0.52734375,
1657
- "learning_rate": 6.780913623201346e-06,
1658
- "loss": 8.6495,
1659
- "step": 2220
1660
- },
1661
- {
1662
- "epoch": 0.6599585676235573,
1663
- "grad_norm": 0.61328125,
1664
- "learning_rate": 6.310281544631546e-06,
1665
- "loss": 8.7043,
1666
- "step": 2230
1667
- },
1668
- {
1669
- "epoch": 0.6629180230837526,
1670
- "grad_norm": 0.62890625,
1671
- "learning_rate": 5.856042453980526e-06,
1672
- "loss": 8.6877,
1673
- "step": 2240
1674
- },
1675
- {
1676
- "epoch": 0.665877478543948,
1677
- "grad_norm": 1.25,
1678
- "learning_rate": 5.418275829936537e-06,
1679
- "loss": 8.6366,
1680
- "step": 2250
1681
- },
1682
- {
1683
- "epoch": 0.6688369340041432,
1684
- "grad_norm": 0.470703125,
1685
- "learning_rate": 4.997058268983135e-06,
1686
- "loss": 8.7548,
1687
- "step": 2260
1688
- },
1689
- {
1690
- "epoch": 0.6717963894643386,
1691
- "grad_norm": 0.53515625,
1692
- "learning_rate": 4.592463471997022e-06,
1693
- "loss": 8.6756,
1694
- "step": 2270
1695
- },
1696
- {
1697
- "epoch": 0.6747558449245339,
1698
- "grad_norm": 0.482421875,
1699
- "learning_rate": 4.204562231352516e-06,
1700
- "loss": 8.6466,
1701
- "step": 2280
1702
- },
1703
- {
1704
- "epoch": 0.6777153003847292,
1705
- "grad_norm": 0.58203125,
1706
- "learning_rate": 3.83342241853496e-06,
1707
- "loss": 8.6794,
1708
- "step": 2290
1709
- },
1710
- {
1711
- "epoch": 0.6806747558449245,
1712
- "grad_norm": 1.171875,
1713
- "learning_rate": 3.4791089722651436e-06,
1714
- "loss": 8.6301,
1715
- "step": 2300
1716
- },
1717
- {
1718
- "epoch": 0.6836342113051198,
1719
- "grad_norm": 0.546875,
1720
- "learning_rate": 3.1416838871368924e-06,
1721
- "loss": 8.6403,
1722
- "step": 2310
1723
- },
1724
- {
1725
- "epoch": 0.6865936667653152,
1726
- "grad_norm": 0.5,
1727
- "learning_rate": 2.821206202769899e-06,
1728
- "loss": 8.7124,
1729
- "step": 2320
1730
- },
1731
- {
1732
- "epoch": 0.6895531222255105,
1733
- "grad_norm": 0.765625,
1734
- "learning_rate": 2.5177319934794e-06,
1735
- "loss": 8.6445,
1736
- "step": 2330
1737
- },
1738
- {
1739
- "epoch": 0.6925125776857058,
1740
- "grad_norm": 0.6328125,
1741
- "learning_rate": 2.2313143584648423e-06,
1742
- "loss": 8.6972,
1743
- "step": 2340
1744
- },
1745
- {
1746
- "epoch": 0.6954720331459011,
1747
- "grad_norm": 1.0625,
1748
- "learning_rate": 1.9620034125190644e-06,
1749
- "loss": 8.9133,
1750
- "step": 2350
1751
- },
1752
- {
1753
- "epoch": 0.6984314886060965,
1754
- "grad_norm": 0.44921875,
1755
- "learning_rate": 1.7098462772596302e-06,
1756
- "loss": 8.6366,
1757
- "step": 2360
1758
- },
1759
- {
1760
- "epoch": 0.7013909440662918,
1761
- "grad_norm": 0.51953125,
1762
- "learning_rate": 1.4748870728839347e-06,
1763
- "loss": 8.6377,
1764
- "step": 2370
1765
- },
1766
- {
1767
- "epoch": 0.7043503995264871,
1768
- "grad_norm": 0.470703125,
1769
- "learning_rate": 1.2571669104494256e-06,
1770
- "loss": 8.6656,
1771
- "step": 2380
1772
- },
1773
- {
1774
- "epoch": 0.7073098549866824,
1775
- "grad_norm": 0.71484375,
1776
- "learning_rate": 1.0567238846803996e-06,
1777
- "loss": 8.6525,
1778
- "step": 2390
1779
- },
1780
- {
1781
- "epoch": 0.7102693104468778,
1782
- "grad_norm": 0.7421875,
1783
- "learning_rate": 8.735930673024806e-07,
1784
- "loss": 8.573,
1785
- "step": 2400
1786
- },
1787
- {
1788
- "epoch": 0.7102693104468778,
1789
- "eval_loss": 8.698899269104004,
1790
- "eval_runtime": 10.0141,
1791
- "eval_samples_per_second": 149.989,
1792
- "eval_steps_per_second": 37.547,
1793
- "step": 2400
1794
- },
1795
- {
1796
- "epoch": 0.7132287659070731,
1797
- "grad_norm": 0.4921875,
1798
- "learning_rate": 7.078065009060941e-07,
1799
- "loss": 8.6492,
1800
- "step": 2410
1801
- },
1802
- {
1803
- "epoch": 0.7161882213672685,
1804
- "grad_norm": 0.4765625,
1805
- "learning_rate": 5.593931933399854e-07,
1806
- "loss": 8.6976,
1807
- "step": 2420
1808
- },
1809
- {
1810
- "epoch": 0.7191476768274637,
1811
- "grad_norm": 0.515625,
1812
- "learning_rate": 4.2837911263562404e-07,
1813
- "loss": 8.6823,
1814
- "step": 2430
1815
- },
1816
- {
1817
- "epoch": 0.7221071322876591,
1818
- "grad_norm": 0.6875,
1819
- "learning_rate": 3.1478718246357173e-07,
1820
- "loss": 8.7054,
1821
- "step": 2440
1822
- },
1823
- {
1824
- "epoch": 0.7250665877478544,
1825
- "grad_norm": 1.28125,
1826
- "learning_rate": 2.1863727812254653e-07,
1827
- "loss": 8.81,
1828
- "step": 2450
1829
- },
1830
- {
1831
- "epoch": 0.7280260432080498,
1832
- "grad_norm": 0.427734375,
1833
- "learning_rate": 1.3994622306173765e-07,
1834
- "loss": 8.6948,
1835
- "step": 2460
1836
- },
1837
- {
1838
- "epoch": 0.730985498668245,
1839
- "grad_norm": 0.48046875,
1840
- "learning_rate": 7.872778593728258e-08,
1841
- "loss": 8.6751,
1842
- "step": 2470
1843
- },
1844
- {
1845
- "epoch": 0.7339449541284404,
1846
- "grad_norm": 0.59375,
1847
- "learning_rate": 3.499267820307184e-08,
1848
- "loss": 8.7253,
1849
- "step": 2480
1850
- },
1851
- {
1852
- "epoch": 0.7369044095886357,
1853
- "grad_norm": 0.60546875,
1854
- "learning_rate": 8.748552236603757e-09,
1855
- "loss": 8.7342,
1856
- "step": 2490
1857
- },
1858
- {
1859
- "epoch": 0.7398638650488311,
1860
- "grad_norm": 0.91796875,
1861
- "learning_rate": 0.0,
1862
- "loss": 8.645,
1863
- "step": 2500
1864
  }
1865
  ],
1866
  "logging_steps": 10,
1867
- "max_steps": 2500,
1868
  "num_input_tokens_seen": 0,
1869
- "num_train_epochs": 1,
1870
- "save_steps": 400,
1871
  "stateful_callbacks": {
1872
  "TrainerControl": {
1873
  "args": {
@@ -1875,13 +43,13 @@
1875
  "should_evaluate": false,
1876
  "should_log": false,
1877
  "should_save": true,
1878
- "should_training_stop": true
1879
  },
1880
  "attributes": {}
1881
  }
1882
  },
1883
- "total_flos": 64161175044096.0,
1884
- "train_batch_size": 4,
1885
  "trial_name": null,
1886
  "trial_params": null
1887
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.5384615384615383,
5
+ "eval_steps": 40,
6
+ "global_step": 20,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.07692307692307693,
13
+ "eval_loss": 10.376392364501953,
14
+ "eval_runtime": 5.3026,
15
+ "eval_samples_per_second": 283.067,
16
+ "eval_steps_per_second": 4.526,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.7692307692307693,
21
+ "grad_norm": 0.09619140625,
22
+ "learning_rate": 6.666666666666667e-05,
23
+ "loss": 10.378,
24
  "step": 10
25
  },
26
  {
27
+ "epoch": 1.5384615384615383,
28
+ "grad_norm": 0.09716796875,
29
+ "learning_rate": 0.00013333333333333334,
 
 
 
 
 
 
 
30
  "loss": 10.3754,
31
+ "step": 20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  }
33
  ],
34
  "logging_steps": 10,
35
+ "max_steps": 600,
36
  "num_input_tokens_seen": 0,
37
+ "num_train_epochs": 47,
38
+ "save_steps": 20,
39
  "stateful_callbacks": {
40
  "TrainerControl": {
41
  "args": {
 
43
  "should_evaluate": false,
44
  "should_log": false,
45
  "should_save": true,
46
+ "should_training_stop": false
47
  },
48
  "attributes": {}
49
  }
50
  },
51
+ "total_flos": 8183170990080.0,
52
+ "train_batch_size": 32,
53
  "trial_name": null,
54
  "trial_params": null
55
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7ab57c26802475df8b559ffa07b1995cecba20856adfc383ddb4700563cc1b9
3
  size 6904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:786399a4c033c92470f06cc2d0496bbd0810a0dfdf57aa0b5e9a8edf096bb52a
3
  size 6904