Zheng Li committed on
Commit
6e0b95b
·
verified ·
1 Parent(s): 0b8f593

End of training

Browse files
README.md CHANGED
@@ -3,6 +3,7 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: facebook/wav2vec2-base
5
  tags:
 
6
  - generated_from_trainer
7
  datasets:
8
  - superb
@@ -23,7 +24,7 @@ model-index:
23
  metrics:
24
  - name: Accuracy
25
  type: accuracy
26
- value: 0.9824948514268903
27
  ---
28
 
29
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -33,8 +34,8 @@ should probably proofread and complete it, then remove this comment. -->
33
 
34
  This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the superb dataset.
35
  It achieves the following results on the evaluation set:
36
- - Loss: 0.0926
37
- - Accuracy: 0.9825
38
 
39
  ## Model description
40
 
 
3
  license: apache-2.0
4
  base_model: facebook/wav2vec2-base
5
  tags:
6
+ - audio-classification
7
  - generated_from_trainer
8
  datasets:
9
  - superb
 
24
  metrics:
25
  - name: Accuracy
26
  type: accuracy
27
+ value: 0.9830832597822889
28
  ---
29
 
30
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
34
 
35
  This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the superb dataset.
36
  It achieves the following results on the evaluation set:
37
+ - Loss: 0.0956
38
+ - Accuracy: 0.9831
39
 
40
  ## Model description
41
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 4.989355040701315,
3
- "eval_accuracy": 0.9814651368049426,
4
- "eval_loss": 0.09822726994752884,
5
- "eval_runtime": 5.2666,
6
- "eval_samples_per_second": 1290.774,
7
- "eval_steps_per_second": 40.443,
8
- "total_flos": 2.357895379209216e+18,
9
- "train_loss": 0.5734889231528854,
10
- "train_runtime": 656.7578,
11
- "train_samples_per_second": 388.987,
12
- "train_steps_per_second": 3.038
13
  }
 
1
  {
2
+ "epoch": 7.996245306633291,
3
+ "eval_accuracy": 0.9830832597822889,
4
+ "eval_loss": 0.0956372618675232,
5
+ "eval_runtime": 5.6145,
6
+ "eval_samples_per_second": 1210.799,
7
+ "eval_steps_per_second": 37.938,
8
+ "total_flos": 3.777723239743488e+18,
9
+ "train_loss": 0.596273283347787,
10
+ "train_runtime": 640.7753,
11
+ "train_samples_per_second": 637.902,
12
+ "train_steps_per_second": 2.484
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.989355040701315,
3
- "eval_accuracy": 0.9814651368049426,
4
- "eval_loss": 0.09822726994752884,
5
- "eval_runtime": 5.2666,
6
- "eval_samples_per_second": 1290.774,
7
- "eval_steps_per_second": 40.443
8
  }
 
1
  {
2
+ "epoch": 7.996245306633291,
3
+ "eval_accuracy": 0.9830832597822889,
4
+ "eval_loss": 0.0956372618675232,
5
+ "eval_runtime": 5.6145,
6
+ "eval_samples_per_second": 1210.799,
7
+ "eval_steps_per_second": 37.938
8
  }
runs/May14_15-54-09_cs-Precision-7960-Tower/events.out.tfevents.1747253123.cs-Precision-7960-Tower.127892.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15845e83e76e501dc7170d7b668f0ca3730974474a167df2d31342c8c5348fb8
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.989355040701315,
3
- "total_flos": 2.357895379209216e+18,
4
- "train_loss": 0.5734889231528854,
5
- "train_runtime": 656.7578,
6
- "train_samples_per_second": 388.987,
7
- "train_steps_per_second": 3.038
8
  }
 
1
  {
2
+ "epoch": 7.996245306633291,
3
+ "total_flos": 3.777723239743488e+18,
4
+ "train_loss": 0.596273283347787,
5
+ "train_runtime": 640.7753,
6
+ "train_samples_per_second": 637.902,
7
+ "train_steps_per_second": 2.484
8
  }
trainer_state.json CHANGED
@@ -1,1465 +1,1212 @@
1
  {
2
- "best_metric": 0.9814651368049426,
3
- "best_model_checkpoint": "wav2vec2-base-ft-keyword-spotting/checkpoint-1995",
4
- "epoch": 4.989355040701315,
5
  "eval_steps": 500,
6
- "global_step": 1995,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.025046963055729492,
13
- "grad_norm": 2.144172430038452,
14
- "learning_rate": 1.5e-06,
15
- "loss": 3.8317,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.050093926111458985,
20
- "grad_norm": 3.0447957515716553,
21
- "learning_rate": 3e-06,
22
- "loss": 4.1331,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 0.07514088916718847,
27
- "grad_norm": 3.1870126724243164,
28
- "learning_rate": 4.5e-06,
29
- "loss": 4.0889,
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.10018785222291797,
34
- "grad_norm": 4.074451923370361,
35
- "learning_rate": 6e-06,
36
- "loss": 3.9025,
37
  "step": 40
38
  },
39
  {
40
- "epoch": 0.12523481527864747,
41
- "grad_norm": 5.182351112365723,
42
- "learning_rate": 7.5e-06,
43
- "loss": 3.6201,
44
  "step": 50
45
  },
46
  {
47
- "epoch": 0.15028177833437695,
48
- "grad_norm": 5.756130218505859,
49
- "learning_rate": 9e-06,
50
- "loss": 3.1977,
51
  "step": 60
52
  },
53
  {
54
- "epoch": 0.17532874139010646,
55
- "grad_norm": 5.65469217300415,
56
- "learning_rate": 1.05e-05,
57
- "loss": 2.7121,
58
  "step": 70
59
  },
60
  {
61
- "epoch": 0.20037570444583594,
62
- "grad_norm": 5.120871067047119,
63
- "learning_rate": 1.2e-05,
64
- "loss": 2.4593,
65
  "step": 80
66
  },
67
  {
68
- "epoch": 0.22542266750156542,
69
- "grad_norm": 4.952624320983887,
70
- "learning_rate": 1.3500000000000001e-05,
71
- "loss": 2.2216,
72
  "step": 90
73
  },
74
  {
75
- "epoch": 0.25046963055729493,
76
- "grad_norm": 4.202530384063721,
77
- "learning_rate": 1.5e-05,
78
- "loss": 2.0977,
79
  "step": 100
80
  },
81
  {
82
- "epoch": 0.27551659361302444,
83
- "grad_norm": 3.235758066177368,
84
- "learning_rate": 1.65e-05,
85
- "loss": 2.0442,
86
  "step": 110
87
  },
88
  {
89
- "epoch": 0.3005635566687539,
90
- "grad_norm": 3.456002712249756,
91
- "learning_rate": 1.8e-05,
92
- "loss": 1.8601,
93
  "step": 120
94
  },
95
  {
96
- "epoch": 0.3256105197244834,
97
- "grad_norm": 1.5474969148635864,
98
- "learning_rate": 1.95e-05,
99
- "loss": 1.7964,
100
  "step": 130
101
  },
102
  {
103
- "epoch": 0.3506574827802129,
104
- "grad_norm": 1.7901959419250488,
105
- "learning_rate": 2.1e-05,
106
- "loss": 1.8098,
107
  "step": 140
108
  },
109
  {
110
- "epoch": 0.37570444583594237,
111
- "grad_norm": 1.2036372423171997,
112
- "learning_rate": 2.25e-05,
113
- "loss": 1.7429,
114
  "step": 150
115
  },
116
  {
117
- "epoch": 0.4007514088916719,
118
- "grad_norm": 0.6613264083862305,
119
- "learning_rate": 2.4e-05,
120
- "loss": 1.7313,
121
  "step": 160
122
  },
123
  {
124
- "epoch": 0.4257983719474014,
125
- "grad_norm": 1.468883991241455,
126
- "learning_rate": 2.55e-05,
127
- "loss": 1.7528,
128
  "step": 170
129
  },
130
  {
131
- "epoch": 0.45084533500313084,
132
- "grad_norm": 2.4236254692077637,
133
- "learning_rate": 2.7000000000000002e-05,
134
- "loss": 1.642,
135
  "step": 180
136
  },
137
  {
138
- "epoch": 0.47589229805886035,
139
- "grad_norm": 10.500153541564941,
140
- "learning_rate": 2.8499999999999998e-05,
141
- "loss": 1.6462,
142
  "step": 190
143
  },
144
  {
145
- "epoch": 0.5009392611145899,
146
- "grad_norm": 8.173652648925781,
147
- "learning_rate": 3e-05,
148
- "loss": 1.6049,
 
 
 
 
 
 
 
 
 
149
  "step": 200
150
  },
151
  {
152
- "epoch": 0.5259862241703194,
153
- "grad_norm": 4.767404556274414,
154
- "learning_rate": 2.9832869080779945e-05,
155
- "loss": 1.5754,
156
  "step": 210
157
  },
158
  {
159
- "epoch": 0.5510331872260489,
160
- "grad_norm": 2.0099925994873047,
161
- "learning_rate": 2.9665738161559886e-05,
162
- "loss": 1.5001,
163
  "step": 220
164
  },
165
  {
166
- "epoch": 0.5760801502817783,
167
- "grad_norm": 2.7814888954162598,
168
- "learning_rate": 2.9498607242339834e-05,
169
- "loss": 1.4049,
170
  "step": 230
171
  },
172
  {
173
- "epoch": 0.6011271133375078,
174
- "grad_norm": 4.7790846824646,
175
- "learning_rate": 2.933147632311978e-05,
176
- "loss": 1.5321,
177
  "step": 240
178
  },
179
  {
180
- "epoch": 0.6261740763932373,
181
- "grad_norm": 3.758580207824707,
182
- "learning_rate": 2.916434540389972e-05,
183
- "loss": 1.4002,
184
  "step": 250
185
  },
186
  {
187
- "epoch": 0.6512210394489668,
188
- "grad_norm": 5.455554008483887,
189
- "learning_rate": 2.8997214484679665e-05,
190
- "loss": 1.4013,
191
  "step": 260
192
  },
193
  {
194
- "epoch": 0.6762680025046963,
195
- "grad_norm": 5.183338165283203,
196
- "learning_rate": 2.8830083565459613e-05,
197
- "loss": 1.2016,
198
  "step": 270
199
  },
200
  {
201
- "epoch": 0.7013149655604258,
202
- "grad_norm": 3.5465261936187744,
203
- "learning_rate": 2.8662952646239554e-05,
204
- "loss": 1.1541,
205
  "step": 280
206
  },
207
  {
208
- "epoch": 0.7263619286161553,
209
- "grad_norm": 8.002464294433594,
210
- "learning_rate": 2.84958217270195e-05,
211
- "loss": 1.1754,
212
  "step": 290
213
  },
214
  {
215
- "epoch": 0.7514088916718847,
216
- "grad_norm": 4.145716190338135,
217
- "learning_rate": 2.8328690807799443e-05,
218
- "loss": 1.1651,
219
  "step": 300
220
  },
221
  {
222
- "epoch": 0.7764558547276142,
223
- "grad_norm": 8.420044898986816,
224
- "learning_rate": 2.8161559888579388e-05,
225
- "loss": 1.2086,
226
  "step": 310
227
  },
228
  {
229
- "epoch": 0.8015028177833438,
230
- "grad_norm": 2.530792713165283,
231
- "learning_rate": 2.7994428969359332e-05,
232
- "loss": 1.1337,
233
  "step": 320
234
  },
235
  {
236
- "epoch": 0.8265497808390733,
237
- "grad_norm": 3.45489501953125,
238
- "learning_rate": 2.7827298050139277e-05,
239
- "loss": 1.0499,
240
  "step": 330
241
  },
242
  {
243
- "epoch": 0.8515967438948028,
244
- "grad_norm": 5.169933795928955,
245
- "learning_rate": 2.7660167130919218e-05,
246
- "loss": 1.1298,
247
  "step": 340
248
  },
249
  {
250
- "epoch": 0.8766437069505323,
251
- "grad_norm": 5.91841459274292,
252
- "learning_rate": 2.7493036211699166e-05,
253
- "loss": 1.1476,
254
  "step": 350
255
  },
256
  {
257
- "epoch": 0.9016906700062617,
258
- "grad_norm": 4.016351699829102,
259
- "learning_rate": 2.732590529247911e-05,
260
- "loss": 1.0944,
261
  "step": 360
262
  },
263
  {
264
- "epoch": 0.9267376330619912,
265
- "grad_norm": 5.553752899169922,
266
- "learning_rate": 2.7158774373259055e-05,
267
- "loss": 1.0822,
268
  "step": 370
269
  },
270
  {
271
- "epoch": 0.9517845961177207,
272
- "grad_norm": 4.519126892089844,
273
- "learning_rate": 2.7008356545961002e-05,
274
- "loss": 1.0057,
275
  "step": 380
276
  },
277
  {
278
- "epoch": 0.9768315591734502,
279
- "grad_norm": 2.7625792026519775,
280
- "learning_rate": 2.6841225626740946e-05,
281
- "loss": 1.009,
282
  "step": 390
283
  },
284
  {
285
- "epoch": 1.0,
286
- "grad_norm": 1.0362297296524048,
287
- "learning_rate": 2.6674094707520894e-05,
288
- "loss": 0.8747,
289
- "step": 400
 
 
290
  },
291
  {
292
- "epoch": 1.0,
293
- "eval_accuracy": 0.9340982641953516,
294
- "eval_loss": 0.806678831577301,
295
- "eval_runtime": 5.088,
296
- "eval_samples_per_second": 1336.08,
297
- "eval_steps_per_second": 41.863,
298
  "step": 400
299
  },
300
  {
301
- "epoch": 1.0250469630557295,
302
- "grad_norm": 7.034424304962158,
303
- "learning_rate": 2.6506963788300836e-05,
304
- "loss": 0.8779,
305
  "step": 410
306
  },
307
  {
308
- "epoch": 1.050093926111459,
309
- "grad_norm": 5.3810014724731445,
310
- "learning_rate": 2.633983286908078e-05,
311
- "loss": 0.7878,
312
  "step": 420
313
  },
314
  {
315
- "epoch": 1.0751408891671885,
316
- "grad_norm": 3.344200372695923,
317
- "learning_rate": 2.6172701949860725e-05,
318
- "loss": 0.762,
319
  "step": 430
320
  },
321
  {
322
- "epoch": 1.100187852222918,
323
- "grad_norm": 6.402628421783447,
324
- "learning_rate": 2.600557103064067e-05,
325
- "loss": 0.7631,
326
  "step": 440
327
  },
328
  {
329
- "epoch": 1.1252348152786475,
330
- "grad_norm": 4.863397121429443,
331
- "learning_rate": 2.5838440111420614e-05,
332
- "loss": 0.6772,
333
  "step": 450
334
  },
335
  {
336
- "epoch": 1.150281778334377,
337
- "grad_norm": 6.484178066253662,
338
- "learning_rate": 2.567130919220056e-05,
339
- "loss": 0.6461,
340
  "step": 460
341
  },
342
  {
343
- "epoch": 1.1753287413901066,
344
- "grad_norm": 3.2635133266448975,
345
- "learning_rate": 2.55041782729805e-05,
346
- "loss": 0.5984,
347
  "step": 470
348
  },
349
  {
350
- "epoch": 1.2003757044458359,
351
- "grad_norm": 2.40271258354187,
352
- "learning_rate": 2.5337047353760448e-05,
353
- "loss": 0.606,
354
  "step": 480
355
  },
356
  {
357
- "epoch": 1.2254226675015654,
358
- "grad_norm": 2.1505284309387207,
359
- "learning_rate": 2.5169916434540392e-05,
360
- "loss": 0.5553,
361
  "step": 490
362
  },
363
  {
364
- "epoch": 1.2504696305572949,
365
- "grad_norm": 2.476496934890747,
366
- "learning_rate": 2.5002785515320333e-05,
367
- "loss": 0.5238,
368
  "step": 500
369
  },
370
  {
371
- "epoch": 1.2755165936130244,
372
- "grad_norm": 4.137519359588623,
373
- "learning_rate": 2.4835654596100278e-05,
374
- "loss": 0.5148,
375
  "step": 510
376
  },
377
  {
378
- "epoch": 1.300563556668754,
379
- "grad_norm": 5.227903842926025,
380
- "learning_rate": 2.4668523676880226e-05,
381
- "loss": 0.4863,
382
  "step": 520
383
  },
384
  {
385
- "epoch": 1.3256105197244834,
386
- "grad_norm": 4.056149482727051,
387
- "learning_rate": 2.4501392757660167e-05,
388
- "loss": 0.49,
389
  "step": 530
390
  },
391
  {
392
- "epoch": 1.350657482780213,
393
- "grad_norm": 6.162842273712158,
394
- "learning_rate": 2.433426183844011e-05,
395
- "loss": 0.4264,
396
  "step": 540
397
  },
398
  {
399
- "epoch": 1.3757044458359424,
400
- "grad_norm": 4.439515113830566,
401
- "learning_rate": 2.4167130919220056e-05,
402
- "loss": 0.4449,
403
  "step": 550
404
  },
405
  {
406
- "epoch": 1.400751408891672,
407
- "grad_norm": 4.906720161437988,
408
- "learning_rate": 2.4e-05,
409
- "loss": 0.408,
410
  "step": 560
411
  },
412
  {
413
- "epoch": 1.4257983719474014,
414
- "grad_norm": 5.718549728393555,
415
- "learning_rate": 2.3832869080779945e-05,
416
- "loss": 0.4143,
417
  "step": 570
418
  },
419
  {
420
- "epoch": 1.4508453350031307,
421
- "grad_norm": 3.9077138900756836,
422
- "learning_rate": 2.366573816155989e-05,
423
- "loss": 0.3716,
424
  "step": 580
425
  },
426
  {
427
- "epoch": 1.4758922980588602,
428
- "grad_norm": 3.5204200744628906,
429
- "learning_rate": 2.349860724233983e-05,
430
- "loss": 0.3932,
431
  "step": 590
432
  },
433
  {
434
- "epoch": 1.5009392611145898,
435
- "grad_norm": 5.870133399963379,
436
- "learning_rate": 2.333147632311978e-05,
437
- "loss": 0.3932,
 
 
 
 
 
 
 
 
 
438
  "step": 600
439
  },
440
  {
441
- "epoch": 1.5259862241703193,
442
- "grad_norm": 5.287498950958252,
443
- "learning_rate": 2.3164345403899723e-05,
444
- "loss": 0.4031,
445
  "step": 610
446
  },
447
  {
448
- "epoch": 1.5510331872260488,
449
- "grad_norm": 5.271251201629639,
450
- "learning_rate": 2.2997214484679665e-05,
451
- "loss": 0.3467,
452
  "step": 620
453
  },
454
  {
455
- "epoch": 1.5760801502817783,
456
- "grad_norm": 5.845817565917969,
457
- "learning_rate": 2.283008356545961e-05,
458
- "loss": 0.3593,
459
  "step": 630
460
  },
461
  {
462
- "epoch": 1.6011271133375078,
463
- "grad_norm": 3.02872896194458,
464
- "learning_rate": 2.2662952646239557e-05,
465
- "loss": 0.3535,
466
  "step": 640
467
  },
468
  {
469
- "epoch": 1.6261740763932373,
470
- "grad_norm": 2.3705966472625732,
471
- "learning_rate": 2.2495821727019498e-05,
472
- "loss": 0.3313,
473
  "step": 650
474
  },
475
  {
476
- "epoch": 1.6512210394489668,
477
- "grad_norm": 3.9336166381835938,
478
- "learning_rate": 2.2328690807799443e-05,
479
- "loss": 0.374,
480
  "step": 660
481
  },
482
  {
483
- "epoch": 1.6762680025046963,
484
- "grad_norm": 6.896333694458008,
485
- "learning_rate": 2.2161559888579387e-05,
486
- "loss": 0.359,
487
  "step": 670
488
  },
489
  {
490
- "epoch": 1.7013149655604258,
491
- "grad_norm": 3.1803808212280273,
492
- "learning_rate": 2.1994428969359335e-05,
493
- "loss": 0.3215,
494
  "step": 680
495
  },
496
  {
497
- "epoch": 1.7263619286161553,
498
- "grad_norm": 3.6727025508880615,
499
- "learning_rate": 2.1827298050139276e-05,
500
- "loss": 0.3313,
501
  "step": 690
502
  },
503
  {
504
- "epoch": 1.7514088916718848,
505
- "grad_norm": 4.481452941894531,
506
- "learning_rate": 2.166016713091922e-05,
507
- "loss": 0.3075,
508
  "step": 700
509
  },
510
  {
511
- "epoch": 1.7764558547276144,
512
- "grad_norm": 4.977258205413818,
513
- "learning_rate": 2.1493036211699166e-05,
514
- "loss": 0.2814,
515
  "step": 710
516
  },
517
  {
518
- "epoch": 1.8015028177833439,
519
- "grad_norm": 4.018652439117432,
520
- "learning_rate": 2.1325905292479107e-05,
521
- "loss": 0.2902,
522
  "step": 720
523
  },
524
  {
525
- "epoch": 1.8265497808390734,
526
- "grad_norm": 6.154052734375,
527
- "learning_rate": 2.1158774373259055e-05,
528
- "loss": 0.3123,
529
  "step": 730
530
  },
531
  {
532
- "epoch": 1.8515967438948029,
533
- "grad_norm": 3.7956981658935547,
534
- "learning_rate": 2.0991643454039e-05,
535
- "loss": 0.3119,
536
  "step": 740
537
  },
538
  {
539
- "epoch": 1.8766437069505324,
540
- "grad_norm": 6.320951461791992,
541
- "learning_rate": 2.082451253481894e-05,
542
- "loss": 0.2874,
543
  "step": 750
544
  },
545
  {
546
- "epoch": 1.9016906700062617,
547
- "grad_norm": 2.6970086097717285,
548
- "learning_rate": 2.0657381615598885e-05,
549
- "loss": 0.2889,
550
  "step": 760
551
  },
552
  {
553
- "epoch": 1.9267376330619912,
554
- "grad_norm": 4.386446952819824,
555
- "learning_rate": 2.0490250696378833e-05,
556
- "loss": 0.2529,
557
  "step": 770
558
  },
559
  {
560
- "epoch": 1.9517845961177207,
561
- "grad_norm": 5.870710372924805,
562
- "learning_rate": 2.0323119777158774e-05,
563
- "loss": 0.2819,
564
  "step": 780
565
  },
566
  {
567
- "epoch": 1.9768315591734502,
568
- "grad_norm": 5.930877685546875,
569
- "learning_rate": 2.015598885793872e-05,
570
- "loss": 0.2725,
571
  "step": 790
572
  },
573
  {
574
- "epoch": 2.0,
575
- "grad_norm": 0.3753320872783661,
576
- "learning_rate": 1.9988857938718663e-05,
577
- "loss": 0.2332,
578
- "step": 800
 
 
579
  },
580
  {
581
- "epoch": 2.0,
582
- "eval_accuracy": 0.9745513386290086,
583
- "eval_loss": 0.17788007855415344,
584
- "eval_runtime": 4.7999,
585
- "eval_samples_per_second": 1416.28,
586
- "eval_steps_per_second": 44.376,
587
  "step": 800
588
  },
589
  {
590
- "epoch": 2.0250469630557295,
591
- "grad_norm": 6.34937047958374,
592
- "learning_rate": 1.9821727019498608e-05,
593
- "loss": 0.2545,
594
  "step": 810
595
  },
596
  {
597
- "epoch": 2.050093926111459,
598
- "grad_norm": 4.1271138191223145,
599
- "learning_rate": 1.9654596100278552e-05,
600
- "loss": 0.2497,
601
  "step": 820
602
  },
603
  {
604
- "epoch": 2.0751408891671885,
605
- "grad_norm": 5.419626712799072,
606
- "learning_rate": 1.9487465181058497e-05,
607
- "loss": 0.2502,
608
  "step": 830
609
  },
610
  {
611
- "epoch": 2.100187852222918,
612
- "grad_norm": 6.219632148742676,
613
- "learning_rate": 1.9320334261838438e-05,
614
- "loss": 0.2492,
615
  "step": 840
616
  },
617
  {
618
- "epoch": 2.1252348152786475,
619
- "grad_norm": 6.462090492248535,
620
- "learning_rate": 1.9153203342618386e-05,
621
- "loss": 0.2517,
622
  "step": 850
623
  },
624
  {
625
- "epoch": 2.150281778334377,
626
- "grad_norm": 3.6385743618011475,
627
- "learning_rate": 1.898607242339833e-05,
628
- "loss": 0.2393,
629
  "step": 860
630
  },
631
  {
632
- "epoch": 2.1753287413901066,
633
- "grad_norm": 4.627376556396484,
634
- "learning_rate": 1.8818941504178272e-05,
635
- "loss": 0.2414,
636
  "step": 870
637
  },
638
  {
639
- "epoch": 2.200375704445836,
640
- "grad_norm": 5.165160179138184,
641
- "learning_rate": 1.8651810584958216e-05,
642
- "loss": 0.2357,
643
  "step": 880
644
  },
645
  {
646
- "epoch": 2.2254226675015656,
647
- "grad_norm": 1.4684484004974365,
648
- "learning_rate": 1.8484679665738164e-05,
649
- "loss": 0.2548,
650
  "step": 890
651
  },
652
  {
653
- "epoch": 2.250469630557295,
654
- "grad_norm": 3.5594701766967773,
655
- "learning_rate": 1.8317548746518105e-05,
656
- "loss": 0.2403,
657
  "step": 900
658
  },
659
  {
660
- "epoch": 2.2755165936130246,
661
- "grad_norm": 3.314188003540039,
662
- "learning_rate": 1.815041782729805e-05,
663
- "loss": 0.2788,
664
  "step": 910
665
  },
666
  {
667
- "epoch": 2.300563556668754,
668
- "grad_norm": 2.3456945419311523,
669
- "learning_rate": 1.7983286908077995e-05,
670
- "loss": 0.2707,
671
  "step": 920
672
  },
673
  {
674
- "epoch": 2.325610519724483,
675
- "grad_norm": 3.4486682415008545,
676
- "learning_rate": 1.781615598885794e-05,
677
- "loss": 0.2304,
678
  "step": 930
679
  },
680
  {
681
- "epoch": 2.350657482780213,
682
- "grad_norm": 3.3779501914978027,
683
- "learning_rate": 1.7649025069637884e-05,
684
- "loss": 0.2318,
685
  "step": 940
686
  },
687
  {
688
- "epoch": 2.375704445835942,
689
- "grad_norm": 1.7540189027786255,
690
- "learning_rate": 1.7481894150417828e-05,
691
- "loss": 0.2125,
692
  "step": 950
693
  },
694
  {
695
- "epoch": 2.4007514088916717,
696
- "grad_norm": 8.057242393493652,
697
- "learning_rate": 1.731476323119777e-05,
698
- "loss": 0.2407,
699
  "step": 960
700
  },
701
  {
702
- "epoch": 2.425798371947401,
703
- "grad_norm": 3.5279042720794678,
704
- "learning_rate": 1.7147632311977717e-05,
705
- "loss": 0.2432,
706
  "step": 970
707
  },
708
  {
709
- "epoch": 2.4508453350031307,
710
- "grad_norm": 4.324340343475342,
711
- "learning_rate": 1.6980501392757662e-05,
712
- "loss": 0.2316,
713
  "step": 980
714
  },
715
  {
716
- "epoch": 2.4758922980588602,
717
- "grad_norm": 3.12505841255188,
718
- "learning_rate": 1.6813370473537606e-05,
719
- "loss": 0.2226,
720
  "step": 990
721
  },
722
  {
723
- "epoch": 2.5009392611145898,
724
- "grad_norm": 3.6332638263702393,
725
- "learning_rate": 1.6646239554317548e-05,
726
- "loss": 0.2305,
 
 
 
 
 
 
 
 
 
727
  "step": 1000
728
  },
729
  {
730
- "epoch": 2.5259862241703193,
731
- "grad_norm": 3.011505126953125,
732
- "learning_rate": 1.6479108635097496e-05,
733
- "loss": 0.2361,
734
  "step": 1010
735
  },
736
  {
737
- "epoch": 2.5510331872260488,
738
- "grad_norm": 3.6663644313812256,
739
- "learning_rate": 1.631197771587744e-05,
740
- "loss": 0.2398,
741
  "step": 1020
742
  },
743
  {
744
- "epoch": 2.5760801502817783,
745
- "grad_norm": 3.6134729385375977,
746
- "learning_rate": 1.614484679665738e-05,
747
- "loss": 0.1777,
748
  "step": 1030
749
  },
750
  {
751
- "epoch": 2.601127113337508,
752
- "grad_norm": 3.180910587310791,
753
- "learning_rate": 1.5977715877437326e-05,
754
- "loss": 0.2304,
755
  "step": 1040
756
  },
757
  {
758
- "epoch": 2.6261740763932373,
759
- "grad_norm": 2.667623281478882,
760
- "learning_rate": 1.581058495821727e-05,
761
- "loss": 0.1787,
762
  "step": 1050
763
  },
764
  {
765
- "epoch": 2.651221039448967,
766
- "grad_norm": 6.972463130950928,
767
- "learning_rate": 1.5643454038997215e-05,
768
- "loss": 0.2026,
769
  "step": 1060
770
  },
771
  {
772
- "epoch": 2.6762680025046963,
773
- "grad_norm": 1.4821382761001587,
774
- "learning_rate": 1.547632311977716e-05,
775
- "loss": 0.2295,
776
  "step": 1070
777
  },
778
  {
779
- "epoch": 2.701314965560426,
780
- "grad_norm": 3.339320421218872,
781
- "learning_rate": 1.5309192200557104e-05,
782
- "loss": 0.2234,
783
  "step": 1080
784
  },
785
  {
786
- "epoch": 2.7263619286161553,
787
- "grad_norm": 3.3274917602539062,
788
- "learning_rate": 1.5142061281337047e-05,
789
- "loss": 0.1899,
790
  "step": 1090
791
  },
792
  {
793
- "epoch": 2.751408891671885,
794
- "grad_norm": 3.077637195587158,
795
- "learning_rate": 1.4974930362116992e-05,
796
- "loss": 0.1747,
797
  "step": 1100
798
  },
799
  {
800
- "epoch": 2.7764558547276144,
801
- "grad_norm": 4.980368137359619,
802
- "learning_rate": 1.4807799442896936e-05,
803
- "loss": 0.2359,
804
  "step": 1110
805
  },
806
  {
807
- "epoch": 2.801502817783344,
808
- "grad_norm": 3.46724534034729,
809
- "learning_rate": 1.464066852367688e-05,
810
- "loss": 0.1945,
811
  "step": 1120
812
  },
813
  {
814
- "epoch": 2.8265497808390734,
815
- "grad_norm": 6.3585710525512695,
816
- "learning_rate": 1.4473537604456825e-05,
817
- "loss": 0.2356,
818
  "step": 1130
819
  },
820
  {
821
- "epoch": 2.851596743894803,
822
- "grad_norm": 3.6611506938934326,
823
- "learning_rate": 1.4306406685236768e-05,
824
- "loss": 0.223,
825
  "step": 1140
826
  },
827
  {
828
- "epoch": 2.8766437069505324,
829
- "grad_norm": 3.0758209228515625,
830
- "learning_rate": 1.4139275766016714e-05,
831
- "loss": 0.2675,
832
  "step": 1150
833
  },
834
  {
835
- "epoch": 2.9016906700062615,
836
- "grad_norm": 3.0930421352386475,
837
- "learning_rate": 1.3972144846796657e-05,
838
- "loss": 0.2342,
839
  "step": 1160
840
  },
841
  {
842
- "epoch": 2.9267376330619914,
843
- "grad_norm": 3.915057897567749,
844
- "learning_rate": 1.3805013927576602e-05,
845
- "loss": 0.2089,
846
  "step": 1170
847
  },
848
  {
849
- "epoch": 2.9517845961177205,
850
- "grad_norm": 3.755885601043701,
851
- "learning_rate": 1.3637883008356546e-05,
852
- "loss": 0.2251,
853
  "step": 1180
854
  },
855
  {
856
- "epoch": 2.9768315591734504,
857
- "grad_norm": 3.1619045734405518,
858
- "learning_rate": 1.3470752089136491e-05,
859
- "loss": 0.1927,
860
  "step": 1190
861
  },
862
  {
863
- "epoch": 3.0,
864
- "grad_norm": 3.681858539581299,
865
- "learning_rate": 1.3303621169916434e-05,
866
- "loss": 0.217,
867
- "step": 1200
 
 
868
  },
869
  {
870
- "epoch": 3.0,
871
- "eval_accuracy": 0.9766107678729038,
872
- "eval_loss": 0.1263045072555542,
873
- "eval_runtime": 4.7521,
874
- "eval_samples_per_second": 1430.516,
875
- "eval_steps_per_second": 44.822,
876
  "step": 1200
877
  },
878
  {
879
- "epoch": 3.0250469630557295,
880
- "grad_norm": 4.34127140045166,
881
- "learning_rate": 1.313649025069638e-05,
882
- "loss": 0.2222,
883
  "step": 1210
884
  },
885
  {
886
- "epoch": 3.050093926111459,
887
- "grad_norm": 2.2270660400390625,
888
- "learning_rate": 1.2969359331476323e-05,
889
- "loss": 0.1822,
890
  "step": 1220
891
  },
892
  {
893
- "epoch": 3.0751408891671885,
894
- "grad_norm": 3.879969358444214,
895
- "learning_rate": 1.2802228412256267e-05,
896
- "loss": 0.1947,
897
  "step": 1230
898
  },
899
  {
900
- "epoch": 3.100187852222918,
901
- "grad_norm": 4.284245491027832,
902
- "learning_rate": 1.2635097493036212e-05,
903
- "loss": 0.2126,
904
  "step": 1240
905
  },
906
  {
907
- "epoch": 3.1252348152786475,
908
- "grad_norm": 3.0802762508392334,
909
- "learning_rate": 1.2467966573816157e-05,
910
- "loss": 0.1954,
911
  "step": 1250
912
  },
913
  {
914
- "epoch": 3.150281778334377,
915
- "grad_norm": 4.699860095977783,
916
- "learning_rate": 1.2300835654596101e-05,
917
- "loss": 0.2189,
918
  "step": 1260
919
  },
920
  {
921
- "epoch": 3.1753287413901066,
922
- "grad_norm": 4.755823612213135,
923
- "learning_rate": 1.2133704735376046e-05,
924
- "loss": 0.2019,
925
  "step": 1270
926
  },
927
  {
928
- "epoch": 3.200375704445836,
929
- "grad_norm": 4.778765678405762,
930
- "learning_rate": 1.1966573816155989e-05,
931
- "loss": 0.1987,
932
  "step": 1280
933
  },
934
  {
935
- "epoch": 3.2254226675015656,
936
- "grad_norm": 4.719218730926514,
937
- "learning_rate": 1.1799442896935935e-05,
938
- "loss": 0.1947,
939
  "step": 1290
940
  },
941
  {
942
- "epoch": 3.250469630557295,
943
- "grad_norm": 4.547497272491455,
944
- "learning_rate": 1.1632311977715878e-05,
945
- "loss": 0.2097,
946
  "step": 1300
947
  },
948
  {
949
- "epoch": 3.2755165936130246,
950
- "grad_norm": 2.1130096912384033,
951
- "learning_rate": 1.1465181058495822e-05,
952
- "loss": 0.1327,
953
  "step": 1310
954
  },
955
  {
956
- "epoch": 3.300563556668754,
957
- "grad_norm": 4.512012958526611,
958
- "learning_rate": 1.1298050139275767e-05,
959
- "loss": 0.178,
960
  "step": 1320
961
  },
962
  {
963
- "epoch": 3.325610519724483,
964
- "grad_norm": 2.9694018363952637,
965
- "learning_rate": 1.1130919220055711e-05,
966
- "loss": 0.2077,
967
  "step": 1330
968
  },
969
  {
970
- "epoch": 3.350657482780213,
971
- "grad_norm": 2.5430564880371094,
972
- "learning_rate": 1.0963788300835654e-05,
973
- "loss": 0.1774,
974
  "step": 1340
975
  },
976
  {
977
- "epoch": 3.375704445835942,
978
- "grad_norm": 5.131649971008301,
979
- "learning_rate": 1.0796657381615599e-05,
980
- "loss": 0.2013,
981
  "step": 1350
982
  },
983
  {
984
- "epoch": 3.4007514088916717,
985
- "grad_norm": 3.391754627227783,
986
- "learning_rate": 1.0629526462395543e-05,
987
- "loss": 0.1772,
988
  "step": 1360
989
  },
990
  {
991
- "epoch": 3.425798371947401,
992
- "grad_norm": 3.4772632122039795,
993
- "learning_rate": 1.0462395543175486e-05,
994
- "loss": 0.1597,
995
  "step": 1370
996
  },
997
  {
998
- "epoch": 3.4508453350031307,
999
- "grad_norm": 3.5944013595581055,
1000
- "learning_rate": 1.0295264623955432e-05,
1001
- "loss": 0.2192,
1002
  "step": 1380
1003
  },
1004
  {
1005
- "epoch": 3.4758922980588602,
1006
- "grad_norm": 2.526901960372925,
1007
- "learning_rate": 1.0128133704735375e-05,
1008
- "loss": 0.168,
1009
  "step": 1390
1010
  },
1011
  {
1012
- "epoch": 3.5009392611145898,
1013
- "grad_norm": 3.234485626220703,
1014
- "learning_rate": 9.961002785515322e-06,
1015
- "loss": 0.2007,
 
 
 
 
 
 
 
 
 
1016
  "step": 1400
1017
  },
1018
  {
1019
- "epoch": 3.5259862241703193,
1020
- "grad_norm": 4.3917622566223145,
1021
- "learning_rate": 9.793871866295264e-06,
1022
- "loss": 0.1766,
1023
  "step": 1410
1024
  },
1025
  {
1026
- "epoch": 3.5510331872260488,
1027
- "grad_norm": 1.4780800342559814,
1028
- "learning_rate": 9.626740947075209e-06,
1029
- "loss": 0.1738,
1030
  "step": 1420
1031
  },
1032
  {
1033
- "epoch": 3.5760801502817783,
1034
- "grad_norm": 3.670740842819214,
1035
- "learning_rate": 9.459610027855154e-06,
1036
- "loss": 0.2033,
1037
  "step": 1430
1038
  },
1039
  {
1040
- "epoch": 3.601127113337508,
1041
- "grad_norm": 4.08475923538208,
1042
- "learning_rate": 9.292479108635098e-06,
1043
- "loss": 0.1638,
1044
  "step": 1440
1045
  },
1046
  {
1047
- "epoch": 3.6261740763932373,
1048
- "grad_norm": 3.445945978164673,
1049
- "learning_rate": 9.125348189415041e-06,
1050
- "loss": 0.1854,
1051
  "step": 1450
1052
  },
1053
  {
1054
- "epoch": 3.651221039448967,
1055
- "grad_norm": 3.1884312629699707,
1056
- "learning_rate": 8.958217270194987e-06,
1057
- "loss": 0.1967,
1058
  "step": 1460
1059
  },
1060
  {
1061
- "epoch": 3.6762680025046963,
1062
- "grad_norm": 1.9130624532699585,
1063
- "learning_rate": 8.79108635097493e-06,
1064
- "loss": 0.1618,
1065
  "step": 1470
1066
  },
1067
  {
1068
- "epoch": 3.701314965560426,
1069
- "grad_norm": 1.0646212100982666,
1070
- "learning_rate": 8.623955431754875e-06,
1071
- "loss": 0.1816,
1072
  "step": 1480
1073
  },
1074
  {
1075
- "epoch": 3.7263619286161553,
1076
- "grad_norm": 3.629429817199707,
1077
- "learning_rate": 8.45682451253482e-06,
1078
- "loss": 0.1432,
1079
  "step": 1490
1080
  },
1081
  {
1082
- "epoch": 3.751408891671885,
1083
- "grad_norm": 2.1418120861053467,
1084
- "learning_rate": 8.289693593314764e-06,
1085
- "loss": 0.1662,
1086
  "step": 1500
1087
  },
1088
  {
1089
- "epoch": 3.7764558547276144,
1090
- "grad_norm": 3.682490825653076,
1091
- "learning_rate": 8.122562674094707e-06,
1092
- "loss": 0.1819,
1093
  "step": 1510
1094
  },
1095
  {
1096
- "epoch": 3.801502817783344,
1097
- "grad_norm": 2.9112191200256348,
1098
- "learning_rate": 7.955431754874653e-06,
1099
- "loss": 0.1785,
1100
  "step": 1520
1101
  },
1102
  {
1103
- "epoch": 3.8265497808390734,
1104
- "grad_norm": 3.727522134780884,
1105
- "learning_rate": 7.788300835654596e-06,
1106
- "loss": 0.1476,
1107
  "step": 1530
1108
  },
1109
  {
1110
- "epoch": 3.851596743894803,
1111
- "grad_norm": 4.77044153213501,
1112
- "learning_rate": 7.621169916434541e-06,
1113
- "loss": 0.1668,
1114
  "step": 1540
1115
  },
1116
  {
1117
- "epoch": 3.8766437069505324,
1118
- "grad_norm": 2.953248977661133,
1119
- "learning_rate": 7.454038997214485e-06,
1120
- "loss": 0.1712,
1121
  "step": 1550
1122
  },
1123
  {
1124
- "epoch": 3.9016906700062615,
1125
- "grad_norm": 4.06650972366333,
1126
- "learning_rate": 7.2869080779944286e-06,
1127
- "loss": 0.1621,
1128
  "step": 1560
1129
  },
1130
  {
1131
- "epoch": 3.9267376330619914,
1132
- "grad_norm": 4.628715991973877,
1133
- "learning_rate": 7.119777158774373e-06,
1134
- "loss": 0.1513,
1135
  "step": 1570
1136
  },
1137
  {
1138
- "epoch": 3.9517845961177205,
1139
- "grad_norm": 2.5671701431274414,
1140
- "learning_rate": 6.952646239554318e-06,
1141
- "loss": 0.1918,
1142
  "step": 1580
1143
  },
1144
  {
1145
- "epoch": 3.9768315591734504,
1146
- "grad_norm": 4.511129379272461,
1147
- "learning_rate": 6.785515320334261e-06,
1148
- "loss": 0.1957,
1149
  "step": 1590
1150
  },
1151
  {
1152
- "epoch": 4.0,
1153
- "grad_norm": 1.4342707395553589,
1154
- "learning_rate": 6.618384401114206e-06,
1155
- "loss": 0.1529,
1156
- "step": 1600
1157
- },
1158
- {
1159
- "epoch": 4.0,
1160
- "eval_accuracy": 0.9805825242718447,
1161
- "eval_loss": 0.10449180752038956,
1162
- "eval_runtime": 5.032,
1163
- "eval_samples_per_second": 1350.958,
1164
- "eval_steps_per_second": 42.329,
1165
- "step": 1600
1166
- },
1167
- {
1168
- "epoch": 4.025046963055729,
1169
- "grad_norm": 3.8492510318756104,
1170
- "learning_rate": 6.4512534818941505e-06,
1171
- "loss": 0.1797,
1172
- "step": 1610
1173
- },
1174
- {
1175
- "epoch": 4.050093926111459,
1176
- "grad_norm": 4.300637245178223,
1177
- "learning_rate": 6.284122562674095e-06,
1178
- "loss": 0.1598,
1179
- "step": 1620
1180
- },
1181
- {
1182
- "epoch": 4.075140889167188,
1183
- "grad_norm": 1.9370712041854858,
1184
- "learning_rate": 6.116991643454039e-06,
1185
- "loss": 0.1746,
1186
- "step": 1630
1187
- },
1188
- {
1189
- "epoch": 4.100187852222918,
1190
- "grad_norm": 2.7867672443389893,
1191
- "learning_rate": 5.949860724233983e-06,
1192
- "loss": 0.1913,
1193
- "step": 1640
1194
- },
1195
- {
1196
- "epoch": 4.125234815278647,
1197
- "grad_norm": 3.536440372467041,
1198
- "learning_rate": 5.782729805013928e-06,
1199
- "loss": 0.1731,
1200
- "step": 1650
1201
- },
1202
- {
1203
- "epoch": 4.150281778334377,
1204
- "grad_norm": 3.2297356128692627,
1205
- "learning_rate": 5.615598885793872e-06,
1206
- "loss": 0.2019,
1207
- "step": 1660
1208
- },
1209
- {
1210
- "epoch": 4.175328741390106,
1211
- "grad_norm": 5.139032363891602,
1212
- "learning_rate": 5.448467966573816e-06,
1213
- "loss": 0.1793,
1214
- "step": 1670
1215
- },
1216
- {
1217
- "epoch": 4.200375704445836,
1218
- "grad_norm": 3.031764030456543,
1219
- "learning_rate": 5.281337047353761e-06,
1220
- "loss": 0.1836,
1221
- "step": 1680
1222
- },
1223
- {
1224
- "epoch": 4.225422667501565,
1225
- "grad_norm": 2.6612586975097656,
1226
- "learning_rate": 5.114206128133705e-06,
1227
- "loss": 0.1629,
1228
- "step": 1690
1229
- },
1230
- {
1231
- "epoch": 4.250469630557295,
1232
- "grad_norm": 2.8296072483062744,
1233
- "learning_rate": 4.947075208913649e-06,
1234
- "loss": 0.1691,
1235
- "step": 1700
1236
- },
1237
- {
1238
- "epoch": 4.275516593613024,
1239
- "grad_norm": 4.683578968048096,
1240
- "learning_rate": 4.7799442896935936e-06,
1241
- "loss": 0.1489,
1242
- "step": 1710
1243
- },
1244
- {
1245
- "epoch": 4.300563556668754,
1246
- "grad_norm": 3.7361319065093994,
1247
- "learning_rate": 4.612813370473538e-06,
1248
- "loss": 0.1453,
1249
- "step": 1720
1250
- },
1251
- {
1252
- "epoch": 4.325610519724483,
1253
- "grad_norm": 4.293661117553711,
1254
- "learning_rate": 4.445682451253482e-06,
1255
- "loss": 0.1445,
1256
- "step": 1730
1257
- },
1258
- {
1259
- "epoch": 4.350657482780213,
1260
- "grad_norm": 3.1756207942962646,
1261
- "learning_rate": 4.278551532033426e-06,
1262
- "loss": 0.1523,
1263
- "step": 1740
1264
- },
1265
- {
1266
- "epoch": 4.375704445835942,
1267
- "grad_norm": 3.921405792236328,
1268
- "learning_rate": 4.111420612813371e-06,
1269
- "loss": 0.1603,
1270
- "step": 1750
1271
- },
1272
- {
1273
- "epoch": 4.400751408891672,
1274
- "grad_norm": 2.8336334228515625,
1275
- "learning_rate": 3.944289693593315e-06,
1276
- "loss": 0.1764,
1277
- "step": 1760
1278
- },
1279
- {
1280
- "epoch": 4.425798371947401,
1281
- "grad_norm": 3.7013275623321533,
1282
- "learning_rate": 3.7771587743732592e-06,
1283
- "loss": 0.1481,
1284
- "step": 1770
1285
- },
1286
- {
1287
- "epoch": 4.450845335003131,
1288
- "grad_norm": 1.9302759170532227,
1289
- "learning_rate": 3.6100278551532034e-06,
1290
- "loss": 0.1526,
1291
- "step": 1780
1292
- },
1293
- {
1294
- "epoch": 4.47589229805886,
1295
- "grad_norm": 4.772688865661621,
1296
- "learning_rate": 3.4428969359331475e-06,
1297
- "loss": 0.1595,
1298
- "step": 1790
1299
- },
1300
- {
1301
- "epoch": 4.50093926111459,
1302
- "grad_norm": 3.1509013175964355,
1303
- "learning_rate": 3.275766016713092e-06,
1304
- "loss": 0.1384,
1305
- "step": 1800
1306
- },
1307
- {
1308
- "epoch": 4.525986224170319,
1309
- "grad_norm": 2.2213551998138428,
1310
- "learning_rate": 3.108635097493036e-06,
1311
- "loss": 0.1542,
1312
- "step": 1810
1313
- },
1314
- {
1315
- "epoch": 4.551033187226049,
1316
- "grad_norm": 2.7349748611450195,
1317
- "learning_rate": 2.9415041782729803e-06,
1318
- "loss": 0.1739,
1319
- "step": 1820
1320
- },
1321
- {
1322
- "epoch": 4.576080150281778,
1323
- "grad_norm": 5.009521961212158,
1324
- "learning_rate": 2.774373259052925e-06,
1325
- "loss": 0.1355,
1326
- "step": 1830
1327
- },
1328
- {
1329
- "epoch": 4.601127113337508,
1330
- "grad_norm": 3.5362050533294678,
1331
- "learning_rate": 2.607242339832869e-06,
1332
- "loss": 0.1335,
1333
- "step": 1840
1334
- },
1335
- {
1336
- "epoch": 4.626174076393237,
1337
- "grad_norm": 4.31157922744751,
1338
- "learning_rate": 2.4401114206128136e-06,
1339
- "loss": 0.1848,
1340
- "step": 1850
1341
- },
1342
- {
1343
- "epoch": 4.651221039448966,
1344
- "grad_norm": 3.6075448989868164,
1345
- "learning_rate": 2.2729805013927577e-06,
1346
- "loss": 0.1763,
1347
- "step": 1860
1348
- },
1349
- {
1350
- "epoch": 4.676268002504696,
1351
- "grad_norm": 3.1636250019073486,
1352
- "learning_rate": 2.1058495821727023e-06,
1353
- "loss": 0.1672,
1354
- "step": 1870
1355
- },
1356
- {
1357
- "epoch": 4.701314965560426,
1358
- "grad_norm": 0.7837923169136047,
1359
- "learning_rate": 1.9387186629526464e-06,
1360
- "loss": 0.1223,
1361
- "step": 1880
1362
- },
1363
- {
1364
- "epoch": 4.726361928616155,
1365
- "grad_norm": 3.2040278911590576,
1366
- "learning_rate": 1.7715877437325906e-06,
1367
- "loss": 0.1676,
1368
- "step": 1890
1369
- },
1370
- {
1371
- "epoch": 4.751408891671884,
1372
- "grad_norm": 2.6808199882507324,
1373
- "learning_rate": 1.604456824512535e-06,
1374
- "loss": 0.1606,
1375
- "step": 1900
1376
- },
1377
- {
1378
- "epoch": 4.776455854727614,
1379
- "grad_norm": 2.356374979019165,
1380
- "learning_rate": 1.437325905292479e-06,
1381
- "loss": 0.1655,
1382
- "step": 1910
1383
- },
1384
- {
1385
- "epoch": 4.801502817783343,
1386
- "grad_norm": 3.077162504196167,
1387
- "learning_rate": 1.2701949860724234e-06,
1388
- "loss": 0.1332,
1389
- "step": 1920
1390
- },
1391
- {
1392
- "epoch": 4.826549780839073,
1393
- "grad_norm": 3.580504894256592,
1394
- "learning_rate": 1.1030640668523677e-06,
1395
- "loss": 0.1687,
1396
- "step": 1930
1397
- },
1398
- {
1399
- "epoch": 4.851596743894802,
1400
- "grad_norm": 4.193363189697266,
1401
- "learning_rate": 9.35933147632312e-07,
1402
- "loss": 0.163,
1403
- "step": 1940
1404
- },
1405
- {
1406
- "epoch": 4.876643706950532,
1407
- "grad_norm": 3.2785427570343018,
1408
- "learning_rate": 7.688022284122563e-07,
1409
- "loss": 0.1546,
1410
- "step": 1950
1411
- },
1412
- {
1413
- "epoch": 4.9016906700062615,
1414
- "grad_norm": 2.1263206005096436,
1415
- "learning_rate": 6.016713091922006e-07,
1416
- "loss": 0.1445,
1417
- "step": 1960
1418
- },
1419
- {
1420
- "epoch": 4.926737633061991,
1421
- "grad_norm": 1.8054914474487305,
1422
- "learning_rate": 4.3454038997214486e-07,
1423
- "loss": 0.1489,
1424
- "step": 1970
1425
- },
1426
- {
1427
- "epoch": 4.9517845961177205,
1428
- "grad_norm": 2.179539680480957,
1429
- "learning_rate": 2.6740947075208915e-07,
1430
- "loss": 0.1632,
1431
- "step": 1980
1432
- },
1433
- {
1434
- "epoch": 4.97683155917345,
1435
- "grad_norm": 2.8142571449279785,
1436
- "learning_rate": 1.0027855153203343e-07,
1437
- "loss": 0.15,
1438
- "step": 1990
1439
- },
1440
- {
1441
- "epoch": 4.989355040701315,
1442
- "eval_accuracy": 0.9814651368049426,
1443
- "eval_loss": 0.09822726994752884,
1444
- "eval_runtime": 5.1844,
1445
- "eval_samples_per_second": 1311.237,
1446
- "eval_steps_per_second": 41.085,
1447
- "step": 1995
1448
  },
1449
  {
1450
- "epoch": 4.989355040701315,
1451
- "step": 1995,
1452
- "total_flos": 2.357895379209216e+18,
1453
- "train_loss": 0.5734889231528854,
1454
- "train_runtime": 656.7578,
1455
- "train_samples_per_second": 388.987,
1456
- "train_steps_per_second": 3.038
1457
  }
1458
  ],
1459
  "logging_steps": 10,
1460
- "max_steps": 1995,
1461
  "num_input_tokens_seen": 0,
1462
- "num_train_epochs": 5,
1463
  "save_steps": 500,
1464
  "stateful_callbacks": {
1465
  "TrainerControl": {
@@ -1473,8 +1220,8 @@
1473
  "attributes": {}
1474
  }
1475
  },
1476
- "total_flos": 2.357895379209216e+18,
1477
- "train_batch_size": 32,
1478
  "trial_name": null,
1479
  "trial_params": null
1480
  }
 
1
  {
2
+ "best_metric": 0.9830832597822889,
3
+ "best_model_checkpoint": "wav2vec2-base-ft-keyword-spotting/checkpoint-1393",
4
+ "epoch": 7.996245306633291,
5
  "eval_steps": 500,
6
+ "global_step": 1592,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.05006257822277847,
13
+ "grad_norm": 2.299422264099121,
14
+ "learning_rate": 1.875e-06,
15
+ "loss": 4.1412,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.10012515644555695,
20
+ "grad_norm": 3.199314832687378,
21
+ "learning_rate": 3.75e-06,
22
+ "loss": 4.1637,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.15018773466833543,
27
+ "grad_norm": 3.3083832263946533,
28
+ "learning_rate": 5.625e-06,
29
+ "loss": 4.0438,
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.2002503128911139,
34
+ "grad_norm": 4.229264736175537,
35
+ "learning_rate": 7.5e-06,
36
+ "loss": 3.8012,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 0.2503128911138924,
41
+ "grad_norm": 5.718367576599121,
42
+ "learning_rate": 9.375000000000001e-06,
43
+ "loss": 3.3779,
44
  "step": 50
45
  },
46
  {
47
+ "epoch": 0.30037546933667086,
48
+ "grad_norm": 6.0788254737854,
49
+ "learning_rate": 1.125e-05,
50
+ "loss": 2.8533,
51
  "step": 60
52
  },
53
  {
54
+ "epoch": 0.3504380475594493,
55
+ "grad_norm": 5.600748538970947,
56
+ "learning_rate": 1.3125e-05,
57
+ "loss": 2.4796,
58
  "step": 70
59
  },
60
  {
61
+ "epoch": 0.4005006257822278,
62
+ "grad_norm": 5.035912990570068,
63
+ "learning_rate": 1.5e-05,
64
+ "loss": 2.253,
65
  "step": 80
66
  },
67
  {
68
+ "epoch": 0.45056320400500627,
69
+ "grad_norm": 4.351953983306885,
70
+ "learning_rate": 1.6875e-05,
71
+ "loss": 2.0939,
72
  "step": 90
73
  },
74
  {
75
+ "epoch": 0.5006257822277848,
76
+ "grad_norm": 3.4278855323791504,
77
+ "learning_rate": 1.8750000000000002e-05,
78
+ "loss": 1.9864,
79
  "step": 100
80
  },
81
  {
82
+ "epoch": 0.5506883604505632,
83
+ "grad_norm": 2.862748384475708,
84
+ "learning_rate": 2.0625e-05,
85
+ "loss": 1.8611,
86
  "step": 110
87
  },
88
  {
89
+ "epoch": 0.6007509386733417,
90
+ "grad_norm": 1.2488276958465576,
91
+ "learning_rate": 2.25e-05,
92
+ "loss": 1.8302,
93
  "step": 120
94
  },
95
  {
96
+ "epoch": 0.6508135168961201,
97
+ "grad_norm": 0.6365911960601807,
98
+ "learning_rate": 2.4375e-05,
99
+ "loss": 1.7982,
100
  "step": 130
101
  },
102
  {
103
+ "epoch": 0.7008760951188986,
104
+ "grad_norm": 0.5073445439338684,
105
+ "learning_rate": 2.625e-05,
106
+ "loss": 1.6792,
107
  "step": 140
108
  },
109
  {
110
+ "epoch": 0.7509386733416771,
111
+ "grad_norm": 0.9256235361099243,
112
+ "learning_rate": 2.8125e-05,
113
+ "loss": 1.7528,
114
  "step": 150
115
  },
116
  {
117
+ "epoch": 0.8010012515644556,
118
+ "grad_norm": 5.669793128967285,
119
+ "learning_rate": 3e-05,
120
+ "loss": 1.7688,
121
  "step": 160
122
  },
123
  {
124
+ "epoch": 0.851063829787234,
125
+ "grad_norm": 10.126972198486328,
126
+ "learning_rate": 2.979050279329609e-05,
127
+ "loss": 1.6876,
128
  "step": 170
129
  },
130
  {
131
+ "epoch": 0.9011264080100125,
132
+ "grad_norm": 1.5617619752883911,
133
+ "learning_rate": 2.958100558659218e-05,
134
+ "loss": 1.6298,
135
  "step": 180
136
  },
137
  {
138
+ "epoch": 0.951188986232791,
139
+ "grad_norm": 1.5987392663955688,
140
+ "learning_rate": 2.937150837988827e-05,
141
+ "loss": 1.6106,
142
  "step": 190
143
  },
144
  {
145
+ "epoch": 0.9962453066332916,
146
+ "eval_accuracy": 0.6209179170344219,
147
+ "eval_loss": 1.4251551628112793,
148
+ "eval_runtime": 4.8605,
149
+ "eval_samples_per_second": 1398.631,
150
+ "eval_steps_per_second": 43.823,
151
+ "step": 199
152
+ },
153
+ {
154
+ "epoch": 1.0050062578222778,
155
+ "grad_norm": 3.5913760662078857,
156
+ "learning_rate": 2.9162011173184356e-05,
157
+ "loss": 1.6815,
158
  "step": 200
159
  },
160
  {
161
+ "epoch": 1.0550688360450564,
162
+ "grad_norm": 2.8698642253875732,
163
+ "learning_rate": 2.895251396648045e-05,
164
+ "loss": 1.4457,
165
  "step": 210
166
  },
167
  {
168
+ "epoch": 1.1051314142678348,
169
+ "grad_norm": 2.3613011837005615,
170
+ "learning_rate": 2.8743016759776535e-05,
171
+ "loss": 1.4503,
172
  "step": 220
173
  },
174
  {
175
+ "epoch": 1.1551939924906134,
176
+ "grad_norm": 3.4527103900909424,
177
+ "learning_rate": 2.8533519553072625e-05,
178
+ "loss": 1.2686,
179
  "step": 230
180
  },
181
  {
182
+ "epoch": 1.2052565707133918,
183
+ "grad_norm": 4.879206657409668,
184
+ "learning_rate": 2.8324022346368715e-05,
185
+ "loss": 1.2226,
186
  "step": 240
187
  },
188
  {
189
+ "epoch": 1.2553191489361701,
190
+ "grad_norm": 3.2351438999176025,
191
+ "learning_rate": 2.8114525139664805e-05,
192
+ "loss": 1.1545,
193
  "step": 250
194
  },
195
  {
196
+ "epoch": 1.3053817271589487,
197
+ "grad_norm": 5.1034464836120605,
198
+ "learning_rate": 2.7905027932960894e-05,
199
+ "loss": 1.1284,
200
  "step": 260
201
  },
202
  {
203
+ "epoch": 1.355444305381727,
204
+ "grad_norm": 2.128084421157837,
205
+ "learning_rate": 2.7695530726256984e-05,
206
+ "loss": 1.0926,
207
  "step": 270
208
  },
209
  {
210
+ "epoch": 1.4055068836045057,
211
+ "grad_norm": 5.853870391845703,
212
+ "learning_rate": 2.7486033519553074e-05,
213
+ "loss": 1.075,
214
  "step": 280
215
  },
216
  {
217
+ "epoch": 1.455569461827284,
218
+ "grad_norm": 2.4751949310302734,
219
+ "learning_rate": 2.7276536312849163e-05,
220
+ "loss": 0.9992,
221
  "step": 290
222
  },
223
  {
224
+ "epoch": 1.5056320400500627,
225
+ "grad_norm": 3.3400278091430664,
226
+ "learning_rate": 2.7067039106145253e-05,
227
+ "loss": 0.9649,
228
  "step": 300
229
  },
230
  {
231
+ "epoch": 1.555694618272841,
232
+ "grad_norm": 2.893463611602783,
233
+ "learning_rate": 2.685754189944134e-05,
234
+ "loss": 1.0066,
235
  "step": 310
236
  },
237
  {
238
+ "epoch": 1.6057571964956194,
239
+ "grad_norm": 2.179349660873413,
240
+ "learning_rate": 2.6648044692737432e-05,
241
+ "loss": 0.9203,
242
  "step": 320
243
  },
244
  {
245
+ "epoch": 1.655819774718398,
246
+ "grad_norm": 4.882504463195801,
247
+ "learning_rate": 2.643854748603352e-05,
248
+ "loss": 0.9159,
249
  "step": 330
250
  },
251
  {
252
+ "epoch": 1.7058823529411766,
253
+ "grad_norm": 4.354543685913086,
254
+ "learning_rate": 2.622905027932961e-05,
255
+ "loss": 0.8608,
256
  "step": 340
257
  },
258
  {
259
+ "epoch": 1.7559449311639548,
260
+ "grad_norm": 3.870502233505249,
261
+ "learning_rate": 2.60195530726257e-05,
262
+ "loss": 0.8231,
263
  "step": 350
264
  },
265
  {
266
+ "epoch": 1.8060075093867334,
267
+ "grad_norm": 3.579007148742676,
268
+ "learning_rate": 2.5810055865921788e-05,
269
+ "loss": 0.7965,
270
  "step": 360
271
  },
272
  {
273
+ "epoch": 1.856070087609512,
274
+ "grad_norm": 4.881648540496826,
275
+ "learning_rate": 2.5600558659217877e-05,
276
+ "loss": 0.7647,
277
  "step": 370
278
  },
279
  {
280
+ "epoch": 1.9061326658322904,
281
+ "grad_norm": 2.9336421489715576,
282
+ "learning_rate": 2.5391061452513967e-05,
283
+ "loss": 0.7005,
284
  "step": 380
285
  },
286
  {
287
+ "epoch": 1.9561952440550687,
288
+ "grad_norm": 2.542874813079834,
289
+ "learning_rate": 2.5181564245810057e-05,
290
+ "loss": 0.6495,
291
  "step": 390
292
  },
293
  {
294
+ "epoch": 1.9962453066332917,
295
+ "eval_accuracy": 0.9682259488084731,
296
+ "eval_loss": 0.5032486915588379,
297
+ "eval_runtime": 4.9258,
298
+ "eval_samples_per_second": 1380.074,
299
+ "eval_steps_per_second": 43.242,
300
+ "step": 398
301
  },
302
  {
303
+ "epoch": 2.0100125156445556,
304
+ "grad_norm": 1.9056552648544312,
305
+ "learning_rate": 2.4972067039106143e-05,
306
+ "loss": 0.6495,
 
 
307
  "step": 400
308
  },
309
  {
310
+ "epoch": 2.0600750938673342,
311
+ "grad_norm": 3.567265033721924,
312
+ "learning_rate": 2.4762569832402236e-05,
313
+ "loss": 0.5869,
314
  "step": 410
315
  },
316
  {
317
+ "epoch": 2.110137672090113,
318
+ "grad_norm": 2.240018844604492,
319
+ "learning_rate": 2.4553072625698326e-05,
320
+ "loss": 0.5728,
321
  "step": 420
322
  },
323
  {
324
+ "epoch": 2.160200250312891,
325
+ "grad_norm": 2.6313724517822266,
326
+ "learning_rate": 2.4343575418994412e-05,
327
+ "loss": 0.5028,
328
  "step": 430
329
  },
330
  {
331
+ "epoch": 2.2102628285356696,
332
+ "grad_norm": 3.360229015350342,
333
+ "learning_rate": 2.4134078212290505e-05,
334
+ "loss": 0.4928,
335
  "step": 440
336
  },
337
  {
338
+ "epoch": 2.260325406758448,
339
+ "grad_norm": 5.249541282653809,
340
+ "learning_rate": 2.392458100558659e-05,
341
+ "loss": 0.4773,
342
  "step": 450
343
  },
344
  {
345
+ "epoch": 2.3103879849812268,
346
+ "grad_norm": 3.6117191314697266,
347
+ "learning_rate": 2.3715083798882685e-05,
348
+ "loss": 0.4852,
349
  "step": 460
350
  },
351
  {
352
+ "epoch": 2.360450563204005,
353
+ "grad_norm": 4.820945739746094,
354
+ "learning_rate": 2.350558659217877e-05,
355
+ "loss": 0.4331,
356
  "step": 470
357
  },
358
  {
359
+ "epoch": 2.4105131414267835,
360
+ "grad_norm": 4.089610576629639,
361
+ "learning_rate": 2.329608938547486e-05,
362
+ "loss": 0.4246,
363
  "step": 480
364
  },
365
  {
366
+ "epoch": 2.460575719649562,
367
+ "grad_norm": 4.083464622497559,
368
+ "learning_rate": 2.308659217877095e-05,
369
+ "loss": 0.3752,
370
  "step": 490
371
  },
372
  {
373
+ "epoch": 2.5106382978723403,
374
+ "grad_norm": 4.422226428985596,
375
+ "learning_rate": 2.287709497206704e-05,
376
+ "loss": 0.3916,
377
  "step": 500
378
  },
379
  {
380
+ "epoch": 2.560700876095119,
381
+ "grad_norm": 2.952890634536743,
382
+ "learning_rate": 2.266759776536313e-05,
383
+ "loss": 0.3973,
384
  "step": 510
385
  },
386
  {
387
+ "epoch": 2.6107634543178975,
388
+ "grad_norm": 3.720259428024292,
389
+ "learning_rate": 2.245810055865922e-05,
390
+ "loss": 0.3432,
391
  "step": 520
392
  },
393
  {
394
+ "epoch": 2.660826032540676,
395
+ "grad_norm": 4.10168981552124,
396
+ "learning_rate": 2.224860335195531e-05,
397
+ "loss": 0.3479,
398
  "step": 530
399
  },
400
  {
401
+ "epoch": 2.710888610763454,
402
+ "grad_norm": 4.39931058883667,
403
+ "learning_rate": 2.2039106145251395e-05,
404
+ "loss": 0.3418,
405
  "step": 540
406
  },
407
  {
408
+ "epoch": 2.760951188986233,
409
+ "grad_norm": 2.6174728870391846,
410
+ "learning_rate": 2.182960893854749e-05,
411
+ "loss": 0.3153,
412
  "step": 550
413
  },
414
  {
415
+ "epoch": 2.8110137672090114,
416
+ "grad_norm": 3.489020347595215,
417
+ "learning_rate": 2.1620111731843575e-05,
418
+ "loss": 0.3242,
419
  "step": 560
420
  },
421
  {
422
+ "epoch": 2.8610763454317896,
423
+ "grad_norm": 3.2841830253601074,
424
+ "learning_rate": 2.1410614525139664e-05,
425
+ "loss": 0.3016,
426
  "step": 570
427
  },
428
  {
429
+ "epoch": 2.911138923654568,
430
+ "grad_norm": 4.06994104385376,
431
+ "learning_rate": 2.1201117318435754e-05,
432
+ "loss": 0.3237,
433
  "step": 580
434
  },
435
  {
436
+ "epoch": 2.9612015018773468,
437
+ "grad_norm": 2.4799962043762207,
438
+ "learning_rate": 2.0991620111731844e-05,
439
+ "loss": 0.2978,
440
  "step": 590
441
  },
442
  {
443
+ "epoch": 2.9962453066332917,
444
+ "eval_accuracy": 0.9782288908502501,
445
+ "eval_loss": 0.19027507305145264,
446
+ "eval_runtime": 4.8737,
447
+ "eval_samples_per_second": 1394.823,
448
+ "eval_steps_per_second": 43.704,
449
+ "step": 597
450
+ },
451
+ {
452
+ "epoch": 3.0150187734668337,
453
+ "grad_norm": 2.1213083267211914,
454
+ "learning_rate": 2.0782122905027933e-05,
455
+ "loss": 0.3231,
456
  "step": 600
457
  },
458
  {
459
+ "epoch": 3.065081351689612,
460
+ "grad_norm": 4.8361945152282715,
461
+ "learning_rate": 2.0572625698324023e-05,
462
+ "loss": 0.302,
463
  "step": 610
464
  },
465
  {
466
+ "epoch": 3.1151439299123904,
467
+ "grad_norm": 3.293104887008667,
468
+ "learning_rate": 2.0363128491620113e-05,
469
+ "loss": 0.2883,
470
  "step": 620
471
  },
472
  {
473
+ "epoch": 3.165206508135169,
474
+ "grad_norm": 3.274291515350342,
475
+ "learning_rate": 2.01536312849162e-05,
476
+ "loss": 0.2961,
477
  "step": 630
478
  },
479
  {
480
+ "epoch": 3.2152690863579476,
481
+ "grad_norm": 3.7976105213165283,
482
+ "learning_rate": 1.9944134078212292e-05,
483
+ "loss": 0.2688,
484
  "step": 640
485
  },
486
  {
487
+ "epoch": 3.2653316645807258,
488
+ "grad_norm": 2.9893229007720947,
489
+ "learning_rate": 1.973463687150838e-05,
490
+ "loss": 0.2446,
491
  "step": 650
492
  },
493
  {
494
+ "epoch": 3.3153942428035044,
495
+ "grad_norm": 2.2266604900360107,
496
+ "learning_rate": 1.952513966480447e-05,
497
+ "loss": 0.2613,
498
  "step": 660
499
  },
500
  {
501
+ "epoch": 3.365456821026283,
502
+ "grad_norm": 3.621093511581421,
503
+ "learning_rate": 1.9315642458100558e-05,
504
+ "loss": 0.2512,
505
  "step": 670
506
  },
507
  {
508
+ "epoch": 3.415519399249061,
509
+ "grad_norm": 2.952971935272217,
510
+ "learning_rate": 1.9106145251396648e-05,
511
+ "loss": 0.2536,
512
  "step": 680
513
  },
514
  {
515
+ "epoch": 3.4655819774718397,
516
+ "grad_norm": 2.7361905574798584,
517
+ "learning_rate": 1.889664804469274e-05,
518
+ "loss": 0.2396,
519
  "step": 690
520
  },
521
  {
522
+ "epoch": 3.5156445556946183,
523
+ "grad_norm": 2.3844313621520996,
524
+ "learning_rate": 1.8687150837988827e-05,
525
+ "loss": 0.2518,
526
  "step": 700
527
  },
528
  {
529
+ "epoch": 3.565707133917397,
530
+ "grad_norm": 3.0508193969726562,
531
+ "learning_rate": 1.8477653631284917e-05,
532
+ "loss": 0.2325,
533
  "step": 710
534
  },
535
  {
536
+ "epoch": 3.615769712140175,
537
+ "grad_norm": 3.923941135406494,
538
+ "learning_rate": 1.8268156424581006e-05,
539
+ "loss": 0.2277,
540
  "step": 720
541
  },
542
  {
543
+ "epoch": 3.6658322903629537,
544
+ "grad_norm": 2.638787031173706,
545
+ "learning_rate": 1.8058659217877096e-05,
546
+ "loss": 0.2292,
547
  "step": 730
548
  },
549
  {
550
+ "epoch": 3.7158948685857323,
551
+ "grad_norm": 2.75313138961792,
552
+ "learning_rate": 1.7849162011173182e-05,
553
+ "loss": 0.2364,
554
  "step": 740
555
  },
556
  {
557
+ "epoch": 3.7659574468085104,
558
+ "grad_norm": 3.686354398727417,
559
+ "learning_rate": 1.7639664804469275e-05,
560
+ "loss": 0.2409,
561
  "step": 750
562
  },
563
  {
564
+ "epoch": 3.816020025031289,
565
+ "grad_norm": 4.230103969573975,
566
+ "learning_rate": 1.7430167597765365e-05,
567
+ "loss": 0.2293,
568
  "step": 760
569
  },
570
  {
571
+ "epoch": 3.8660826032540676,
572
+ "grad_norm": 4.4972100257873535,
573
+ "learning_rate": 1.722067039106145e-05,
574
+ "loss": 0.2431,
575
  "step": 770
576
  },
577
  {
578
+ "epoch": 3.916145181476846,
579
+ "grad_norm": 3.6224372386932373,
580
+ "learning_rate": 1.7011173184357544e-05,
581
+ "loss": 0.2099,
582
  "step": 780
583
  },
584
  {
585
+ "epoch": 3.966207759699625,
586
+ "grad_norm": 3.072998285293579,
587
+ "learning_rate": 1.680167597765363e-05,
588
+ "loss": 0.2273,
589
  "step": 790
590
  },
591
  {
592
+ "epoch": 3.9962453066332917,
593
+ "eval_accuracy": 0.9771991762283024,
594
+ "eval_loss": 0.14364813268184662,
595
+ "eval_runtime": 4.9858,
596
+ "eval_samples_per_second": 1363.468,
597
+ "eval_steps_per_second": 42.721,
598
+ "step": 796
599
  },
600
  {
601
+ "epoch": 4.020025031289111,
602
+ "grad_norm": 3.3532874584198,
603
+ "learning_rate": 1.659217877094972e-05,
604
+ "loss": 0.23,
 
 
605
  "step": 800
606
  },
607
  {
608
+ "epoch": 4.07008760951189,
609
+ "grad_norm": 2.384181261062622,
610
+ "learning_rate": 1.638268156424581e-05,
611
+ "loss": 0.2157,
612
  "step": 810
613
  },
614
  {
615
+ "epoch": 4.1201501877346685,
616
+ "grad_norm": 4.237877368927002,
617
+ "learning_rate": 1.61731843575419e-05,
618
+ "loss": 0.2141,
619
  "step": 820
620
  },
621
  {
622
+ "epoch": 4.170212765957447,
623
+ "grad_norm": 3.8752825260162354,
624
+ "learning_rate": 1.5963687150837986e-05,
625
+ "loss": 0.214,
626
  "step": 830
627
  },
628
  {
629
+ "epoch": 4.220275344180226,
630
+ "grad_norm": 2.571617364883423,
631
+ "learning_rate": 1.575418994413408e-05,
632
+ "loss": 0.2124,
633
  "step": 840
634
  },
635
  {
636
+ "epoch": 4.270337922403003,
637
+ "grad_norm": 1.8986117839813232,
638
+ "learning_rate": 1.554469273743017e-05,
639
+ "loss": 0.2142,
640
  "step": 850
641
  },
642
  {
643
+ "epoch": 4.320400500625782,
644
+ "grad_norm": 3.71712589263916,
645
+ "learning_rate": 1.533519553072626e-05,
646
+ "loss": 0.1939,
647
  "step": 860
648
  },
649
  {
650
+ "epoch": 4.370463078848561,
651
+ "grad_norm": 2.1387696266174316,
652
+ "learning_rate": 1.5125698324022348e-05,
653
+ "loss": 0.1905,
654
  "step": 870
655
  },
656
  {
657
+ "epoch": 4.420525657071339,
658
+ "grad_norm": 3.3053841590881348,
659
+ "learning_rate": 1.4916201117318435e-05,
660
+ "loss": 0.2156,
661
  "step": 880
662
  },
663
  {
664
+ "epoch": 4.470588235294118,
665
+ "grad_norm": 2.574657440185547,
666
+ "learning_rate": 1.4706703910614526e-05,
667
+ "loss": 0.198,
668
  "step": 890
669
  },
670
  {
671
+ "epoch": 4.520650813516896,
672
+ "grad_norm": 2.290309429168701,
673
+ "learning_rate": 1.4497206703910616e-05,
674
+ "loss": 0.1754,
675
  "step": 900
676
  },
677
  {
678
+ "epoch": 4.570713391739675,
679
+ "grad_norm": 2.4253950119018555,
680
+ "learning_rate": 1.4287709497206705e-05,
681
+ "loss": 0.1971,
682
  "step": 910
683
  },
684
  {
685
+ "epoch": 4.6207759699624535,
686
+ "grad_norm": 3.070322275161743,
687
+ "learning_rate": 1.4078212290502793e-05,
688
+ "loss": 0.1928,
689
  "step": 920
690
  },
691
  {
692
+ "epoch": 4.670838548185231,
693
+ "grad_norm": 1.5523446798324585,
694
+ "learning_rate": 1.3868715083798883e-05,
695
+ "loss": 0.1948,
696
  "step": 930
697
  },
698
  {
699
+ "epoch": 4.72090112640801,
700
+ "grad_norm": 3.076679229736328,
701
+ "learning_rate": 1.3659217877094973e-05,
702
+ "loss": 0.1822,
703
  "step": 940
704
  },
705
  {
706
+ "epoch": 4.7709637046307884,
707
+ "grad_norm": 3.6825084686279297,
708
+ "learning_rate": 1.344972067039106e-05,
709
+ "loss": 0.1954,
710
  "step": 950
711
  },
712
  {
713
+ "epoch": 4.821026282853567,
714
+ "grad_norm": 4.037261009216309,
715
+ "learning_rate": 1.324022346368715e-05,
716
+ "loss": 0.1946,
717
  "step": 960
718
  },
719
  {
720
+ "epoch": 4.871088861076346,
721
+ "grad_norm": 3.026543378829956,
722
+ "learning_rate": 1.3030726256983242e-05,
723
+ "loss": 0.2155,
724
  "step": 970
725
  },
726
  {
727
+ "epoch": 4.921151439299124,
728
+ "grad_norm": 2.0362164974212646,
729
+ "learning_rate": 1.282122905027933e-05,
730
+ "loss": 0.1851,
731
  "step": 980
732
  },
733
  {
734
+ "epoch": 4.971214017521902,
735
+ "grad_norm": 1.8788719177246094,
736
+ "learning_rate": 1.261173184357542e-05,
737
+ "loss": 0.1866,
738
  "step": 990
739
  },
740
  {
741
+ "epoch": 4.996245306633291,
742
+ "eval_accuracy": 0.981759340982642,
743
+ "eval_loss": 0.11026876419782639,
744
+ "eval_runtime": 4.7749,
745
+ "eval_samples_per_second": 1423.702,
746
+ "eval_steps_per_second": 44.608,
747
+ "step": 995
748
+ },
749
+ {
750
+ "epoch": 5.025031289111389,
751
+ "grad_norm": 3.612476110458374,
752
+ "learning_rate": 1.2402234636871509e-05,
753
+ "loss": 0.1911,
754
  "step": 1000
755
  },
756
  {
757
+ "epoch": 5.075093867334168,
758
+ "grad_norm": 1.3886641263961792,
759
+ "learning_rate": 1.2192737430167599e-05,
760
+ "loss": 0.1768,
761
  "step": 1010
762
  },
763
  {
764
+ "epoch": 5.1251564455569465,
765
+ "grad_norm": 3.217656135559082,
766
+ "learning_rate": 1.1983240223463687e-05,
767
+ "loss": 0.1835,
768
  "step": 1020
769
  },
770
  {
771
+ "epoch": 5.175219023779725,
772
+ "grad_norm": 2.281695604324341,
773
+ "learning_rate": 1.1773743016759776e-05,
774
+ "loss": 0.1858,
775
  "step": 1030
776
  },
777
  {
778
+ "epoch": 5.225281602002503,
779
+ "grad_norm": 2.7055630683898926,
780
+ "learning_rate": 1.1564245810055866e-05,
781
+ "loss": 0.1718,
782
  "step": 1040
783
  },
784
  {
785
+ "epoch": 5.275344180225281,
786
+ "grad_norm": 2.00937819480896,
787
+ "learning_rate": 1.1354748603351954e-05,
788
+ "loss": 0.1479,
789
  "step": 1050
790
  },
791
  {
792
+ "epoch": 5.32540675844806,
793
+ "grad_norm": 2.65446138381958,
794
+ "learning_rate": 1.1145251396648046e-05,
795
+ "loss": 0.1664,
796
  "step": 1060
797
  },
798
  {
799
+ "epoch": 5.375469336670839,
800
+ "grad_norm": 2.499176502227783,
801
+ "learning_rate": 1.0935754189944135e-05,
802
+ "loss": 0.1882,
803
  "step": 1070
804
  },
805
  {
806
+ "epoch": 5.425531914893617,
807
+ "grad_norm": 3.318516492843628,
808
+ "learning_rate": 1.0726256983240223e-05,
809
+ "loss": 0.1823,
810
  "step": 1080
811
  },
812
  {
813
+ "epoch": 5.475594493116396,
814
+ "grad_norm": 2.1236233711242676,
815
+ "learning_rate": 1.0516759776536313e-05,
816
+ "loss": 0.1657,
817
  "step": 1090
818
  },
819
  {
820
+ "epoch": 5.5256570713391735,
821
+ "grad_norm": 3.342689037322998,
822
+ "learning_rate": 1.0307262569832403e-05,
823
+ "loss": 0.1823,
824
  "step": 1100
825
  },
826
  {
827
+ "epoch": 5.575719649561952,
828
+ "grad_norm": 2.687920331954956,
829
+ "learning_rate": 1.0097765363128492e-05,
830
+ "loss": 0.1888,
831
  "step": 1110
832
  },
833
  {
834
+ "epoch": 5.625782227784731,
835
+ "grad_norm": 3.692422866821289,
836
+ "learning_rate": 9.88826815642458e-06,
837
+ "loss": 0.2053,
838
  "step": 1120
839
  },
840
  {
841
+ "epoch": 5.675844806007509,
842
+ "grad_norm": 3.453005790710449,
843
+ "learning_rate": 9.67877094972067e-06,
844
+ "loss": 0.179,
845
  "step": 1130
846
  },
847
  {
848
+ "epoch": 5.725907384230288,
849
+ "grad_norm": 4.005608081817627,
850
+ "learning_rate": 9.46927374301676e-06,
851
+ "loss": 0.1748,
852
  "step": 1140
853
  },
854
  {
855
+ "epoch": 5.7759699624530665,
856
+ "grad_norm": 2.1113505363464355,
857
+ "learning_rate": 9.25977653631285e-06,
858
+ "loss": 0.1574,
859
  "step": 1150
860
  },
861
  {
862
+ "epoch": 5.826032540675845,
863
+ "grad_norm": 4.529311180114746,
864
+ "learning_rate": 9.050279329608939e-06,
865
+ "loss": 0.1599,
866
  "step": 1160
867
  },
868
  {
869
+ "epoch": 5.876095118898624,
870
+ "grad_norm": 1.885956048965454,
871
+ "learning_rate": 8.840782122905029e-06,
872
+ "loss": 0.1909,
873
  "step": 1170
874
  },
875
  {
876
+ "epoch": 5.926157697121401,
877
+ "grad_norm": 2.369316816329956,
878
+ "learning_rate": 8.631284916201118e-06,
879
+ "loss": 0.1603,
880
  "step": 1180
881
  },
882
  {
883
+ "epoch": 5.97622027534418,
884
+ "grad_norm": 1.402648687362671,
885
+ "learning_rate": 8.421787709497206e-06,
886
+ "loss": 0.1616,
887
  "step": 1190
888
  },
889
  {
890
+ "epoch": 5.996245306633291,
891
+ "eval_accuracy": 0.9819064430714917,
892
+ "eval_loss": 0.0981006771326065,
893
+ "eval_runtime": 4.775,
894
+ "eval_samples_per_second": 1423.653,
895
+ "eval_steps_per_second": 44.607,
896
+ "step": 1194
897
  },
898
  {
899
+ "epoch": 6.030037546933667,
900
+ "grad_norm": 1.6693238019943237,
901
+ "learning_rate": 8.212290502793296e-06,
902
+ "loss": 0.1931,
 
 
903
  "step": 1200
904
  },
905
  {
906
+ "epoch": 6.080100125156446,
907
+ "grad_norm": 2.3462257385253906,
908
+ "learning_rate": 8.002793296089386e-06,
909
+ "loss": 0.1506,
910
  "step": 1210
911
  },
912
  {
913
+ "epoch": 6.130162703379224,
914
+ "grad_norm": 1.6939945220947266,
915
+ "learning_rate": 7.793296089385474e-06,
916
+ "loss": 0.1677,
917
  "step": 1220
918
  },
919
  {
920
+ "epoch": 6.180225281602002,
921
+ "grad_norm": 1.728092908859253,
922
+ "learning_rate": 7.583798882681565e-06,
923
+ "loss": 0.1569,
924
  "step": 1230
925
  },
926
  {
927
+ "epoch": 6.230287859824781,
928
+ "grad_norm": 1.6664111614227295,
929
+ "learning_rate": 7.374301675977653e-06,
930
+ "loss": 0.1564,
931
  "step": 1240
932
  },
933
  {
934
+ "epoch": 6.280350438047559,
935
+ "grad_norm": 2.0160274505615234,
936
+ "learning_rate": 7.164804469273744e-06,
937
+ "loss": 0.1513,
938
  "step": 1250
939
  },
940
  {
941
+ "epoch": 6.330413016270338,
942
+ "grad_norm": 4.013051986694336,
943
+ "learning_rate": 6.9553072625698325e-06,
944
+ "loss": 0.1594,
945
  "step": 1260
946
  },
947
  {
948
+ "epoch": 6.380475594493117,
949
+ "grad_norm": 3.11110258102417,
950
+ "learning_rate": 6.745810055865922e-06,
951
+ "loss": 0.1445,
952
  "step": 1270
953
  },
954
  {
955
+ "epoch": 6.430538172715895,
956
+ "grad_norm": 3.418999433517456,
957
+ "learning_rate": 6.536312849162011e-06,
958
+ "loss": 0.1679,
959
  "step": 1280
960
  },
961
  {
962
+ "epoch": 6.480600750938673,
963
+ "grad_norm": 2.4514362812042236,
964
+ "learning_rate": 6.326815642458101e-06,
965
+ "loss": 0.152,
966
  "step": 1290
967
  },
968
  {
969
+ "epoch": 6.5306633291614515,
970
+ "grad_norm": 3.2242462635040283,
971
+ "learning_rate": 6.1173184357541904e-06,
972
+ "loss": 0.1676,
973
  "step": 1300
974
  },
975
  {
976
+ "epoch": 6.58072590738423,
977
+ "grad_norm": 4.046393871307373,
978
+ "learning_rate": 5.907821229050279e-06,
979
+ "loss": 0.1615,
980
  "step": 1310
981
  },
982
  {
983
+ "epoch": 6.630788485607009,
984
+ "grad_norm": 1.9088122844696045,
985
+ "learning_rate": 5.698324022346369e-06,
986
+ "loss": 0.1465,
987
  "step": 1320
988
  },
989
  {
990
+ "epoch": 6.680851063829787,
991
+ "grad_norm": 2.5699033737182617,
992
+ "learning_rate": 5.488826815642458e-06,
993
+ "loss": 0.1472,
994
  "step": 1330
995
  },
996
  {
997
+ "epoch": 6.730913642052566,
998
+ "grad_norm": 1.9140872955322266,
999
+ "learning_rate": 5.2793296089385475e-06,
1000
+ "loss": 0.1585,
1001
  "step": 1340
1002
  },
1003
  {
1004
+ "epoch": 6.7809762202753445,
1005
+ "grad_norm": 2.022095203399658,
1006
+ "learning_rate": 5.069832402234637e-06,
1007
+ "loss": 0.1455,
1008
  "step": 1350
1009
  },
1010
  {
1011
+ "epoch": 6.831038798498122,
1012
+ "grad_norm": 2.5366971492767334,
1013
+ "learning_rate": 4.860335195530726e-06,
1014
+ "loss": 0.1542,
1015
  "step": 1360
1016
  },
1017
  {
1018
+ "epoch": 6.881101376720901,
1019
+ "grad_norm": 1.6112697124481201,
1020
+ "learning_rate": 4.650837988826816e-06,
1021
+ "loss": 0.1569,
1022
  "step": 1370
1023
  },
1024
  {
1025
+ "epoch": 6.931163954943679,
1026
+ "grad_norm": 2.8735201358795166,
1027
+ "learning_rate": 4.441340782122905e-06,
1028
+ "loss": 0.1628,
1029
  "step": 1380
1030
  },
1031
  {
1032
+ "epoch": 6.981226533166458,
1033
+ "grad_norm": 2.044304132461548,
1034
+ "learning_rate": 4.231843575418994e-06,
1035
+ "loss": 0.1385,
1036
  "step": 1390
1037
  },
1038
  {
1039
+ "epoch": 6.996245306633291,
1040
+ "eval_accuracy": 0.9830832597822889,
1041
+ "eval_loss": 0.0956372618675232,
1042
+ "eval_runtime": 4.8597,
1043
+ "eval_samples_per_second": 1398.844,
1044
+ "eval_steps_per_second": 43.83,
1045
+ "step": 1393
1046
+ },
1047
+ {
1048
+ "epoch": 7.035043804755945,
1049
+ "grad_norm": 1.4923343658447266,
1050
+ "learning_rate": 4.022346368715084e-06,
1051
+ "loss": 0.1558,
1052
  "step": 1400
1053
  },
1054
  {
1055
+ "epoch": 7.085106382978723,
1056
+ "grad_norm": 3.339181423187256,
1057
+ "learning_rate": 3.812849162011173e-06,
1058
+ "loss": 0.1391,
1059
  "step": 1410
1060
  },
1061
  {
1062
+ "epoch": 7.135168961201502,
1063
+ "grad_norm": 2.091777801513672,
1064
+ "learning_rate": 3.6033519553072625e-06,
1065
+ "loss": 0.1541,
1066
  "step": 1420
1067
  },
1068
  {
1069
+ "epoch": 7.18523153942428,
1070
+ "grad_norm": 2.6580100059509277,
1071
+ "learning_rate": 3.393854748603352e-06,
1072
+ "loss": 0.1379,
1073
  "step": 1430
1074
  },
1075
  {
1076
+ "epoch": 7.235294117647059,
1077
+ "grad_norm": 2.4065537452697754,
1078
+ "learning_rate": 3.1843575418994414e-06,
1079
+ "loss": 0.1475,
1080
  "step": 1440
1081
  },
1082
  {
1083
+ "epoch": 7.2853566958698375,
1084
+ "grad_norm": 2.618218183517456,
1085
+ "learning_rate": 2.974860335195531e-06,
1086
+ "loss": 0.1828,
1087
  "step": 1450
1088
  },
1089
  {
1090
+ "epoch": 7.335419274092616,
1091
+ "grad_norm": 3.5904743671417236,
1092
+ "learning_rate": 2.7653631284916204e-06,
1093
+ "loss": 0.1365,
1094
  "step": 1460
1095
  },
1096
  {
1097
+ "epoch": 7.385481852315394,
1098
+ "grad_norm": 2.245260000228882,
1099
+ "learning_rate": 2.555865921787709e-06,
1100
+ "loss": 0.1394,
1101
  "step": 1470
1102
  },
1103
  {
1104
+ "epoch": 7.435544430538172,
1105
+ "grad_norm": 2.558086395263672,
1106
+ "learning_rate": 2.346368715083799e-06,
1107
+ "loss": 0.1533,
1108
  "step": 1480
1109
  },
1110
  {
1111
+ "epoch": 7.485607008760951,
1112
+ "grad_norm": 2.851020097732544,
1113
+ "learning_rate": 2.136871508379888e-06,
1114
+ "loss": 0.1313,
1115
  "step": 1490
1116
  },
1117
  {
1118
+ "epoch": 7.53566958698373,
1119
+ "grad_norm": 1.7011760473251343,
1120
+ "learning_rate": 1.927374301675978e-06,
1121
+ "loss": 0.1509,
1122
  "step": 1500
1123
  },
1124
  {
1125
+ "epoch": 7.585732165206508,
1126
+ "grad_norm": 2.6264467239379883,
1127
+ "learning_rate": 1.717877094972067e-06,
1128
+ "loss": 0.1515,
1129
  "step": 1510
1130
  },
1131
  {
1132
+ "epoch": 7.635794743429287,
1133
+ "grad_norm": 1.6332521438598633,
1134
+ "learning_rate": 1.5083798882681566e-06,
1135
+ "loss": 0.1489,
1136
  "step": 1520
1137
  },
1138
  {
1139
+ "epoch": 7.685857321652065,
1140
+ "grad_norm": 2.0622401237487793,
1141
+ "learning_rate": 1.2988826815642458e-06,
1142
+ "loss": 0.1594,
1143
  "step": 1530
1144
  },
1145
  {
1146
+ "epoch": 7.735919899874844,
1147
+ "grad_norm": 2.3861618041992188,
1148
+ "learning_rate": 1.0893854748603353e-06,
1149
+ "loss": 0.1669,
1150
  "step": 1540
1151
  },
1152
  {
1153
+ "epoch": 7.785982478097622,
1154
+ "grad_norm": 4.30822229385376,
1155
+ "learning_rate": 8.798882681564246e-07,
1156
+ "loss": 0.1759,
1157
  "step": 1550
1158
  },
1159
  {
1160
+ "epoch": 7.8360450563204,
1161
+ "grad_norm": 1.4631046056747437,
1162
+ "learning_rate": 6.70391061452514e-07,
1163
+ "loss": 0.1645,
1164
  "step": 1560
1165
  },
1166
  {
1167
+ "epoch": 7.886107634543179,
1168
+ "grad_norm": 2.452613115310669,
1169
+ "learning_rate": 4.608938547486033e-07,
1170
+ "loss": 0.1575,
1171
  "step": 1570
1172
  },
1173
  {
1174
+ "epoch": 7.9361702127659575,
1175
+ "grad_norm": 1.279895305633545,
1176
+ "learning_rate": 2.5139664804469275e-07,
1177
+ "loss": 0.149,
1178
  "step": 1580
1179
  },
1180
  {
1181
+ "epoch": 7.986232790988736,
1182
+ "grad_norm": 2.141481399536133,
1183
+ "learning_rate": 4.189944134078212e-08,
1184
+ "loss": 0.1524,
1185
  "step": 1590
1186
  },
1187
  {
1188
+ "epoch": 7.996245306633291,
1189
+ "eval_accuracy": 0.9824948514268903,
1190
+ "eval_loss": 0.09257339686155319,
1191
+ "eval_runtime": 5.6151,
1192
+ "eval_samples_per_second": 1210.67,
1193
+ "eval_steps_per_second": 37.934,
1194
+ "step": 1592
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1195
  },
1196
  {
1197
+ "epoch": 7.996245306633291,
1198
+ "step": 1592,
1199
+ "total_flos": 3.777723239743488e+18,
1200
+ "train_loss": 0.596273283347787,
1201
+ "train_runtime": 640.7753,
1202
+ "train_samples_per_second": 637.902,
1203
+ "train_steps_per_second": 2.484
1204
  }
1205
  ],
1206
  "logging_steps": 10,
1207
+ "max_steps": 1592,
1208
  "num_input_tokens_seen": 0,
1209
+ "num_train_epochs": 8,
1210
  "save_steps": 500,
1211
  "stateful_callbacks": {
1212
  "TrainerControl": {
 
1220
  "attributes": {}
1221
  }
1222
  },
1223
+ "total_flos": 3.777723239743488e+18,
1224
+ "train_batch_size": 64,
1225
  "trial_name": null,
1226
  "trial_params": null
1227
  }