Zheng Li commited on
Commit
3602a49
·
verified ·
1 Parent(s): ce0b089

End of training

Browse files
README.md CHANGED
@@ -3,6 +3,7 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: facebook/wav2vec2-base
5
  tags:
 
6
  - generated_from_trainer
7
  datasets:
8
  - superb
 
3
  license: apache-2.0
4
  base_model: facebook/wav2vec2-base
5
  tags:
6
+ - audio-classification
7
  - generated_from_trainer
8
  datasets:
9
  - superb
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 7.996245306633291,
3
- "eval_accuracy": 0.9830832597822889,
4
- "eval_loss": 0.0956372618675232,
5
- "eval_runtime": 5.6145,
6
- "eval_samples_per_second": 1210.799,
7
- "eval_steps_per_second": 37.938,
8
  "total_flos": 3.777723239743488e+18,
9
- "train_loss": 0.596273283347787,
10
- "train_runtime": 640.7753,
11
- "train_samples_per_second": 637.902,
12
- "train_steps_per_second": 2.484
13
  }
 
1
  {
2
  "epoch": 7.996245306633291,
3
+ "eval_accuracy": 0.9763165636952045,
4
+ "eval_loss": 0.40624934434890747,
5
+ "eval_runtime": 5.5217,
6
+ "eval_samples_per_second": 1231.15,
7
+ "eval_steps_per_second": 38.575,
8
  "total_flos": 3.777723239743488e+18,
9
+ "train_loss": 1.152813568037359,
10
+ "train_runtime": 637.2674,
11
+ "train_samples_per_second": 641.414,
12
+ "train_steps_per_second": 2.498
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 7.996245306633291,
3
- "eval_accuracy": 0.9830832597822889,
4
- "eval_loss": 0.0956372618675232,
5
- "eval_runtime": 5.6145,
6
- "eval_samples_per_second": 1210.799,
7
- "eval_steps_per_second": 37.938
8
  }
 
1
  {
2
  "epoch": 7.996245306633291,
3
+ "eval_accuracy": 0.9763165636952045,
4
+ "eval_loss": 0.40624934434890747,
5
+ "eval_runtime": 5.5217,
6
+ "eval_samples_per_second": 1231.15,
7
+ "eval_steps_per_second": 38.575
8
  }
runs/May14_16-06-35_cs-Precision-7960-Tower/events.out.tfevents.1747253871.cs-Precision-7960-Tower.128840.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2152ebbe8021d73f3e8a88a5c56305eda145e557c9a4a77848f71d3a815b7b72
3
+ size 411
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 7.996245306633291,
3
  "total_flos": 3.777723239743488e+18,
4
- "train_loss": 0.596273283347787,
5
- "train_runtime": 640.7753,
6
- "train_samples_per_second": 637.902,
7
- "train_steps_per_second": 2.484
8
  }
 
1
  {
2
  "epoch": 7.996245306633291,
3
  "total_flos": 3.777723239743488e+18,
4
+ "train_loss": 1.152813568037359,
5
+ "train_runtime": 637.2674,
6
+ "train_samples_per_second": 641.414,
7
+ "train_steps_per_second": 2.498
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.9830832597822889,
3
- "best_model_checkpoint": "wav2vec2-base-ft-keyword-spotting/checkpoint-1393",
4
  "epoch": 7.996245306633291,
5
  "eval_steps": 500,
6
  "global_step": 1592,
@@ -10,1197 +10,1197 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05006257822277847,
13
- "grad_norm": 2.299422264099121,
14
- "learning_rate": 1.875e-06,
15
- "loss": 4.1412,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.10012515644555695,
20
- "grad_norm": 3.199314832687378,
21
- "learning_rate": 3.75e-06,
22
- "loss": 4.1637,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.15018773466833543,
27
- "grad_norm": 3.3083832263946533,
28
- "learning_rate": 5.625e-06,
29
- "loss": 4.0438,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2002503128911139,
34
- "grad_norm": 4.229264736175537,
35
- "learning_rate": 7.5e-06,
36
- "loss": 3.8012,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.2503128911138924,
41
- "grad_norm": 5.718367576599121,
42
- "learning_rate": 9.375000000000001e-06,
43
- "loss": 3.3779,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.30037546933667086,
48
- "grad_norm": 6.0788254737854,
49
- "learning_rate": 1.125e-05,
50
- "loss": 2.8533,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.3504380475594493,
55
- "grad_norm": 5.600748538970947,
56
- "learning_rate": 1.3125e-05,
57
- "loss": 2.4796,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.4005006257822278,
62
- "grad_norm": 5.035912990570068,
63
- "learning_rate": 1.5e-05,
64
- "loss": 2.253,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.45056320400500627,
69
- "grad_norm": 4.351953983306885,
70
- "learning_rate": 1.6875e-05,
71
- "loss": 2.0939,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.5006257822277848,
76
- "grad_norm": 3.4278855323791504,
77
- "learning_rate": 1.8750000000000002e-05,
78
- "loss": 1.9864,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.5506883604505632,
83
- "grad_norm": 2.862748384475708,
84
- "learning_rate": 2.0625e-05,
85
- "loss": 1.8611,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.6007509386733417,
90
- "grad_norm": 1.2488276958465576,
91
- "learning_rate": 2.25e-05,
92
- "loss": 1.8302,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.6508135168961201,
97
- "grad_norm": 0.6365911960601807,
98
- "learning_rate": 2.4375e-05,
99
- "loss": 1.7982,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.7008760951188986,
104
- "grad_norm": 0.5073445439338684,
105
- "learning_rate": 2.625e-05,
106
- "loss": 1.6792,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.7509386733416771,
111
- "grad_norm": 0.9256235361099243,
112
- "learning_rate": 2.8125e-05,
113
- "loss": 1.7528,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.8010012515644556,
118
- "grad_norm": 5.669793128967285,
119
- "learning_rate": 3e-05,
120
- "loss": 1.7688,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.851063829787234,
125
- "grad_norm": 10.126972198486328,
126
- "learning_rate": 2.979050279329609e-05,
127
- "loss": 1.6876,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.9011264080100125,
132
- "grad_norm": 1.5617619752883911,
133
- "learning_rate": 2.958100558659218e-05,
134
- "loss": 1.6298,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.951188986232791,
139
- "grad_norm": 1.5987392663955688,
140
- "learning_rate": 2.937150837988827e-05,
141
- "loss": 1.6106,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.9962453066332916,
146
  "eval_accuracy": 0.6209179170344219,
147
- "eval_loss": 1.4251551628112793,
148
- "eval_runtime": 4.8605,
149
- "eval_samples_per_second": 1398.631,
150
- "eval_steps_per_second": 43.823,
151
  "step": 199
152
  },
153
  {
154
  "epoch": 1.0050062578222778,
155
- "grad_norm": 3.5913760662078857,
156
- "learning_rate": 2.9162011173184356e-05,
157
- "loss": 1.6815,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.0550688360450564,
162
- "grad_norm": 2.8698642253875732,
163
- "learning_rate": 2.895251396648045e-05,
164
- "loss": 1.4457,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.1051314142678348,
169
- "grad_norm": 2.3613011837005615,
170
- "learning_rate": 2.8743016759776535e-05,
171
- "loss": 1.4503,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.1551939924906134,
176
- "grad_norm": 3.4527103900909424,
177
- "learning_rate": 2.8533519553072625e-05,
178
- "loss": 1.2686,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.2052565707133918,
183
- "grad_norm": 4.879206657409668,
184
- "learning_rate": 2.8324022346368715e-05,
185
- "loss": 1.2226,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.2553191489361701,
190
- "grad_norm": 3.2351438999176025,
191
- "learning_rate": 2.8114525139664805e-05,
192
- "loss": 1.1545,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 1.3053817271589487,
197
- "grad_norm": 5.1034464836120605,
198
- "learning_rate": 2.7905027932960894e-05,
199
- "loss": 1.1284,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 1.355444305381727,
204
- "grad_norm": 2.128084421157837,
205
- "learning_rate": 2.7695530726256984e-05,
206
- "loss": 1.0926,
207
  "step": 270
208
  },
209
  {
210
  "epoch": 1.4055068836045057,
211
- "grad_norm": 5.853870391845703,
212
- "learning_rate": 2.7486033519553074e-05,
213
- "loss": 1.075,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.455569461827284,
218
- "grad_norm": 2.4751949310302734,
219
- "learning_rate": 2.7276536312849163e-05,
220
- "loss": 0.9992,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.5056320400500627,
225
- "grad_norm": 3.3400278091430664,
226
- "learning_rate": 2.7067039106145253e-05,
227
- "loss": 0.9649,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.555694618272841,
232
- "grad_norm": 2.893463611602783,
233
- "learning_rate": 2.685754189944134e-05,
234
- "loss": 1.0066,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.6057571964956194,
239
- "grad_norm": 2.179349660873413,
240
- "learning_rate": 2.6648044692737432e-05,
241
- "loss": 0.9203,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.655819774718398,
246
- "grad_norm": 4.882504463195801,
247
- "learning_rate": 2.643854748603352e-05,
248
- "loss": 0.9159,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.7058823529411766,
253
- "grad_norm": 4.354543685913086,
254
- "learning_rate": 2.622905027932961e-05,
255
- "loss": 0.8608,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.7559449311639548,
260
- "grad_norm": 3.870502233505249,
261
- "learning_rate": 2.60195530726257e-05,
262
- "loss": 0.8231,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.8060075093867334,
267
- "grad_norm": 3.579007148742676,
268
- "learning_rate": 2.5810055865921788e-05,
269
- "loss": 0.7965,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.856070087609512,
274
- "grad_norm": 4.881648540496826,
275
- "learning_rate": 2.5600558659217877e-05,
276
- "loss": 0.7647,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.9061326658322904,
281
- "grad_norm": 2.9336421489715576,
282
- "learning_rate": 2.5391061452513967e-05,
283
- "loss": 0.7005,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.9561952440550687,
288
- "grad_norm": 2.542874813079834,
289
- "learning_rate": 2.5181564245810057e-05,
290
- "loss": 0.6495,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.9962453066332917,
295
- "eval_accuracy": 0.9682259488084731,
296
- "eval_loss": 0.5032486915588379,
297
- "eval_runtime": 4.9258,
298
- "eval_samples_per_second": 1380.074,
299
- "eval_steps_per_second": 43.242,
300
  "step": 398
301
  },
302
  {
303
  "epoch": 2.0100125156445556,
304
- "grad_norm": 1.9056552648544312,
305
- "learning_rate": 2.4972067039106143e-05,
306
- "loss": 0.6495,
307
  "step": 400
308
  },
309
  {
310
  "epoch": 2.0600750938673342,
311
- "grad_norm": 3.567265033721924,
312
- "learning_rate": 2.4762569832402236e-05,
313
- "loss": 0.5869,
314
  "step": 410
315
  },
316
  {
317
  "epoch": 2.110137672090113,
318
- "grad_norm": 2.240018844604492,
319
- "learning_rate": 2.4553072625698326e-05,
320
- "loss": 0.5728,
321
  "step": 420
322
  },
323
  {
324
  "epoch": 2.160200250312891,
325
- "grad_norm": 2.6313724517822266,
326
- "learning_rate": 2.4343575418994412e-05,
327
- "loss": 0.5028,
328
  "step": 430
329
  },
330
  {
331
  "epoch": 2.2102628285356696,
332
- "grad_norm": 3.360229015350342,
333
- "learning_rate": 2.4134078212290505e-05,
334
- "loss": 0.4928,
335
  "step": 440
336
  },
337
  {
338
  "epoch": 2.260325406758448,
339
- "grad_norm": 5.249541282653809,
340
- "learning_rate": 2.392458100558659e-05,
341
- "loss": 0.4773,
342
  "step": 450
343
  },
344
  {
345
  "epoch": 2.3103879849812268,
346
- "grad_norm": 3.6117191314697266,
347
- "learning_rate": 2.3715083798882685e-05,
348
- "loss": 0.4852,
349
  "step": 460
350
  },
351
  {
352
  "epoch": 2.360450563204005,
353
- "grad_norm": 4.820945739746094,
354
- "learning_rate": 2.350558659217877e-05,
355
- "loss": 0.4331,
356
  "step": 470
357
  },
358
  {
359
  "epoch": 2.4105131414267835,
360
- "grad_norm": 4.089610576629639,
361
- "learning_rate": 2.329608938547486e-05,
362
- "loss": 0.4246,
363
  "step": 480
364
  },
365
  {
366
  "epoch": 2.460575719649562,
367
- "grad_norm": 4.083464622497559,
368
- "learning_rate": 2.308659217877095e-05,
369
- "loss": 0.3752,
370
  "step": 490
371
  },
372
  {
373
  "epoch": 2.5106382978723403,
374
- "grad_norm": 4.422226428985596,
375
- "learning_rate": 2.287709497206704e-05,
376
- "loss": 0.3916,
377
  "step": 500
378
  },
379
  {
380
  "epoch": 2.560700876095119,
381
- "grad_norm": 2.952890634536743,
382
- "learning_rate": 2.266759776536313e-05,
383
- "loss": 0.3973,
384
  "step": 510
385
  },
386
  {
387
  "epoch": 2.6107634543178975,
388
- "grad_norm": 3.720259428024292,
389
- "learning_rate": 2.245810055865922e-05,
390
- "loss": 0.3432,
391
  "step": 520
392
  },
393
  {
394
  "epoch": 2.660826032540676,
395
- "grad_norm": 4.10168981552124,
396
- "learning_rate": 2.224860335195531e-05,
397
- "loss": 0.3479,
398
  "step": 530
399
  },
400
  {
401
  "epoch": 2.710888610763454,
402
- "grad_norm": 4.39931058883667,
403
- "learning_rate": 2.2039106145251395e-05,
404
- "loss": 0.3418,
405
  "step": 540
406
  },
407
  {
408
  "epoch": 2.760951188986233,
409
- "grad_norm": 2.6174728870391846,
410
- "learning_rate": 2.182960893854749e-05,
411
- "loss": 0.3153,
412
  "step": 550
413
  },
414
  {
415
  "epoch": 2.8110137672090114,
416
- "grad_norm": 3.489020347595215,
417
- "learning_rate": 2.1620111731843575e-05,
418
- "loss": 0.3242,
419
  "step": 560
420
  },
421
  {
422
  "epoch": 2.8610763454317896,
423
- "grad_norm": 3.2841830253601074,
424
- "learning_rate": 2.1410614525139664e-05,
425
- "loss": 0.3016,
426
  "step": 570
427
  },
428
  {
429
  "epoch": 2.911138923654568,
430
- "grad_norm": 4.06994104385376,
431
- "learning_rate": 2.1201117318435754e-05,
432
- "loss": 0.3237,
433
  "step": 580
434
  },
435
  {
436
  "epoch": 2.9612015018773468,
437
- "grad_norm": 2.4799962043762207,
438
- "learning_rate": 2.0991620111731844e-05,
439
- "loss": 0.2978,
440
  "step": 590
441
  },
442
  {
443
  "epoch": 2.9962453066332917,
444
- "eval_accuracy": 0.9782288908502501,
445
- "eval_loss": 0.19027507305145264,
446
- "eval_runtime": 4.8737,
447
- "eval_samples_per_second": 1394.823,
448
- "eval_steps_per_second": 43.704,
449
  "step": 597
450
  },
451
  {
452
  "epoch": 3.0150187734668337,
453
- "grad_norm": 2.1213083267211914,
454
- "learning_rate": 2.0782122905027933e-05,
455
- "loss": 0.3231,
456
  "step": 600
457
  },
458
  {
459
  "epoch": 3.065081351689612,
460
- "grad_norm": 4.8361945152282715,
461
- "learning_rate": 2.0572625698324023e-05,
462
- "loss": 0.302,
463
  "step": 610
464
  },
465
  {
466
  "epoch": 3.1151439299123904,
467
- "grad_norm": 3.293104887008667,
468
- "learning_rate": 2.0363128491620113e-05,
469
- "loss": 0.2883,
470
  "step": 620
471
  },
472
  {
473
  "epoch": 3.165206508135169,
474
- "grad_norm": 3.274291515350342,
475
- "learning_rate": 2.01536312849162e-05,
476
- "loss": 0.2961,
477
  "step": 630
478
  },
479
  {
480
  "epoch": 3.2152690863579476,
481
- "grad_norm": 3.7976105213165283,
482
- "learning_rate": 1.9944134078212292e-05,
483
- "loss": 0.2688,
484
  "step": 640
485
  },
486
  {
487
  "epoch": 3.2653316645807258,
488
- "grad_norm": 2.9893229007720947,
489
- "learning_rate": 1.973463687150838e-05,
490
- "loss": 0.2446,
491
  "step": 650
492
  },
493
  {
494
  "epoch": 3.3153942428035044,
495
- "grad_norm": 2.2266604900360107,
496
- "learning_rate": 1.952513966480447e-05,
497
- "loss": 0.2613,
498
  "step": 660
499
  },
500
  {
501
  "epoch": 3.365456821026283,
502
- "grad_norm": 3.621093511581421,
503
- "learning_rate": 1.9315642458100558e-05,
504
- "loss": 0.2512,
505
  "step": 670
506
  },
507
  {
508
  "epoch": 3.415519399249061,
509
- "grad_norm": 2.952971935272217,
510
- "learning_rate": 1.9106145251396648e-05,
511
- "loss": 0.2536,
512
  "step": 680
513
  },
514
  {
515
  "epoch": 3.4655819774718397,
516
- "grad_norm": 2.7361905574798584,
517
- "learning_rate": 1.889664804469274e-05,
518
- "loss": 0.2396,
519
  "step": 690
520
  },
521
  {
522
  "epoch": 3.5156445556946183,
523
- "grad_norm": 2.3844313621520996,
524
- "learning_rate": 1.8687150837988827e-05,
525
- "loss": 0.2518,
526
  "step": 700
527
  },
528
  {
529
  "epoch": 3.565707133917397,
530
- "grad_norm": 3.0508193969726562,
531
- "learning_rate": 1.8477653631284917e-05,
532
- "loss": 0.2325,
533
  "step": 710
534
  },
535
  {
536
  "epoch": 3.615769712140175,
537
- "grad_norm": 3.923941135406494,
538
- "learning_rate": 1.8268156424581006e-05,
539
- "loss": 0.2277,
540
  "step": 720
541
  },
542
  {
543
  "epoch": 3.6658322903629537,
544
- "grad_norm": 2.638787031173706,
545
- "learning_rate": 1.8058659217877096e-05,
546
- "loss": 0.2292,
547
  "step": 730
548
  },
549
  {
550
  "epoch": 3.7158948685857323,
551
- "grad_norm": 2.75313138961792,
552
- "learning_rate": 1.7849162011173182e-05,
553
- "loss": 0.2364,
554
  "step": 740
555
  },
556
  {
557
  "epoch": 3.7659574468085104,
558
- "grad_norm": 3.686354398727417,
559
- "learning_rate": 1.7639664804469275e-05,
560
- "loss": 0.2409,
561
  "step": 750
562
  },
563
  {
564
  "epoch": 3.816020025031289,
565
- "grad_norm": 4.230103969573975,
566
- "learning_rate": 1.7430167597765365e-05,
567
- "loss": 0.2293,
568
  "step": 760
569
  },
570
  {
571
  "epoch": 3.8660826032540676,
572
- "grad_norm": 4.4972100257873535,
573
- "learning_rate": 1.722067039106145e-05,
574
- "loss": 0.2431,
575
  "step": 770
576
  },
577
  {
578
  "epoch": 3.916145181476846,
579
- "grad_norm": 3.6224372386932373,
580
- "learning_rate": 1.7011173184357544e-05,
581
- "loss": 0.2099,
582
  "step": 780
583
  },
584
  {
585
  "epoch": 3.966207759699625,
586
- "grad_norm": 3.072998285293579,
587
- "learning_rate": 1.680167597765363e-05,
588
- "loss": 0.2273,
589
  "step": 790
590
  },
591
  {
592
  "epoch": 3.9962453066332917,
593
- "eval_accuracy": 0.9771991762283024,
594
- "eval_loss": 0.14364813268184662,
595
- "eval_runtime": 4.9858,
596
- "eval_samples_per_second": 1363.468,
597
- "eval_steps_per_second": 42.721,
598
  "step": 796
599
  },
600
  {
601
  "epoch": 4.020025031289111,
602
- "grad_norm": 3.3532874584198,
603
- "learning_rate": 1.659217877094972e-05,
604
- "loss": 0.23,
605
  "step": 800
606
  },
607
  {
608
  "epoch": 4.07008760951189,
609
- "grad_norm": 2.384181261062622,
610
- "learning_rate": 1.638268156424581e-05,
611
- "loss": 0.2157,
612
  "step": 810
613
  },
614
  {
615
  "epoch": 4.1201501877346685,
616
- "grad_norm": 4.237877368927002,
617
- "learning_rate": 1.61731843575419e-05,
618
- "loss": 0.2141,
619
  "step": 820
620
  },
621
  {
622
  "epoch": 4.170212765957447,
623
- "grad_norm": 3.8752825260162354,
624
- "learning_rate": 1.5963687150837986e-05,
625
- "loss": 0.214,
626
  "step": 830
627
  },
628
  {
629
  "epoch": 4.220275344180226,
630
- "grad_norm": 2.571617364883423,
631
- "learning_rate": 1.575418994413408e-05,
632
- "loss": 0.2124,
633
  "step": 840
634
  },
635
  {
636
  "epoch": 4.270337922403003,
637
- "grad_norm": 1.8986117839813232,
638
- "learning_rate": 1.554469273743017e-05,
639
- "loss": 0.2142,
640
  "step": 850
641
  },
642
  {
643
  "epoch": 4.320400500625782,
644
- "grad_norm": 3.71712589263916,
645
- "learning_rate": 1.533519553072626e-05,
646
- "loss": 0.1939,
647
  "step": 860
648
  },
649
  {
650
  "epoch": 4.370463078848561,
651
- "grad_norm": 2.1387696266174316,
652
- "learning_rate": 1.5125698324022348e-05,
653
- "loss": 0.1905,
654
  "step": 870
655
  },
656
  {
657
  "epoch": 4.420525657071339,
658
- "grad_norm": 3.3053841590881348,
659
- "learning_rate": 1.4916201117318435e-05,
660
- "loss": 0.2156,
661
  "step": 880
662
  },
663
  {
664
  "epoch": 4.470588235294118,
665
- "grad_norm": 2.574657440185547,
666
- "learning_rate": 1.4706703910614526e-05,
667
- "loss": 0.198,
668
  "step": 890
669
  },
670
  {
671
  "epoch": 4.520650813516896,
672
- "grad_norm": 2.290309429168701,
673
- "learning_rate": 1.4497206703910616e-05,
674
- "loss": 0.1754,
675
  "step": 900
676
  },
677
  {
678
  "epoch": 4.570713391739675,
679
- "grad_norm": 2.4253950119018555,
680
- "learning_rate": 1.4287709497206705e-05,
681
- "loss": 0.1971,
682
  "step": 910
683
  },
684
  {
685
  "epoch": 4.6207759699624535,
686
- "grad_norm": 3.070322275161743,
687
- "learning_rate": 1.4078212290502793e-05,
688
- "loss": 0.1928,
689
  "step": 920
690
  },
691
  {
692
  "epoch": 4.670838548185231,
693
- "grad_norm": 1.5523446798324585,
694
- "learning_rate": 1.3868715083798883e-05,
695
- "loss": 0.1948,
696
  "step": 930
697
  },
698
  {
699
  "epoch": 4.72090112640801,
700
- "grad_norm": 3.076679229736328,
701
- "learning_rate": 1.3659217877094973e-05,
702
- "loss": 0.1822,
703
  "step": 940
704
  },
705
  {
706
  "epoch": 4.7709637046307884,
707
- "grad_norm": 3.6825084686279297,
708
- "learning_rate": 1.344972067039106e-05,
709
- "loss": 0.1954,
710
  "step": 950
711
  },
712
  {
713
  "epoch": 4.821026282853567,
714
- "grad_norm": 4.037261009216309,
715
- "learning_rate": 1.324022346368715e-05,
716
- "loss": 0.1946,
717
  "step": 960
718
  },
719
  {
720
  "epoch": 4.871088861076346,
721
- "grad_norm": 3.026543378829956,
722
- "learning_rate": 1.3030726256983242e-05,
723
- "loss": 0.2155,
724
  "step": 970
725
  },
726
  {
727
  "epoch": 4.921151439299124,
728
- "grad_norm": 2.0362164974212646,
729
- "learning_rate": 1.282122905027933e-05,
730
- "loss": 0.1851,
731
  "step": 980
732
  },
733
  {
734
  "epoch": 4.971214017521902,
735
- "grad_norm": 1.8788719177246094,
736
- "learning_rate": 1.261173184357542e-05,
737
- "loss": 0.1866,
738
  "step": 990
739
  },
740
  {
741
  "epoch": 4.996245306633291,
742
- "eval_accuracy": 0.981759340982642,
743
- "eval_loss": 0.11026876419782639,
744
- "eval_runtime": 4.7749,
745
- "eval_samples_per_second": 1423.702,
746
- "eval_steps_per_second": 44.608,
747
  "step": 995
748
  },
749
  {
750
  "epoch": 5.025031289111389,
751
- "grad_norm": 3.612476110458374,
752
- "learning_rate": 1.2402234636871509e-05,
753
- "loss": 0.1911,
754
  "step": 1000
755
  },
756
  {
757
  "epoch": 5.075093867334168,
758
- "grad_norm": 1.3886641263961792,
759
- "learning_rate": 1.2192737430167599e-05,
760
- "loss": 0.1768,
761
  "step": 1010
762
  },
763
  {
764
  "epoch": 5.1251564455569465,
765
- "grad_norm": 3.217656135559082,
766
- "learning_rate": 1.1983240223463687e-05,
767
- "loss": 0.1835,
768
  "step": 1020
769
  },
770
  {
771
  "epoch": 5.175219023779725,
772
- "grad_norm": 2.281695604324341,
773
- "learning_rate": 1.1773743016759776e-05,
774
- "loss": 0.1858,
775
  "step": 1030
776
  },
777
  {
778
  "epoch": 5.225281602002503,
779
- "grad_norm": 2.7055630683898926,
780
- "learning_rate": 1.1564245810055866e-05,
781
- "loss": 0.1718,
782
  "step": 1040
783
  },
784
  {
785
  "epoch": 5.275344180225281,
786
- "grad_norm": 2.00937819480896,
787
- "learning_rate": 1.1354748603351954e-05,
788
- "loss": 0.1479,
789
  "step": 1050
790
  },
791
  {
792
  "epoch": 5.32540675844806,
793
- "grad_norm": 2.65446138381958,
794
- "learning_rate": 1.1145251396648046e-05,
795
- "loss": 0.1664,
796
  "step": 1060
797
  },
798
  {
799
  "epoch": 5.375469336670839,
800
- "grad_norm": 2.499176502227783,
801
- "learning_rate": 1.0935754189944135e-05,
802
- "loss": 0.1882,
803
  "step": 1070
804
  },
805
  {
806
  "epoch": 5.425531914893617,
807
- "grad_norm": 3.318516492843628,
808
- "learning_rate": 1.0726256983240223e-05,
809
- "loss": 0.1823,
810
  "step": 1080
811
  },
812
  {
813
  "epoch": 5.475594493116396,
814
- "grad_norm": 2.1236233711242676,
815
- "learning_rate": 1.0516759776536313e-05,
816
- "loss": 0.1657,
817
  "step": 1090
818
  },
819
  {
820
  "epoch": 5.5256570713391735,
821
- "grad_norm": 3.342689037322998,
822
- "learning_rate": 1.0307262569832403e-05,
823
- "loss": 0.1823,
824
  "step": 1100
825
  },
826
  {
827
  "epoch": 5.575719649561952,
828
- "grad_norm": 2.687920331954956,
829
- "learning_rate": 1.0097765363128492e-05,
830
- "loss": 0.1888,
831
  "step": 1110
832
  },
833
  {
834
  "epoch": 5.625782227784731,
835
- "grad_norm": 3.692422866821289,
836
- "learning_rate": 9.88826815642458e-06,
837
- "loss": 0.2053,
838
  "step": 1120
839
  },
840
  {
841
  "epoch": 5.675844806007509,
842
- "grad_norm": 3.453005790710449,
843
- "learning_rate": 9.67877094972067e-06,
844
- "loss": 0.179,
845
  "step": 1130
846
  },
847
  {
848
  "epoch": 5.725907384230288,
849
- "grad_norm": 4.005608081817627,
850
- "learning_rate": 9.46927374301676e-06,
851
- "loss": 0.1748,
852
  "step": 1140
853
  },
854
  {
855
  "epoch": 5.7759699624530665,
856
- "grad_norm": 2.1113505363464355,
857
- "learning_rate": 9.25977653631285e-06,
858
- "loss": 0.1574,
859
  "step": 1150
860
  },
861
  {
862
  "epoch": 5.826032540675845,
863
- "grad_norm": 4.529311180114746,
864
- "learning_rate": 9.050279329608939e-06,
865
- "loss": 0.1599,
866
  "step": 1160
867
  },
868
  {
869
  "epoch": 5.876095118898624,
870
- "grad_norm": 1.885956048965454,
871
- "learning_rate": 8.840782122905029e-06,
872
- "loss": 0.1909,
873
  "step": 1170
874
  },
875
  {
876
  "epoch": 5.926157697121401,
877
- "grad_norm": 2.369316816329956,
878
- "learning_rate": 8.631284916201118e-06,
879
- "loss": 0.1603,
880
  "step": 1180
881
  },
882
  {
883
  "epoch": 5.97622027534418,
884
- "grad_norm": 1.402648687362671,
885
- "learning_rate": 8.421787709497206e-06,
886
- "loss": 0.1616,
887
  "step": 1190
888
  },
889
  {
890
  "epoch": 5.996245306633291,
891
- "eval_accuracy": 0.9819064430714917,
892
- "eval_loss": 0.0981006771326065,
893
- "eval_runtime": 4.775,
894
- "eval_samples_per_second": 1423.653,
895
- "eval_steps_per_second": 44.607,
896
  "step": 1194
897
  },
898
  {
899
  "epoch": 6.030037546933667,
900
- "grad_norm": 1.6693238019943237,
901
- "learning_rate": 8.212290502793296e-06,
902
- "loss": 0.1931,
903
  "step": 1200
904
  },
905
  {
906
  "epoch": 6.080100125156446,
907
- "grad_norm": 2.3462257385253906,
908
- "learning_rate": 8.002793296089386e-06,
909
- "loss": 0.1506,
910
  "step": 1210
911
  },
912
  {
913
  "epoch": 6.130162703379224,
914
- "grad_norm": 1.6939945220947266,
915
- "learning_rate": 7.793296089385474e-06,
916
- "loss": 0.1677,
917
  "step": 1220
918
  },
919
  {
920
  "epoch": 6.180225281602002,
921
- "grad_norm": 1.728092908859253,
922
- "learning_rate": 7.583798882681565e-06,
923
- "loss": 0.1569,
924
  "step": 1230
925
  },
926
  {
927
  "epoch": 6.230287859824781,
928
- "grad_norm": 1.6664111614227295,
929
- "learning_rate": 7.374301675977653e-06,
930
- "loss": 0.1564,
931
  "step": 1240
932
  },
933
  {
934
  "epoch": 6.280350438047559,
935
- "grad_norm": 2.0160274505615234,
936
- "learning_rate": 7.164804469273744e-06,
937
- "loss": 0.1513,
938
  "step": 1250
939
  },
940
  {
941
  "epoch": 6.330413016270338,
942
- "grad_norm": 4.013051986694336,
943
- "learning_rate": 6.9553072625698325e-06,
944
- "loss": 0.1594,
945
  "step": 1260
946
  },
947
  {
948
  "epoch": 6.380475594493117,
949
- "grad_norm": 3.11110258102417,
950
- "learning_rate": 6.745810055865922e-06,
951
- "loss": 0.1445,
952
  "step": 1270
953
  },
954
  {
955
  "epoch": 6.430538172715895,
956
- "grad_norm": 3.418999433517456,
957
- "learning_rate": 6.536312849162011e-06,
958
- "loss": 0.1679,
959
  "step": 1280
960
  },
961
  {
962
  "epoch": 6.480600750938673,
963
- "grad_norm": 2.4514362812042236,
964
- "learning_rate": 6.326815642458101e-06,
965
- "loss": 0.152,
966
  "step": 1290
967
  },
968
  {
969
  "epoch": 6.5306633291614515,
970
- "grad_norm": 3.2242462635040283,
971
- "learning_rate": 6.1173184357541904e-06,
972
- "loss": 0.1676,
973
  "step": 1300
974
  },
975
  {
976
  "epoch": 6.58072590738423,
977
- "grad_norm": 4.046393871307373,
978
- "learning_rate": 5.907821229050279e-06,
979
- "loss": 0.1615,
980
  "step": 1310
981
  },
982
  {
983
  "epoch": 6.630788485607009,
984
- "grad_norm": 1.9088122844696045,
985
- "learning_rate": 5.698324022346369e-06,
986
- "loss": 0.1465,
987
  "step": 1320
988
  },
989
  {
990
  "epoch": 6.680851063829787,
991
- "grad_norm": 2.5699033737182617,
992
- "learning_rate": 5.488826815642458e-06,
993
- "loss": 0.1472,
994
  "step": 1330
995
  },
996
  {
997
  "epoch": 6.730913642052566,
998
- "grad_norm": 1.9140872955322266,
999
- "learning_rate": 5.2793296089385475e-06,
1000
- "loss": 0.1585,
1001
  "step": 1340
1002
  },
1003
  {
1004
  "epoch": 6.7809762202753445,
1005
- "grad_norm": 2.022095203399658,
1006
- "learning_rate": 5.069832402234637e-06,
1007
- "loss": 0.1455,
1008
  "step": 1350
1009
  },
1010
  {
1011
  "epoch": 6.831038798498122,
1012
- "grad_norm": 2.5366971492767334,
1013
- "learning_rate": 4.860335195530726e-06,
1014
- "loss": 0.1542,
1015
  "step": 1360
1016
  },
1017
  {
1018
  "epoch": 6.881101376720901,
1019
- "grad_norm": 1.6112697124481201,
1020
- "learning_rate": 4.650837988826816e-06,
1021
- "loss": 0.1569,
1022
  "step": 1370
1023
  },
1024
  {
1025
  "epoch": 6.931163954943679,
1026
- "grad_norm": 2.8735201358795166,
1027
- "learning_rate": 4.441340782122905e-06,
1028
- "loss": 0.1628,
1029
  "step": 1380
1030
  },
1031
  {
1032
  "epoch": 6.981226533166458,
1033
- "grad_norm": 2.044304132461548,
1034
- "learning_rate": 4.231843575418994e-06,
1035
- "loss": 0.1385,
1036
  "step": 1390
1037
  },
1038
  {
1039
  "epoch": 6.996245306633291,
1040
- "eval_accuracy": 0.9830832597822889,
1041
- "eval_loss": 0.0956372618675232,
1042
- "eval_runtime": 4.8597,
1043
- "eval_samples_per_second": 1398.844,
1044
- "eval_steps_per_second": 43.83,
1045
  "step": 1393
1046
  },
1047
  {
1048
  "epoch": 7.035043804755945,
1049
- "grad_norm": 1.4923343658447266,
1050
- "learning_rate": 4.022346368715084e-06,
1051
- "loss": 0.1558,
1052
  "step": 1400
1053
  },
1054
  {
1055
  "epoch": 7.085106382978723,
1056
- "grad_norm": 3.339181423187256,
1057
- "learning_rate": 3.812849162011173e-06,
1058
- "loss": 0.1391,
1059
  "step": 1410
1060
  },
1061
  {
1062
  "epoch": 7.135168961201502,
1063
- "grad_norm": 2.091777801513672,
1064
- "learning_rate": 3.6033519553072625e-06,
1065
- "loss": 0.1541,
1066
  "step": 1420
1067
  },
1068
  {
1069
  "epoch": 7.18523153942428,
1070
- "grad_norm": 2.6580100059509277,
1071
- "learning_rate": 3.393854748603352e-06,
1072
- "loss": 0.1379,
1073
  "step": 1430
1074
  },
1075
  {
1076
  "epoch": 7.235294117647059,
1077
- "grad_norm": 2.4065537452697754,
1078
- "learning_rate": 3.1843575418994414e-06,
1079
- "loss": 0.1475,
1080
  "step": 1440
1081
  },
1082
  {
1083
  "epoch": 7.2853566958698375,
1084
- "grad_norm": 2.618218183517456,
1085
- "learning_rate": 2.974860335195531e-06,
1086
- "loss": 0.1828,
1087
  "step": 1450
1088
  },
1089
  {
1090
  "epoch": 7.335419274092616,
1091
- "grad_norm": 3.5904743671417236,
1092
- "learning_rate": 2.7653631284916204e-06,
1093
- "loss": 0.1365,
1094
  "step": 1460
1095
  },
1096
  {
1097
  "epoch": 7.385481852315394,
1098
- "grad_norm": 2.245260000228882,
1099
- "learning_rate": 2.555865921787709e-06,
1100
- "loss": 0.1394,
1101
  "step": 1470
1102
  },
1103
  {
1104
  "epoch": 7.435544430538172,
1105
- "grad_norm": 2.558086395263672,
1106
- "learning_rate": 2.346368715083799e-06,
1107
- "loss": 0.1533,
1108
  "step": 1480
1109
  },
1110
  {
1111
  "epoch": 7.485607008760951,
1112
- "grad_norm": 2.851020097732544,
1113
- "learning_rate": 2.136871508379888e-06,
1114
- "loss": 0.1313,
1115
  "step": 1490
1116
  },
1117
  {
1118
  "epoch": 7.53566958698373,
1119
- "grad_norm": 1.7011760473251343,
1120
- "learning_rate": 1.927374301675978e-06,
1121
- "loss": 0.1509,
1122
  "step": 1500
1123
  },
1124
  {
1125
  "epoch": 7.585732165206508,
1126
- "grad_norm": 2.6264467239379883,
1127
- "learning_rate": 1.717877094972067e-06,
1128
- "loss": 0.1515,
1129
  "step": 1510
1130
  },
1131
  {
1132
  "epoch": 7.635794743429287,
1133
- "grad_norm": 1.6332521438598633,
1134
- "learning_rate": 1.5083798882681566e-06,
1135
- "loss": 0.1489,
1136
  "step": 1520
1137
  },
1138
  {
1139
  "epoch": 7.685857321652065,
1140
- "grad_norm": 2.0622401237487793,
1141
- "learning_rate": 1.2988826815642458e-06,
1142
- "loss": 0.1594,
1143
  "step": 1530
1144
  },
1145
  {
1146
  "epoch": 7.735919899874844,
1147
- "grad_norm": 2.3861618041992188,
1148
- "learning_rate": 1.0893854748603353e-06,
1149
- "loss": 0.1669,
1150
  "step": 1540
1151
  },
1152
  {
1153
  "epoch": 7.785982478097622,
1154
- "grad_norm": 4.30822229385376,
1155
- "learning_rate": 8.798882681564246e-07,
1156
- "loss": 0.1759,
1157
  "step": 1550
1158
  },
1159
  {
1160
  "epoch": 7.8360450563204,
1161
- "grad_norm": 1.4631046056747437,
1162
- "learning_rate": 6.70391061452514e-07,
1163
- "loss": 0.1645,
1164
  "step": 1560
1165
  },
1166
  {
1167
  "epoch": 7.886107634543179,
1168
- "grad_norm": 2.452613115310669,
1169
- "learning_rate": 4.608938547486033e-07,
1170
- "loss": 0.1575,
1171
  "step": 1570
1172
  },
1173
  {
1174
  "epoch": 7.9361702127659575,
1175
- "grad_norm": 1.279895305633545,
1176
- "learning_rate": 2.5139664804469275e-07,
1177
- "loss": 0.149,
1178
  "step": 1580
1179
  },
1180
  {
1181
  "epoch": 7.986232790988736,
1182
- "grad_norm": 2.141481399536133,
1183
- "learning_rate": 4.189944134078212e-08,
1184
- "loss": 0.1524,
1185
  "step": 1590
1186
  },
1187
  {
1188
  "epoch": 7.996245306633291,
1189
- "eval_accuracy": 0.9824948514268903,
1190
- "eval_loss": 0.09257339686155319,
1191
- "eval_runtime": 5.6151,
1192
- "eval_samples_per_second": 1210.67,
1193
- "eval_steps_per_second": 37.934,
1194
  "step": 1592
1195
  },
1196
  {
1197
  "epoch": 7.996245306633291,
1198
  "step": 1592,
1199
  "total_flos": 3.777723239743488e+18,
1200
- "train_loss": 0.596273283347787,
1201
- "train_runtime": 640.7753,
1202
- "train_samples_per_second": 637.902,
1203
- "train_steps_per_second": 2.484
1204
  }
1205
  ],
1206
  "logging_steps": 10,
 
1
  {
2
+ "best_metric": 0.9763165636952045,
3
+ "best_model_checkpoint": "wav2vec2-base-ft-keyword-spotting/checkpoint-1592",
4
  "epoch": 7.996245306633291,
5
  "eval_steps": 500,
6
  "global_step": 1592,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05006257822277847,
13
+ "grad_norm": 2.2142410278320312,
14
+ "learning_rate": 6.25e-07,
15
+ "loss": 4.0997,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.10012515644555695,
20
+ "grad_norm": 2.4960668087005615,
21
+ "learning_rate": 1.25e-06,
22
+ "loss": 4.1744,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.15018773466833543,
27
+ "grad_norm": 2.9901962280273438,
28
+ "learning_rate": 1.8750000000000003e-06,
29
+ "loss": 4.1686,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.2002503128911139,
34
+ "grad_norm": 2.9945929050445557,
35
+ "learning_rate": 2.5e-06,
36
+ "loss": 4.1114,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.2503128911138924,
41
+ "grad_norm": 3.4015228748321533,
42
+ "learning_rate": 3.125e-06,
43
+ "loss": 4.0183,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.30037546933667086,
48
+ "grad_norm": 4.165560722351074,
49
+ "learning_rate": 3.7500000000000005e-06,
50
+ "loss": 3.875,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.3504380475594493,
55
+ "grad_norm": 4.998468399047852,
56
+ "learning_rate": 4.3750000000000005e-06,
57
+ "loss": 3.6691,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.4005006257822278,
62
+ "grad_norm": 5.729038238525391,
63
+ "learning_rate": 5e-06,
64
+ "loss": 3.3538,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.45056320400500627,
69
+ "grad_norm": 5.879266738891602,
70
+ "learning_rate": 5.625e-06,
71
+ "loss": 2.9866,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.5006257822277848,
76
+ "grad_norm": 5.562048435211182,
77
+ "learning_rate": 6.25e-06,
78
+ "loss": 2.6957,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.5506883604505632,
83
+ "grad_norm": 5.61751651763916,
84
+ "learning_rate": 6.875e-06,
85
+ "loss": 2.4759,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.6007509386733417,
90
+ "grad_norm": 4.866910457611084,
91
+ "learning_rate": 7.500000000000001e-06,
92
+ "loss": 2.358,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.6508135168961201,
97
+ "grad_norm": 4.472853183746338,
98
+ "learning_rate": 8.125000000000001e-06,
99
+ "loss": 2.2576,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.7008760951188986,
104
+ "grad_norm": 4.833339691162109,
105
+ "learning_rate": 8.750000000000001e-06,
106
+ "loss": 2.1076,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.7509386733416771,
111
+ "grad_norm": 4.425817966461182,
112
+ "learning_rate": 9.375000000000001e-06,
113
+ "loss": 2.0808,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.8010012515644556,
118
+ "grad_norm": 3.747729539871216,
119
+ "learning_rate": 1e-05,
120
+ "loss": 2.0467,
121
  "step": 160
122
  },
123
  {
124
  "epoch": 0.851063829787234,
125
+ "grad_norm": 3.6987853050231934,
126
+ "learning_rate": 9.930167597765364e-06,
127
+ "loss": 1.9358,
128
  "step": 170
129
  },
130
  {
131
  "epoch": 0.9011264080100125,
132
+ "grad_norm": 3.0652871131896973,
133
+ "learning_rate": 9.860335195530727e-06,
134
+ "loss": 1.904,
135
  "step": 180
136
  },
137
  {
138
  "epoch": 0.951188986232791,
139
+ "grad_norm": 2.3470098972320557,
140
+ "learning_rate": 9.79050279329609e-06,
141
+ "loss": 1.8801,
142
  "step": 190
143
  },
144
  {
145
  "epoch": 0.9962453066332916,
146
  "eval_accuracy": 0.6209179170344219,
147
+ "eval_loss": 1.7453958988189697,
148
+ "eval_runtime": 4.9497,
149
+ "eval_samples_per_second": 1373.405,
150
+ "eval_steps_per_second": 43.033,
151
  "step": 199
152
  },
153
  {
154
  "epoch": 1.0050062578222778,
155
+ "grad_norm": 2.7552437782287598,
156
+ "learning_rate": 9.720670391061454e-06,
157
+ "loss": 1.9461,
158
  "step": 200
159
  },
160
  {
161
  "epoch": 1.0550688360450564,
162
+ "grad_norm": 1.5700269937515259,
163
+ "learning_rate": 9.650837988826817e-06,
164
+ "loss": 1.7812,
165
  "step": 210
166
  },
167
  {
168
  "epoch": 1.1051314142678348,
169
+ "grad_norm": 1.4513113498687744,
170
+ "learning_rate": 9.581005586592178e-06,
171
+ "loss": 1.7865,
172
  "step": 220
173
  },
174
  {
175
  "epoch": 1.1551939924906134,
176
+ "grad_norm": 1.5353403091430664,
177
+ "learning_rate": 9.511173184357543e-06,
178
+ "loss": 1.7024,
179
  "step": 230
180
  },
181
  {
182
  "epoch": 1.2052565707133918,
183
+ "grad_norm": 1.4006606340408325,
184
+ "learning_rate": 9.441340782122905e-06,
185
+ "loss": 1.7307,
186
  "step": 240
187
  },
188
  {
189
  "epoch": 1.2553191489361701,
190
+ "grad_norm": 2.397796630859375,
191
+ "learning_rate": 9.371508379888268e-06,
192
+ "loss": 1.7121,
193
  "step": 250
194
  },
195
  {
196
  "epoch": 1.3053817271589487,
197
+ "grad_norm": 2.891803503036499,
198
+ "learning_rate": 9.301675977653633e-06,
199
+ "loss": 1.7195,
200
  "step": 260
201
  },
202
  {
203
  "epoch": 1.355444305381727,
204
+ "grad_norm": 13.237679481506348,
205
+ "learning_rate": 9.231843575418995e-06,
206
+ "loss": 1.6628,
207
  "step": 270
208
  },
209
  {
210
  "epoch": 1.4055068836045057,
211
+ "grad_norm": 4.088449478149414,
212
+ "learning_rate": 9.162011173184358e-06,
213
+ "loss": 1.6614,
214
  "step": 280
215
  },
216
  {
217
  "epoch": 1.455569461827284,
218
+ "grad_norm": 4.54097318649292,
219
+ "learning_rate": 9.092178770949721e-06,
220
+ "loss": 1.5913,
221
  "step": 290
222
  },
223
  {
224
  "epoch": 1.5056320400500627,
225
+ "grad_norm": 2.6582536697387695,
226
+ "learning_rate": 9.022346368715084e-06,
227
+ "loss": 1.5883,
228
  "step": 300
229
  },
230
  {
231
  "epoch": 1.555694618272841,
232
+ "grad_norm": 5.469503402709961,
233
+ "learning_rate": 8.952513966480448e-06,
234
+ "loss": 1.6299,
235
  "step": 310
236
  },
237
  {
238
  "epoch": 1.6057571964956194,
239
+ "grad_norm": 2.4134175777435303,
240
+ "learning_rate": 8.88268156424581e-06,
241
+ "loss": 1.5386,
242
  "step": 320
243
  },
244
  {
245
  "epoch": 1.655819774718398,
246
+ "grad_norm": 5.472087860107422,
247
+ "learning_rate": 8.812849162011174e-06,
248
+ "loss": 1.5774,
249
  "step": 330
250
  },
251
  {
252
  "epoch": 1.7058823529411766,
253
+ "grad_norm": 3.7888548374176025,
254
+ "learning_rate": 8.743016759776537e-06,
255
+ "loss": 1.5314,
256
  "step": 340
257
  },
258
  {
259
  "epoch": 1.7559449311639548,
260
+ "grad_norm": 6.527620315551758,
261
+ "learning_rate": 8.6731843575419e-06,
262
+ "loss": 1.4678,
263
  "step": 350
264
  },
265
  {
266
  "epoch": 1.8060075093867334,
267
+ "grad_norm": 6.796471118927002,
268
+ "learning_rate": 8.603351955307264e-06,
269
+ "loss": 1.4769,
270
  "step": 360
271
  },
272
  {
273
  "epoch": 1.856070087609512,
274
+ "grad_norm": 4.604396343231201,
275
+ "learning_rate": 8.533519553072627e-06,
276
+ "loss": 1.4285,
277
  "step": 370
278
  },
279
  {
280
  "epoch": 1.9061326658322904,
281
+ "grad_norm": 5.599514484405518,
282
+ "learning_rate": 8.463687150837988e-06,
283
+ "loss": 1.3812,
284
  "step": 380
285
  },
286
  {
287
  "epoch": 1.9561952440550687,
288
+ "grad_norm": 13.588865280151367,
289
+ "learning_rate": 8.393854748603353e-06,
290
+ "loss": 1.389,
291
  "step": 390
292
  },
293
  {
294
  "epoch": 1.9962453066332917,
295
+ "eval_accuracy": 0.6518093556928508,
296
+ "eval_loss": 1.240417718887329,
297
+ "eval_runtime": 4.8608,
298
+ "eval_samples_per_second": 1398.548,
299
+ "eval_steps_per_second": 43.82,
300
  "step": 398
301
  },
302
  {
303
  "epoch": 2.0100125156445556,
304
+ "grad_norm": 5.914628028869629,
305
+ "learning_rate": 8.324022346368715e-06,
306
+ "loss": 1.4118,
307
  "step": 400
308
  },
309
  {
310
  "epoch": 2.0600750938673342,
311
+ "grad_norm": 9.35991096496582,
312
+ "learning_rate": 8.254189944134078e-06,
313
+ "loss": 1.3324,
314
  "step": 410
315
  },
316
  {
317
  "epoch": 2.110137672090113,
318
+ "grad_norm": 7.8319220542907715,
319
+ "learning_rate": 8.184357541899443e-06,
320
+ "loss": 1.3393,
321
  "step": 420
322
  },
323
  {
324
  "epoch": 2.160200250312891,
325
+ "grad_norm": 3.685518264770508,
326
+ "learning_rate": 8.114525139664805e-06,
327
+ "loss": 1.3176,
328
  "step": 430
329
  },
330
  {
331
  "epoch": 2.2102628285356696,
332
+ "grad_norm": 4.464322566986084,
333
+ "learning_rate": 8.044692737430168e-06,
334
+ "loss": 1.2491,
335
  "step": 440
336
  },
337
  {
338
  "epoch": 2.260325406758448,
339
+ "grad_norm": 6.714467525482178,
340
+ "learning_rate": 7.974860335195531e-06,
341
+ "loss": 1.2923,
342
  "step": 450
343
  },
344
  {
345
  "epoch": 2.3103879849812268,
346
+ "grad_norm": 3.9189820289611816,
347
+ "learning_rate": 7.905027932960894e-06,
348
+ "loss": 1.2584,
349
  "step": 460
350
  },
351
  {
352
  "epoch": 2.360450563204005,
353
+ "grad_norm": 10.34443473815918,
354
+ "learning_rate": 7.835195530726258e-06,
355
+ "loss": 1.2372,
356
  "step": 470
357
  },
358
  {
359
  "epoch": 2.4105131414267835,
360
+ "grad_norm": 3.698418140411377,
361
+ "learning_rate": 7.76536312849162e-06,
362
+ "loss": 1.225,
363
  "step": 480
364
  },
365
  {
366
  "epoch": 2.460575719649562,
367
+ "grad_norm": 8.271072387695312,
368
+ "learning_rate": 7.695530726256984e-06,
369
+ "loss": 1.1353,
370
  "step": 490
371
  },
372
  {
373
  "epoch": 2.5106382978723403,
374
+ "grad_norm": 3.2551236152648926,
375
+ "learning_rate": 7.625698324022347e-06,
376
+ "loss": 1.1894,
377
  "step": 500
378
  },
379
  {
380
  "epoch": 2.560700876095119,
381
+ "grad_norm": 7.808169364929199,
382
+ "learning_rate": 7.5558659217877105e-06,
383
+ "loss": 1.2102,
384
  "step": 510
385
  },
386
  {
387
  "epoch": 2.6107634543178975,
388
+ "grad_norm": 10.614561080932617,
389
+ "learning_rate": 7.486033519553073e-06,
390
+ "loss": 1.1493,
391
  "step": 520
392
  },
393
  {
394
  "epoch": 2.660826032540676,
395
+ "grad_norm": 4.759502410888672,
396
+ "learning_rate": 7.416201117318437e-06,
397
+ "loss": 1.1384,
398
  "step": 530
399
  },
400
  {
401
  "epoch": 2.710888610763454,
402
+ "grad_norm": 8.49401569366455,
403
+ "learning_rate": 7.346368715083799e-06,
404
+ "loss": 1.0993,
405
  "step": 540
406
  },
407
  {
408
  "epoch": 2.760951188986233,
409
+ "grad_norm": 4.307897567749023,
410
+ "learning_rate": 7.2765363128491625e-06,
411
+ "loss": 1.1151,
412
  "step": 550
413
  },
414
  {
415
  "epoch": 2.8110137672090114,
416
+ "grad_norm": 4.177582740783691,
417
+ "learning_rate": 7.206703910614526e-06,
418
+ "loss": 1.1375,
419
  "step": 560
420
  },
421
  {
422
  "epoch": 2.8610763454317896,
423
+ "grad_norm": 3.8811707496643066,
424
+ "learning_rate": 7.136871508379889e-06,
425
+ "loss": 1.1109,
426
  "step": 570
427
  },
428
  {
429
  "epoch": 2.911138923654568,
430
+ "grad_norm": 2.818143129348755,
431
+ "learning_rate": 7.067039106145251e-06,
432
+ "loss": 1.1036,
433
  "step": 580
434
  },
435
  {
436
  "epoch": 2.9612015018773468,
437
+ "grad_norm": 3.4761757850646973,
438
+ "learning_rate": 6.9972067039106154e-06,
439
+ "loss": 1.1239,
440
  "step": 590
441
  },
442
  {
443
  "epoch": 2.9962453066332917,
444
+ "eval_accuracy": 0.7880258899676376,
445
+ "eval_loss": 1.069029688835144,
446
+ "eval_runtime": 4.802,
447
+ "eval_samples_per_second": 1415.653,
448
+ "eval_steps_per_second": 44.356,
449
  "step": 597
450
  },
451
  {
452
  "epoch": 3.0150187734668337,
453
+ "grad_norm": 4.441803455352783,
454
+ "learning_rate": 6.927374301675979e-06,
455
+ "loss": 1.1924,
456
  "step": 600
457
  },
458
  {
459
  "epoch": 3.065081351689612,
460
+ "grad_norm": 6.063540458679199,
461
+ "learning_rate": 6.857541899441341e-06,
462
+ "loss": 1.0628,
463
  "step": 610
464
  },
465
  {
466
  "epoch": 3.1151439299123904,
467
+ "grad_norm": 4.163120746612549,
468
+ "learning_rate": 6.787709497206705e-06,
469
+ "loss": 1.0444,
470
  "step": 620
471
  },
472
  {
473
  "epoch": 3.165206508135169,
474
+ "grad_norm": 6.099351406097412,
475
+ "learning_rate": 6.7178770949720675e-06,
476
+ "loss": 1.1028,
477
  "step": 630
478
  },
479
  {
480
  "epoch": 3.2152690863579476,
481
+ "grad_norm": 4.259471416473389,
482
+ "learning_rate": 6.648044692737431e-06,
483
+ "loss": 1.0317,
484
  "step": 640
485
  },
486
  {
487
  "epoch": 3.2653316645807258,
488
+ "grad_norm": 4.617487907409668,
489
+ "learning_rate": 6.578212290502793e-06,
490
+ "loss": 1.0323,
491
  "step": 650
492
  },
493
  {
494
  "epoch": 3.3153942428035044,
495
+ "grad_norm": 4.48543119430542,
496
+ "learning_rate": 6.508379888268157e-06,
497
+ "loss": 1.0185,
498
  "step": 660
499
  },
500
  {
501
  "epoch": 3.365456821026283,
502
+ "grad_norm": 4.388581275939941,
503
+ "learning_rate": 6.43854748603352e-06,
504
+ "loss": 0.9979,
505
  "step": 670
506
  },
507
  {
508
  "epoch": 3.415519399249061,
509
+ "grad_norm": 7.762565612792969,
510
+ "learning_rate": 6.368715083798883e-06,
511
+ "loss": 0.9999,
512
  "step": 680
513
  },
514
  {
515
  "epoch": 3.4655819774718397,
516
+ "grad_norm": 3.058173179626465,
517
+ "learning_rate": 6.298882681564247e-06,
518
+ "loss": 0.9729,
519
  "step": 690
520
  },
521
  {
522
  "epoch": 3.5156445556946183,
523
+ "grad_norm": 6.832275390625,
524
+ "learning_rate": 6.229050279329609e-06,
525
+ "loss": 1.0058,
526
  "step": 700
527
  },
528
  {
529
  "epoch": 3.565707133917397,
530
+ "grad_norm": 3.4098267555236816,
531
+ "learning_rate": 6.1592178770949725e-06,
532
+ "loss": 0.9634,
533
  "step": 710
534
  },
535
  {
536
  "epoch": 3.615769712140175,
537
+ "grad_norm": 5.360341548919678,
538
+ "learning_rate": 6.089385474860336e-06,
539
+ "loss": 0.9622,
540
  "step": 720
541
  },
542
  {
543
  "epoch": 3.6658322903629537,
544
+ "grad_norm": 3.9530041217803955,
545
+ "learning_rate": 6.019553072625699e-06,
546
+ "loss": 0.9585,
547
  "step": 730
548
  },
549
  {
550
  "epoch": 3.7158948685857323,
551
+ "grad_norm": 8.933693885803223,
552
+ "learning_rate": 5.949720670391061e-06,
553
+ "loss": 0.953,
554
  "step": 740
555
  },
556
  {
557
  "epoch": 3.7659574468085104,
558
+ "grad_norm": 3.6110997200012207,
559
+ "learning_rate": 5.879888268156425e-06,
560
+ "loss": 0.9176,
561
  "step": 750
562
  },
563
  {
564
  "epoch": 3.816020025031289,
565
+ "grad_norm": 3.877078056335449,
566
+ "learning_rate": 5.810055865921789e-06,
567
+ "loss": 0.9133,
568
  "step": 760
569
  },
570
  {
571
  "epoch": 3.8660826032540676,
572
+ "grad_norm": 6.531003952026367,
573
+ "learning_rate": 5.740223463687151e-06,
574
+ "loss": 0.9288,
575
  "step": 770
576
  },
577
  {
578
  "epoch": 3.916145181476846,
579
+ "grad_norm": 3.672001600265503,
580
+ "learning_rate": 5.670391061452515e-06,
581
+ "loss": 0.8899,
582
  "step": 780
583
  },
584
  {
585
  "epoch": 3.966207759699625,
586
+ "grad_norm": 3.4693164825439453,
587
+ "learning_rate": 5.6005586592178775e-06,
588
+ "loss": 0.9107,
589
  "step": 790
590
  },
591
  {
592
  "epoch": 3.9962453066332917,
593
+ "eval_accuracy": 0.8961459252721389,
594
+ "eval_loss": 0.7700026035308838,
595
+ "eval_runtime": 4.7597,
596
+ "eval_samples_per_second": 1428.236,
597
+ "eval_steps_per_second": 44.751,
598
  "step": 796
599
  },
600
  {
601
  "epoch": 4.020025031289111,
602
+ "grad_norm": 5.503833293914795,
603
+ "learning_rate": 5.530726256983241e-06,
604
+ "loss": 0.9085,
605
  "step": 800
606
  },
607
  {
608
  "epoch": 4.07008760951189,
609
+ "grad_norm": 5.365431785583496,
610
+ "learning_rate": 5.460893854748603e-06,
611
+ "loss": 0.908,
612
  "step": 810
613
  },
614
  {
615
  "epoch": 4.1201501877346685,
616
+ "grad_norm": 4.582799434661865,
617
+ "learning_rate": 5.391061452513967e-06,
618
+ "loss": 0.8559,
619
  "step": 820
620
  },
621
  {
622
  "epoch": 4.170212765957447,
623
+ "grad_norm": 3.201195478439331,
624
+ "learning_rate": 5.3212290502793296e-06,
625
+ "loss": 0.8901,
626
  "step": 830
627
  },
628
  {
629
  "epoch": 4.220275344180226,
630
+ "grad_norm": 6.477810382843018,
631
+ "learning_rate": 5.251396648044693e-06,
632
+ "loss": 0.88,
633
  "step": 840
634
  },
635
  {
636
  "epoch": 4.270337922403003,
637
+ "grad_norm": 3.525606870651245,
638
+ "learning_rate": 5.181564245810057e-06,
639
+ "loss": 0.8099,
640
  "step": 850
641
  },
642
  {
643
  "epoch": 4.320400500625782,
644
+ "grad_norm": 3.92795729637146,
645
+ "learning_rate": 5.111731843575419e-06,
646
+ "loss": 0.8326,
647
  "step": 860
648
  },
649
  {
650
  "epoch": 4.370463078848561,
651
+ "grad_norm": 4.019381523132324,
652
+ "learning_rate": 5.041899441340783e-06,
653
+ "loss": 0.7902,
654
  "step": 870
655
  },
656
  {
657
  "epoch": 4.420525657071339,
658
+ "grad_norm": 3.7563273906707764,
659
+ "learning_rate": 4.972067039106146e-06,
660
+ "loss": 0.8322,
661
  "step": 880
662
  },
663
  {
664
  "epoch": 4.470588235294118,
665
+ "grad_norm": 4.296588897705078,
666
+ "learning_rate": 4.902234636871509e-06,
667
+ "loss": 0.7835,
668
  "step": 890
669
  },
670
  {
671
  "epoch": 4.520650813516896,
672
+ "grad_norm": 4.3726277351379395,
673
+ "learning_rate": 4.832402234636872e-06,
674
+ "loss": 0.7505,
675
  "step": 900
676
  },
677
  {
678
  "epoch": 4.570713391739675,
679
+ "grad_norm": 3.2456297874450684,
680
+ "learning_rate": 4.762569832402235e-06,
681
+ "loss": 0.7588,
682
  "step": 910
683
  },
684
  {
685
  "epoch": 4.6207759699624535,
686
+ "grad_norm": 4.264461994171143,
687
+ "learning_rate": 4.692737430167599e-06,
688
+ "loss": 0.7566,
689
  "step": 920
690
  },
691
  {
692
  "epoch": 4.670838548185231,
693
+ "grad_norm": 3.387613296508789,
694
+ "learning_rate": 4.622905027932961e-06,
695
+ "loss": 0.7585,
696
  "step": 930
697
  },
698
  {
699
  "epoch": 4.72090112640801,
700
+ "grad_norm": 4.655552864074707,
701
+ "learning_rate": 4.553072625698324e-06,
702
+ "loss": 0.7171,
703
  "step": 940
704
  },
705
  {
706
  "epoch": 4.7709637046307884,
707
+ "grad_norm": 4.884917259216309,
708
+ "learning_rate": 4.4832402234636875e-06,
709
+ "loss": 0.789,
710
  "step": 950
711
  },
712
  {
713
  "epoch": 4.821026282853567,
714
+ "grad_norm": 5.926691055297852,
715
+ "learning_rate": 4.413407821229051e-06,
716
+ "loss": 0.7626,
717
  "step": 960
718
  },
719
  {
720
  "epoch": 4.871088861076346,
721
+ "grad_norm": 5.080448150634766,
722
+ "learning_rate": 4.343575418994414e-06,
723
+ "loss": 0.7362,
724
  "step": 970
725
  },
726
  {
727
  "epoch": 4.921151439299124,
728
+ "grad_norm": 3.3272018432617188,
729
+ "learning_rate": 4.273743016759777e-06,
730
+ "loss": 0.753,
731
  "step": 980
732
  },
733
  {
734
  "epoch": 4.971214017521902,
735
+ "grad_norm": 4.017044544219971,
736
+ "learning_rate": 4.20391061452514e-06,
737
+ "loss": 0.7231,
738
  "step": 990
739
  },
740
  {
741
  "epoch": 4.996245306633291,
742
+ "eval_accuracy": 0.9658723153868785,
743
+ "eval_loss": 0.6167460680007935,
744
+ "eval_runtime": 4.7239,
745
+ "eval_samples_per_second": 1439.062,
746
+ "eval_steps_per_second": 45.09,
747
  "step": 995
748
  },
749
  {
750
  "epoch": 5.025031289111389,
751
+ "grad_norm": 3.775956392288208,
752
+ "learning_rate": 4.134078212290504e-06,
753
+ "loss": 0.7468,
754
  "step": 1000
755
  },
756
  {
757
  "epoch": 5.075093867334168,
758
+ "grad_norm": 3.734192371368408,
759
+ "learning_rate": 4.064245810055866e-06,
760
+ "loss": 0.7145,
761
  "step": 1010
762
  },
763
  {
764
  "epoch": 5.1251564455569465,
765
+ "grad_norm": 5.420441150665283,
766
+ "learning_rate": 3.994413407821229e-06,
767
+ "loss": 0.6958,
768
  "step": 1020
769
  },
770
  {
771
  "epoch": 5.175219023779725,
772
+ "grad_norm": 4.203883171081543,
773
+ "learning_rate": 3.9245810055865924e-06,
774
+ "loss": 0.714,
775
  "step": 1030
776
  },
777
  {
778
  "epoch": 5.225281602002503,
779
+ "grad_norm": 4.452067852020264,
780
+ "learning_rate": 3.854748603351956e-06,
781
+ "loss": 0.6791,
782
  "step": 1040
783
  },
784
  {
785
  "epoch": 5.275344180225281,
786
+ "grad_norm": 3.5976009368896484,
787
+ "learning_rate": 3.7849162011173185e-06,
788
+ "loss": 0.656,
789
  "step": 1050
790
  },
791
  {
792
  "epoch": 5.32540675844806,
793
+ "grad_norm": 3.9160947799682617,
794
+ "learning_rate": 3.715083798882682e-06,
795
+ "loss": 0.6789,
796
  "step": 1060
797
  },
798
  {
799
  "epoch": 5.375469336670839,
800
+ "grad_norm": 3.4226489067077637,
801
+ "learning_rate": 3.6452513966480454e-06,
802
+ "loss": 0.6889,
803
  "step": 1070
804
  },
805
  {
806
  "epoch": 5.425531914893617,
807
+ "grad_norm": 5.1838788986206055,
808
+ "learning_rate": 3.575418994413408e-06,
809
+ "loss": 0.6529,
810
  "step": 1080
811
  },
812
  {
813
  "epoch": 5.475594493116396,
814
+ "grad_norm": 3.7870497703552246,
815
+ "learning_rate": 3.5055865921787714e-06,
816
+ "loss": 0.6201,
817
  "step": 1090
818
  },
819
  {
820
  "epoch": 5.5256570713391735,
821
+ "grad_norm": 5.270695686340332,
822
+ "learning_rate": 3.435754189944134e-06,
823
+ "loss": 0.6675,
824
  "step": 1100
825
  },
826
  {
827
  "epoch": 5.575719649561952,
828
+ "grad_norm": 3.8207719326019287,
829
+ "learning_rate": 3.3659217877094974e-06,
830
+ "loss": 0.6715,
831
  "step": 1110
832
  },
833
  {
834
  "epoch": 5.625782227784731,
835
+ "grad_norm": 3.1934285163879395,
836
+ "learning_rate": 3.2960893854748607e-06,
837
+ "loss": 0.6449,
838
  "step": 1120
839
  },
840
  {
841
  "epoch": 5.675844806007509,
842
+ "grad_norm": 4.760712623596191,
843
+ "learning_rate": 3.2262569832402235e-06,
844
+ "loss": 0.6331,
845
  "step": 1130
846
  },
847
  {
848
  "epoch": 5.725907384230288,
849
+ "grad_norm": 3.9951508045196533,
850
+ "learning_rate": 3.1564245810055867e-06,
851
+ "loss": 0.6289,
852
  "step": 1140
853
  },
854
  {
855
  "epoch": 5.7759699624530665,
856
+ "grad_norm": 5.959083557128906,
857
+ "learning_rate": 3.0865921787709503e-06,
858
+ "loss": 0.6101,
859
  "step": 1150
860
  },
861
  {
862
  "epoch": 5.826032540675845,
863
+ "grad_norm": 5.146860599517822,
864
+ "learning_rate": 3.016759776536313e-06,
865
+ "loss": 0.6235,
866
  "step": 1160
867
  },
868
  {
869
  "epoch": 5.876095118898624,
870
+ "grad_norm": 4.0103230476379395,
871
+ "learning_rate": 2.9469273743016764e-06,
872
+ "loss": 0.6472,
873
  "step": 1170
874
  },
875
  {
876
  "epoch": 5.926157697121401,
877
+ "grad_norm": 4.349458694458008,
878
+ "learning_rate": 2.877094972067039e-06,
879
+ "loss": 0.5924,
880
  "step": 1180
881
  },
882
  {
883
  "epoch": 5.97622027534418,
884
+ "grad_norm": 4.416366100311279,
885
+ "learning_rate": 2.8072625698324024e-06,
886
+ "loss": 0.5972,
887
  "step": 1190
888
  },
889
  {
890
  "epoch": 5.996245306633291,
891
+ "eval_accuracy": 0.9735216240070609,
892
+ "eval_loss": 0.48377424478530884,
893
+ "eval_runtime": 4.9277,
894
+ "eval_samples_per_second": 1379.544,
895
+ "eval_steps_per_second": 43.225,
896
  "step": 1194
897
  },
898
  {
899
  "epoch": 6.030037546933667,
900
+ "grad_norm": 5.7863569259643555,
901
+ "learning_rate": 2.7374301675977656e-06,
902
+ "loss": 0.6391,
903
  "step": 1200
904
  },
905
  {
906
  "epoch": 6.080100125156446,
907
+ "grad_norm": 3.5653481483459473,
908
+ "learning_rate": 2.6675977653631285e-06,
909
+ "loss": 0.577,
910
  "step": 1210
911
  },
912
  {
913
  "epoch": 6.130162703379224,
914
+ "grad_norm": 2.4277288913726807,
915
+ "learning_rate": 2.5977653631284917e-06,
916
+ "loss": 0.5942,
917
  "step": 1220
918
  },
919
  {
920
  "epoch": 6.180225281602002,
921
+ "grad_norm": 4.4123406410217285,
922
+ "learning_rate": 2.5279329608938553e-06,
923
+ "loss": 0.5952,
924
  "step": 1230
925
  },
926
  {
927
  "epoch": 6.230287859824781,
928
+ "grad_norm": 3.4356672763824463,
929
+ "learning_rate": 2.458100558659218e-06,
930
+ "loss": 0.5597,
931
  "step": 1240
932
  },
933
  {
934
  "epoch": 6.280350438047559,
935
+ "grad_norm": 3.3665220737457275,
936
+ "learning_rate": 2.3882681564245814e-06,
937
+ "loss": 0.586,
938
  "step": 1250
939
  },
940
  {
941
  "epoch": 6.330413016270338,
942
+ "grad_norm": 4.899147987365723,
943
+ "learning_rate": 2.318435754189944e-06,
944
+ "loss": 0.5691,
945
  "step": 1260
946
  },
947
  {
948
  "epoch": 6.380475594493117,
949
+ "grad_norm": 3.53438663482666,
950
+ "learning_rate": 2.2486033519553074e-06,
951
+ "loss": 0.5598,
952
  "step": 1270
953
  },
954
  {
955
  "epoch": 6.430538172715895,
956
+ "grad_norm": 2.7354676723480225,
957
+ "learning_rate": 2.1787709497206706e-06,
958
+ "loss": 0.5858,
959
  "step": 1280
960
  },
961
  {
962
  "epoch": 6.480600750938673,
963
+ "grad_norm": 4.669964790344238,
964
+ "learning_rate": 2.108938547486034e-06,
965
+ "loss": 0.5727,
966
  "step": 1290
967
  },
968
  {
969
  "epoch": 6.5306633291614515,
970
+ "grad_norm": 4.332780361175537,
971
+ "learning_rate": 2.039106145251397e-06,
972
+ "loss": 0.5416,
973
  "step": 1300
974
  },
975
  {
976
  "epoch": 6.58072590738423,
977
+ "grad_norm": 5.418635845184326,
978
+ "learning_rate": 1.96927374301676e-06,
979
+ "loss": 0.5533,
980
  "step": 1310
981
  },
982
  {
983
  "epoch": 6.630788485607009,
984
+ "grad_norm": 2.963872194290161,
985
+ "learning_rate": 1.899441340782123e-06,
986
+ "loss": 0.5441,
987
  "step": 1320
988
  },
989
  {
990
  "epoch": 6.680851063829787,
991
+ "grad_norm": 3.982882261276245,
992
+ "learning_rate": 1.8296089385474861e-06,
993
+ "loss": 0.5522,
994
  "step": 1330
995
  },
996
  {
997
  "epoch": 6.730913642052566,
998
+ "grad_norm": 3.4341752529144287,
999
+ "learning_rate": 1.7597765363128494e-06,
1000
+ "loss": 0.5156,
1001
  "step": 1340
1002
  },
1003
  {
1004
  "epoch": 6.7809762202753445,
1005
+ "grad_norm": 3.866319179534912,
1006
+ "learning_rate": 1.6899441340782124e-06,
1007
+ "loss": 0.543,
1008
  "step": 1350
1009
  },
1010
  {
1011
  "epoch": 6.831038798498122,
1012
+ "grad_norm": 4.521007537841797,
1013
+ "learning_rate": 1.6201117318435754e-06,
1014
+ "loss": 0.5466,
1015
  "step": 1360
1016
  },
1017
  {
1018
  "epoch": 6.881101376720901,
1019
+ "grad_norm": 4.014106273651123,
1020
+ "learning_rate": 1.5502793296089386e-06,
1021
+ "loss": 0.5131,
1022
  "step": 1370
1023
  },
1024
  {
1025
  "epoch": 6.931163954943679,
1026
+ "grad_norm": 4.237702369689941,
1027
+ "learning_rate": 1.4804469273743019e-06,
1028
+ "loss": 0.5626,
1029
  "step": 1380
1030
  },
1031
  {
1032
  "epoch": 6.981226533166458,
1033
+ "grad_norm": 3.3088533878326416,
1034
+ "learning_rate": 1.4106145251396649e-06,
1035
+ "loss": 0.5143,
1036
  "step": 1390
1037
  },
1038
  {
1039
  "epoch": 6.996245306633291,
1040
+ "eval_accuracy": 0.9761694616063548,
1041
+ "eval_loss": 0.4227001368999481,
1042
+ "eval_runtime": 4.8947,
1043
+ "eval_samples_per_second": 1388.847,
1044
+ "eval_steps_per_second": 43.516,
1045
  "step": 1393
1046
  },
1047
  {
1048
  "epoch": 7.035043804755945,
1049
+ "grad_norm": 3.7986509799957275,
1050
+ "learning_rate": 1.3407821229050281e-06,
1051
+ "loss": 0.574,
1052
  "step": 1400
1053
  },
1054
  {
1055
  "epoch": 7.085106382978723,
1056
+ "grad_norm": 4.211392879486084,
1057
+ "learning_rate": 1.2709497206703911e-06,
1058
+ "loss": 0.5194,
1059
  "step": 1410
1060
  },
1061
  {
1062
  "epoch": 7.135168961201502,
1063
+ "grad_norm": 3.6076323986053467,
1064
+ "learning_rate": 1.2011173184357544e-06,
1065
+ "loss": 0.5085,
1066
  "step": 1420
1067
  },
1068
  {
1069
  "epoch": 7.18523153942428,
1070
+ "grad_norm": 5.054622650146484,
1071
+ "learning_rate": 1.1312849162011174e-06,
1072
+ "loss": 0.5059,
1073
  "step": 1430
1074
  },
1075
  {
1076
  "epoch": 7.235294117647059,
1077
+ "grad_norm": 5.9564313888549805,
1078
+ "learning_rate": 1.0614525139664806e-06,
1079
+ "loss": 0.509,
1080
  "step": 1440
1081
  },
1082
  {
1083
  "epoch": 7.2853566958698375,
1084
+ "grad_norm": 5.2771220207214355,
1085
+ "learning_rate": 9.916201117318436e-07,
1086
+ "loss": 0.5613,
1087
  "step": 1450
1088
  },
1089
  {
1090
  "epoch": 7.335419274092616,
1091
+ "grad_norm": 3.4556643962860107,
1092
+ "learning_rate": 9.217877094972068e-07,
1093
+ "loss": 0.5164,
1094
  "step": 1460
1095
  },
1096
  {
1097
  "epoch": 7.385481852315394,
1098
+ "grad_norm": 4.0196003913879395,
1099
+ "learning_rate": 8.519553072625699e-07,
1100
+ "loss": 0.5255,
1101
  "step": 1470
1102
  },
1103
  {
1104
  "epoch": 7.435544430538172,
1105
+ "grad_norm": 2.3283958435058594,
1106
+ "learning_rate": 7.82122905027933e-07,
1107
+ "loss": 0.5133,
1108
  "step": 1480
1109
  },
1110
  {
1111
  "epoch": 7.485607008760951,
1112
+ "grad_norm": 3.3878517150878906,
1113
+ "learning_rate": 7.122905027932961e-07,
1114
+ "loss": 0.5028,
1115
  "step": 1490
1116
  },
1117
  {
1118
  "epoch": 7.53566958698373,
1119
+ "grad_norm": 4.05161190032959,
1120
+ "learning_rate": 6.424581005586592e-07,
1121
+ "loss": 0.5191,
1122
  "step": 1500
1123
  },
1124
  {
1125
  "epoch": 7.585732165206508,
1126
+ "grad_norm": 3.4434776306152344,
1127
+ "learning_rate": 5.726256983240224e-07,
1128
+ "loss": 0.5347,
1129
  "step": 1510
1130
  },
1131
  {
1132
  "epoch": 7.635794743429287,
1133
+ "grad_norm": 3.151704788208008,
1134
+ "learning_rate": 5.027932960893855e-07,
1135
+ "loss": 0.5233,
1136
  "step": 1520
1137
  },
1138
  {
1139
  "epoch": 7.685857321652065,
1140
+ "grad_norm": 4.1916046142578125,
1141
+ "learning_rate": 4.3296089385474867e-07,
1142
+ "loss": 0.5469,
1143
  "step": 1530
1144
  },
1145
  {
1146
  "epoch": 7.735919899874844,
1147
+ "grad_norm": 5.041410446166992,
1148
+ "learning_rate": 3.631284916201118e-07,
1149
+ "loss": 0.5272,
1150
  "step": 1540
1151
  },
1152
  {
1153
  "epoch": 7.785982478097622,
1154
+ "grad_norm": 4.148556709289551,
1155
+ "learning_rate": 2.932960893854749e-07,
1156
+ "loss": 0.5222,
1157
  "step": 1550
1158
  },
1159
  {
1160
  "epoch": 7.8360450563204,
1161
+ "grad_norm": 2.708613872528076,
1162
+ "learning_rate": 2.23463687150838e-07,
1163
+ "loss": 0.5405,
1164
  "step": 1560
1165
  },
1166
  {
1167
  "epoch": 7.886107634543179,
1168
+ "grad_norm": 3.7867846488952637,
1169
+ "learning_rate": 1.5363128491620113e-07,
1170
+ "loss": 0.5249,
1171
  "step": 1570
1172
  },
1173
  {
1174
  "epoch": 7.9361702127659575,
1175
+ "grad_norm": 3.1069061756134033,
1176
+ "learning_rate": 8.379888268156426e-08,
1177
+ "loss": 0.52,
1178
  "step": 1580
1179
  },
1180
  {
1181
  "epoch": 7.986232790988736,
1182
+ "grad_norm": 4.343489646911621,
1183
+ "learning_rate": 1.3966480446927376e-08,
1184
+ "loss": 0.5159,
1185
  "step": 1590
1186
  },
1187
  {
1188
  "epoch": 7.996245306633291,
1189
+ "eval_accuracy": 0.9763165636952045,
1190
+ "eval_loss": 0.40624934434890747,
1191
+ "eval_runtime": 5.5236,
1192
+ "eval_samples_per_second": 1230.722,
1193
+ "eval_steps_per_second": 38.562,
1194
  "step": 1592
1195
  },
1196
  {
1197
  "epoch": 7.996245306633291,
1198
  "step": 1592,
1199
  "total_flos": 3.777723239743488e+18,
1200
+ "train_loss": 1.152813568037359,
1201
+ "train_runtime": 637.2674,
1202
+ "train_samples_per_second": 641.414,
1203
+ "train_steps_per_second": 2.498
1204
  }
1205
  ],
1206
  "logging_steps": 10,