cervisiarius commited on
Commit
bc53047
·
verified ·
1 Parent(s): 9379aa1

Model save

Browse files
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 1.2589733614033306e+17,
4
- "train_loss": 0.6479685707790096,
5
- "train_runtime": 1278.5189,
6
  "train_samples": 16710,
7
- "train_samples_per_second": 10.26,
8
- "train_steps_per_second": 0.641
9
  }
 
1
  {
2
+ "epoch": 0.9995213020584012,
3
+ "total_flos": 2.304533815525294e+17,
4
+ "train_loss": 0.5449674188862359,
5
+ "train_runtime": 2251.699,
6
  "train_samples": 16710,
7
+ "train_samples_per_second": 7.421,
8
+ "train_steps_per_second": 0.464
9
  }
runs/Feb06_02-57-47_GCRAZGDL1601/events.out.tfevents.1738810680.GCRAZGDL1601.2598766.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec719610f06d04ebb7c149463b19399791101828afd866c619ba4b545551aabb
3
- size 49925
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb4ce255f5a7de7395b40f55bc12936c5072df5a73d13a025482a027259adf8f
3
+ size 50279
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 1.2589733614033306e+17,
4
- "train_loss": 0.6479685707790096,
5
- "train_runtime": 1278.5189,
6
  "train_samples": 16710,
7
- "train_samples_per_second": 10.26,
8
- "train_steps_per_second": 0.641
9
  }
 
1
  {
2
+ "epoch": 0.9995213020584012,
3
+ "total_flos": 2.304533815525294e+17,
4
+ "train_loss": 0.5449674188862359,
5
+ "train_runtime": 2251.699,
6
  "train_samples": 16710,
7
+ "train_samples_per_second": 7.421,
8
+ "train_steps_per_second": 0.464
9
  }
trainer_state.json CHANGED
@@ -1,1173 +1,1481 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 820,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.006097560975609756,
13
- "grad_norm": 0.19113780558109283,
14
  "learning_rate": 0.0002,
15
- "loss": 1.3542,
16
  "step": 5
17
  },
18
  {
19
- "epoch": 0.012195121951219513,
20
- "grad_norm": 0.157912939786911,
21
  "learning_rate": 0.0002,
22
- "loss": 1.147,
23
  "step": 10
24
  },
25
  {
26
- "epoch": 0.018292682926829267,
27
- "grad_norm": 0.1430707424879074,
28
  "learning_rate": 0.0002,
29
- "loss": 1.0458,
30
  "step": 15
31
  },
32
  {
33
- "epoch": 0.024390243902439025,
34
- "grad_norm": 0.1274123191833496,
35
  "learning_rate": 0.0002,
36
- "loss": 0.8969,
37
  "step": 20
38
  },
39
  {
40
- "epoch": 0.03048780487804878,
41
- "grad_norm": 0.10518011450767517,
42
  "learning_rate": 0.0002,
43
- "loss": 0.8286,
44
  "step": 25
45
  },
46
  {
47
- "epoch": 0.036585365853658534,
48
- "grad_norm": 0.0880563035607338,
49
  "learning_rate": 0.0002,
50
- "loss": 0.7717,
51
  "step": 30
52
  },
53
  {
54
- "epoch": 0.042682926829268296,
55
- "grad_norm": 0.09094688296318054,
56
  "learning_rate": 0.0002,
57
- "loss": 0.7419,
58
  "step": 35
59
  },
60
  {
61
- "epoch": 0.04878048780487805,
62
- "grad_norm": 0.08669694513082504,
63
  "learning_rate": 0.0002,
64
- "loss": 0.7358,
65
  "step": 40
66
  },
67
  {
68
- "epoch": 0.054878048780487805,
69
- "grad_norm": 0.07508991658687592,
70
  "learning_rate": 0.0002,
71
- "loss": 0.7465,
72
  "step": 45
73
  },
74
  {
75
- "epoch": 0.06097560975609756,
76
- "grad_norm": 0.0900682881474495,
77
  "learning_rate": 0.0002,
78
- "loss": 0.7026,
79
  "step": 50
80
  },
81
  {
82
- "epoch": 0.06707317073170732,
83
- "grad_norm": 0.07082549482584,
84
  "learning_rate": 0.0002,
85
- "loss": 0.6993,
86
  "step": 55
87
  },
88
  {
89
- "epoch": 0.07317073170731707,
90
- "grad_norm": 0.07301679253578186,
91
  "learning_rate": 0.0002,
92
- "loss": 0.7251,
93
  "step": 60
94
  },
95
  {
96
- "epoch": 0.07926829268292683,
97
- "grad_norm": 0.08057350665330887,
98
  "learning_rate": 0.0002,
99
- "loss": 0.6984,
100
  "step": 65
101
  },
102
  {
103
- "epoch": 0.08536585365853659,
104
- "grad_norm": 0.07914779335260391,
105
  "learning_rate": 0.0002,
106
- "loss": 0.6966,
107
  "step": 70
108
  },
109
  {
110
- "epoch": 0.09146341463414634,
111
- "grad_norm": 0.077100470662117,
112
  "learning_rate": 0.0002,
113
- "loss": 0.7044,
114
  "step": 75
115
  },
116
  {
117
- "epoch": 0.0975609756097561,
118
- "grad_norm": 0.07273847609758377,
119
  "learning_rate": 0.0002,
120
- "loss": 0.6682,
121
  "step": 80
122
  },
123
  {
124
- "epoch": 0.10365853658536585,
125
- "grad_norm": 0.07714827358722687,
126
  "learning_rate": 0.0002,
127
- "loss": 0.7168,
128
  "step": 85
129
  },
130
  {
131
- "epoch": 0.10975609756097561,
132
- "grad_norm": 0.07556980848312378,
133
  "learning_rate": 0.0002,
134
- "loss": 0.6809,
135
  "step": 90
136
  },
137
  {
138
- "epoch": 0.11585365853658537,
139
- "grad_norm": 0.07348775863647461,
140
  "learning_rate": 0.0002,
141
- "loss": 0.6643,
142
  "step": 95
143
  },
144
  {
145
- "epoch": 0.12195121951219512,
146
- "grad_norm": 0.07931828498840332,
147
  "learning_rate": 0.0002,
148
- "loss": 0.6769,
149
  "step": 100
150
  },
151
  {
152
- "epoch": 0.12804878048780488,
153
- "grad_norm": 0.08518531173467636,
154
  "learning_rate": 0.0002,
155
- "loss": 0.6364,
156
  "step": 105
157
  },
158
  {
159
- "epoch": 0.13414634146341464,
160
- "grad_norm": 0.08019699901342392,
161
  "learning_rate": 0.0002,
162
- "loss": 0.6837,
163
  "step": 110
164
  },
165
  {
166
- "epoch": 0.1402439024390244,
167
- "grad_norm": 0.07904317229986191,
168
  "learning_rate": 0.0002,
169
- "loss": 0.6418,
170
  "step": 115
171
  },
172
  {
173
- "epoch": 0.14634146341463414,
174
- "grad_norm": 0.08703485876321793,
175
  "learning_rate": 0.0002,
176
- "loss": 0.7023,
177
  "step": 120
178
  },
179
  {
180
- "epoch": 0.1524390243902439,
181
- "grad_norm": 0.08188563585281372,
182
  "learning_rate": 0.0002,
183
- "loss": 0.6844,
184
  "step": 125
185
  },
186
  {
187
- "epoch": 0.15853658536585366,
188
- "grad_norm": 0.08216766268014908,
189
  "learning_rate": 0.0002,
190
- "loss": 0.6584,
191
  "step": 130
192
  },
193
  {
194
- "epoch": 0.16463414634146342,
195
- "grad_norm": 0.09054507315158844,
196
  "learning_rate": 0.0002,
197
- "loss": 0.6607,
198
  "step": 135
199
  },
200
  {
201
- "epoch": 0.17073170731707318,
202
- "grad_norm": 0.07661531865596771,
203
  "learning_rate": 0.0002,
204
- "loss": 0.6797,
205
  "step": 140
206
  },
207
  {
208
- "epoch": 0.17682926829268292,
209
- "grad_norm": 0.08329407870769501,
210
  "learning_rate": 0.0002,
211
- "loss": 0.624,
212
  "step": 145
213
  },
214
  {
215
- "epoch": 0.18292682926829268,
216
- "grad_norm": 0.08145114779472351,
217
  "learning_rate": 0.0002,
218
- "loss": 0.6673,
219
  "step": 150
220
  },
221
  {
222
- "epoch": 0.18902439024390244,
223
- "grad_norm": 0.07938922196626663,
224
  "learning_rate": 0.0002,
225
- "loss": 0.6262,
226
  "step": 155
227
  },
228
  {
229
- "epoch": 0.1951219512195122,
230
- "grad_norm": 0.0885365903377533,
231
  "learning_rate": 0.0002,
232
- "loss": 0.6665,
233
  "step": 160
234
  },
235
  {
236
- "epoch": 0.20121951219512196,
237
- "grad_norm": 0.08919111639261246,
238
  "learning_rate": 0.0002,
239
- "loss": 0.6829,
240
  "step": 165
241
  },
242
  {
243
- "epoch": 0.2073170731707317,
244
- "grad_norm": 0.09292899817228317,
245
  "learning_rate": 0.0002,
246
- "loss": 0.6834,
247
  "step": 170
248
  },
249
  {
250
- "epoch": 0.21341463414634146,
251
- "grad_norm": 0.08889784663915634,
252
  "learning_rate": 0.0002,
253
- "loss": 0.6771,
254
  "step": 175
255
  },
256
  {
257
- "epoch": 0.21951219512195122,
258
- "grad_norm": 0.08751650154590607,
259
  "learning_rate": 0.0002,
260
- "loss": 0.6217,
261
  "step": 180
262
  },
263
  {
264
- "epoch": 0.22560975609756098,
265
- "grad_norm": 0.08591675013303757,
266
  "learning_rate": 0.0002,
267
- "loss": 0.6741,
268
  "step": 185
269
  },
270
  {
271
- "epoch": 0.23170731707317074,
272
- "grad_norm": 0.08414967358112335,
273
  "learning_rate": 0.0002,
274
- "loss": 0.6755,
275
  "step": 190
276
  },
277
  {
278
- "epoch": 0.23780487804878048,
279
- "grad_norm": 0.088834747672081,
280
  "learning_rate": 0.0002,
281
- "loss": 0.6364,
282
  "step": 195
283
  },
284
  {
285
- "epoch": 0.24390243902439024,
286
- "grad_norm": 0.08591052889823914,
287
  "learning_rate": 0.0002,
288
- "loss": 0.6212,
289
  "step": 200
290
  },
291
  {
292
- "epoch": 0.25,
293
- "grad_norm": 0.08160480856895447,
294
  "learning_rate": 0.0002,
295
- "loss": 0.6014,
296
  "step": 205
297
  },
298
  {
299
- "epoch": 0.25609756097560976,
300
- "grad_norm": 0.09206949174404144,
301
  "learning_rate": 0.0002,
302
- "loss": 0.6994,
303
  "step": 210
304
  },
305
  {
306
- "epoch": 0.2621951219512195,
307
- "grad_norm": 0.09698819369077682,
308
  "learning_rate": 0.0002,
309
- "loss": 0.6297,
310
  "step": 215
311
  },
312
  {
313
- "epoch": 0.2682926829268293,
314
- "grad_norm": 0.08840637654066086,
315
  "learning_rate": 0.0002,
316
- "loss": 0.6024,
317
  "step": 220
318
  },
319
  {
320
- "epoch": 0.27439024390243905,
321
- "grad_norm": 0.07851487398147583,
322
  "learning_rate": 0.0002,
323
- "loss": 0.6461,
324
  "step": 225
325
  },
326
  {
327
- "epoch": 0.2804878048780488,
328
- "grad_norm": 0.09180750697851181,
329
  "learning_rate": 0.0002,
330
- "loss": 0.6133,
331
  "step": 230
332
  },
333
  {
334
- "epoch": 0.2865853658536585,
335
- "grad_norm": 0.09095776826143265,
336
  "learning_rate": 0.0002,
337
- "loss": 0.6824,
338
  "step": 235
339
  },
340
  {
341
- "epoch": 0.2926829268292683,
342
- "grad_norm": 0.09543343633413315,
343
  "learning_rate": 0.0002,
344
- "loss": 0.6737,
345
  "step": 240
346
  },
347
  {
348
- "epoch": 0.29878048780487804,
349
- "grad_norm": 0.0848107561469078,
350
  "learning_rate": 0.0002,
351
- "loss": 0.642,
352
  "step": 245
353
  },
354
  {
355
- "epoch": 0.3048780487804878,
356
- "grad_norm": 0.09167467057704926,
357
  "learning_rate": 0.0002,
358
- "loss": 0.6658,
359
  "step": 250
360
  },
361
  {
362
- "epoch": 0.31097560975609756,
363
- "grad_norm": 0.08741264045238495,
364
  "learning_rate": 0.0002,
365
- "loss": 0.6361,
366
  "step": 255
367
  },
368
  {
369
- "epoch": 0.3170731707317073,
370
- "grad_norm": 0.09368050843477249,
371
  "learning_rate": 0.0002,
372
- "loss": 0.6352,
373
  "step": 260
374
  },
375
  {
376
- "epoch": 0.3231707317073171,
377
- "grad_norm": 0.09895329922437668,
378
  "learning_rate": 0.0002,
379
- "loss": 0.6882,
380
  "step": 265
381
  },
382
  {
383
- "epoch": 0.32926829268292684,
384
- "grad_norm": 0.07941333204507828,
385
  "learning_rate": 0.0002,
386
- "loss": 0.602,
387
  "step": 270
388
  },
389
  {
390
- "epoch": 0.3353658536585366,
391
- "grad_norm": 0.09358822554349899,
392
  "learning_rate": 0.0002,
393
- "loss": 0.5926,
394
  "step": 275
395
  },
396
  {
397
- "epoch": 0.34146341463414637,
398
- "grad_norm": 0.09718990325927734,
399
  "learning_rate": 0.0002,
400
- "loss": 0.6292,
401
  "step": 280
402
  },
403
  {
404
- "epoch": 0.3475609756097561,
405
- "grad_norm": 0.09431279450654984,
406
  "learning_rate": 0.0002,
407
- "loss": 0.683,
408
  "step": 285
409
  },
410
  {
411
- "epoch": 0.35365853658536583,
412
- "grad_norm": 0.09181628376245499,
413
  "learning_rate": 0.0002,
414
- "loss": 0.6491,
415
  "step": 290
416
  },
417
  {
418
- "epoch": 0.3597560975609756,
419
- "grad_norm": 0.08341574668884277,
420
  "learning_rate": 0.0002,
421
- "loss": 0.6286,
422
  "step": 295
423
  },
424
  {
425
- "epoch": 0.36585365853658536,
426
- "grad_norm": 0.08599329739809036,
427
  "learning_rate": 0.0002,
428
- "loss": 0.6533,
429
  "step": 300
430
  },
431
  {
432
- "epoch": 0.3719512195121951,
433
- "grad_norm": 0.08686517924070358,
434
  "learning_rate": 0.0002,
435
- "loss": 0.6322,
436
  "step": 305
437
  },
438
  {
439
- "epoch": 0.3780487804878049,
440
- "grad_norm": 0.08503744751214981,
441
  "learning_rate": 0.0002,
442
- "loss": 0.6669,
443
  "step": 310
444
  },
445
  {
446
- "epoch": 0.38414634146341464,
447
- "grad_norm": 0.08044194430112839,
448
  "learning_rate": 0.0002,
449
- "loss": 0.6183,
450
  "step": 315
451
  },
452
  {
453
- "epoch": 0.3902439024390244,
454
- "grad_norm": 0.09079768508672714,
455
  "learning_rate": 0.0002,
456
- "loss": 0.6423,
457
  "step": 320
458
  },
459
  {
460
- "epoch": 0.39634146341463417,
461
- "grad_norm": 0.10418431460857391,
462
  "learning_rate": 0.0002,
463
- "loss": 0.6807,
464
  "step": 325
465
  },
466
  {
467
- "epoch": 0.4024390243902439,
468
- "grad_norm": 0.08268652856349945,
469
  "learning_rate": 0.0002,
470
- "loss": 0.6402,
471
  "step": 330
472
  },
473
  {
474
- "epoch": 0.40853658536585363,
475
- "grad_norm": 0.08460263162851334,
476
  "learning_rate": 0.0002,
477
- "loss": 0.6321,
478
  "step": 335
479
  },
480
  {
481
- "epoch": 0.4146341463414634,
482
- "grad_norm": 0.09432059526443481,
483
  "learning_rate": 0.0002,
484
- "loss": 0.6492,
485
  "step": 340
486
  },
487
  {
488
- "epoch": 0.42073170731707316,
489
- "grad_norm": 0.09088627994060516,
490
  "learning_rate": 0.0002,
491
- "loss": 0.594,
492
  "step": 345
493
  },
494
  {
495
- "epoch": 0.4268292682926829,
496
- "grad_norm": 0.08435555547475815,
497
  "learning_rate": 0.0002,
498
- "loss": 0.5455,
499
  "step": 350
500
  },
501
  {
502
- "epoch": 0.4329268292682927,
503
- "grad_norm": 0.09127935767173767,
504
  "learning_rate": 0.0002,
505
- "loss": 0.6041,
506
  "step": 355
507
  },
508
  {
509
- "epoch": 0.43902439024390244,
510
- "grad_norm": 0.08697493374347687,
511
  "learning_rate": 0.0002,
512
- "loss": 0.6035,
513
  "step": 360
514
  },
515
  {
516
- "epoch": 0.4451219512195122,
517
- "grad_norm": 0.08682391047477722,
518
  "learning_rate": 0.0002,
519
- "loss": 0.6617,
520
  "step": 365
521
  },
522
  {
523
- "epoch": 0.45121951219512196,
524
- "grad_norm": 0.09005075693130493,
525
  "learning_rate": 0.0002,
526
- "loss": 0.6322,
527
  "step": 370
528
  },
529
  {
530
- "epoch": 0.4573170731707317,
531
- "grad_norm": 0.09503002464771271,
532
  "learning_rate": 0.0002,
533
- "loss": 0.6431,
534
  "step": 375
535
  },
536
  {
537
- "epoch": 0.4634146341463415,
538
- "grad_norm": 0.0834166407585144,
539
  "learning_rate": 0.0002,
540
- "loss": 0.5602,
541
  "step": 380
542
  },
543
  {
544
- "epoch": 0.4695121951219512,
545
- "grad_norm": 0.08948921412229538,
546
  "learning_rate": 0.0002,
547
- "loss": 0.6585,
548
  "step": 385
549
  },
550
  {
551
- "epoch": 0.47560975609756095,
552
- "grad_norm": 0.08504035323858261,
553
  "learning_rate": 0.0002,
554
- "loss": 0.6788,
555
  "step": 390
556
  },
557
  {
558
- "epoch": 0.4817073170731707,
559
- "grad_norm": 0.08549030125141144,
560
  "learning_rate": 0.0002,
561
- "loss": 0.6046,
562
  "step": 395
563
  },
564
  {
565
- "epoch": 0.4878048780487805,
566
- "grad_norm": 0.08406868577003479,
567
  "learning_rate": 0.0002,
568
- "loss": 0.5824,
569
  "step": 400
570
  },
571
  {
572
- "epoch": 0.49390243902439024,
573
- "grad_norm": 0.08946231752634048,
574
  "learning_rate": 0.0002,
575
- "loss": 0.654,
576
  "step": 405
577
  },
578
  {
579
- "epoch": 0.5,
580
- "grad_norm": 0.08700671792030334,
581
  "learning_rate": 0.0002,
582
- "loss": 0.584,
583
  "step": 410
584
  },
585
  {
586
- "epoch": 0.5060975609756098,
587
- "grad_norm": 0.09547651559114456,
588
  "learning_rate": 0.0002,
589
- "loss": 0.6726,
590
  "step": 415
591
  },
592
  {
593
- "epoch": 0.5121951219512195,
594
- "grad_norm": 0.09309537708759308,
595
  "learning_rate": 0.0002,
596
- "loss": 0.6264,
597
  "step": 420
598
  },
599
  {
600
- "epoch": 0.5182926829268293,
601
- "grad_norm": 0.08844203501939774,
602
  "learning_rate": 0.0002,
603
- "loss": 0.6075,
604
  "step": 425
605
  },
606
  {
607
- "epoch": 0.524390243902439,
608
- "grad_norm": 0.09487327188253403,
609
  "learning_rate": 0.0002,
610
- "loss": 0.6576,
611
  "step": 430
612
  },
613
  {
614
- "epoch": 0.5304878048780488,
615
- "grad_norm": 0.08794911205768585,
616
  "learning_rate": 0.0002,
617
- "loss": 0.6324,
618
  "step": 435
619
  },
620
  {
621
- "epoch": 0.5365853658536586,
622
- "grad_norm": 0.08871318399906158,
623
  "learning_rate": 0.0002,
624
- "loss": 0.605,
625
  "step": 440
626
  },
627
  {
628
- "epoch": 0.5426829268292683,
629
- "grad_norm": 0.09224696457386017,
630
  "learning_rate": 0.0002,
631
- "loss": 0.5909,
632
  "step": 445
633
  },
634
  {
635
- "epoch": 0.5487804878048781,
636
- "grad_norm": 0.08521644026041031,
637
  "learning_rate": 0.0002,
638
- "loss": 0.6364,
639
  "step": 450
640
  },
641
  {
642
- "epoch": 0.5548780487804879,
643
- "grad_norm": 0.08880337327718735,
644
  "learning_rate": 0.0002,
645
- "loss": 0.5485,
646
  "step": 455
647
  },
648
  {
649
- "epoch": 0.5609756097560976,
650
- "grad_norm": 0.10320650786161423,
651
  "learning_rate": 0.0002,
652
- "loss": 0.6577,
653
  "step": 460
654
  },
655
  {
656
- "epoch": 0.5670731707317073,
657
- "grad_norm": 0.08618949353694916,
658
  "learning_rate": 0.0002,
659
- "loss": 0.626,
660
  "step": 465
661
  },
662
  {
663
- "epoch": 0.573170731707317,
664
- "grad_norm": 0.09187562018632889,
665
  "learning_rate": 0.0002,
666
- "loss": 0.6237,
667
  "step": 470
668
  },
669
  {
670
- "epoch": 0.5792682926829268,
671
- "grad_norm": 0.08180835843086243,
672
  "learning_rate": 0.0002,
673
- "loss": 0.659,
674
  "step": 475
675
  },
676
  {
677
- "epoch": 0.5853658536585366,
678
- "grad_norm": 0.09724608063697815,
679
  "learning_rate": 0.0002,
680
- "loss": 0.6702,
681
  "step": 480
682
  },
683
  {
684
- "epoch": 0.5914634146341463,
685
- "grad_norm": 0.08215058594942093,
686
  "learning_rate": 0.0002,
687
- "loss": 0.6184,
688
  "step": 485
689
  },
690
  {
691
- "epoch": 0.5975609756097561,
692
- "grad_norm": 0.08703969419002533,
693
  "learning_rate": 0.0002,
694
- "loss": 0.5848,
695
  "step": 490
696
  },
697
  {
698
- "epoch": 0.6036585365853658,
699
- "grad_norm": 0.09472667425870895,
700
  "learning_rate": 0.0002,
701
- "loss": 0.6405,
702
  "step": 495
703
  },
704
  {
705
- "epoch": 0.6097560975609756,
706
- "grad_norm": 0.08890362828969955,
707
  "learning_rate": 0.0002,
708
- "loss": 0.6271,
709
  "step": 500
710
  },
711
  {
712
- "epoch": 0.6158536585365854,
713
- "grad_norm": 0.08213219791650772,
714
  "learning_rate": 0.0002,
715
- "loss": 0.6017,
716
  "step": 505
717
  },
718
  {
719
- "epoch": 0.6219512195121951,
720
- "grad_norm": 0.09955822676420212,
721
  "learning_rate": 0.0002,
722
- "loss": 0.6676,
723
  "step": 510
724
  },
725
  {
726
- "epoch": 0.6280487804878049,
727
- "grad_norm": 0.08851809799671173,
728
  "learning_rate": 0.0002,
729
- "loss": 0.5929,
730
  "step": 515
731
  },
732
  {
733
- "epoch": 0.6341463414634146,
734
- "grad_norm": 0.08627015352249146,
735
  "learning_rate": 0.0002,
736
- "loss": 0.6031,
737
  "step": 520
738
  },
739
  {
740
- "epoch": 0.6402439024390244,
741
- "grad_norm": 0.09393417090177536,
742
  "learning_rate": 0.0002,
743
- "loss": 0.6116,
744
  "step": 525
745
  },
746
  {
747
- "epoch": 0.6463414634146342,
748
- "grad_norm": 0.08550355583429337,
749
  "learning_rate": 0.0002,
750
- "loss": 0.6356,
751
  "step": 530
752
  },
753
  {
754
- "epoch": 0.6524390243902439,
755
- "grad_norm": 0.07930731773376465,
756
  "learning_rate": 0.0002,
757
- "loss": 0.6059,
758
  "step": 535
759
  },
760
  {
761
- "epoch": 0.6585365853658537,
762
- "grad_norm": 0.09117292612791061,
763
  "learning_rate": 0.0002,
764
- "loss": 0.5806,
765
  "step": 540
766
  },
767
  {
768
- "epoch": 0.6646341463414634,
769
- "grad_norm": 0.0977688878774643,
770
  "learning_rate": 0.0002,
771
- "loss": 0.6672,
772
  "step": 545
773
  },
774
  {
775
- "epoch": 0.6707317073170732,
776
- "grad_norm": 0.08388745784759521,
777
  "learning_rate": 0.0002,
778
- "loss": 0.6185,
779
  "step": 550
780
  },
781
  {
782
- "epoch": 0.676829268292683,
783
- "grad_norm": 0.08963685482740402,
784
  "learning_rate": 0.0002,
785
- "loss": 0.6802,
786
  "step": 555
787
  },
788
  {
789
- "epoch": 0.6829268292682927,
790
- "grad_norm": 0.09435810893774033,
791
  "learning_rate": 0.0002,
792
- "loss": 0.6239,
793
  "step": 560
794
  },
795
  {
796
- "epoch": 0.6890243902439024,
797
- "grad_norm": 0.08647770434617996,
798
  "learning_rate": 0.0002,
799
- "loss": 0.6488,
800
  "step": 565
801
  },
802
  {
803
- "epoch": 0.6951219512195121,
804
- "grad_norm": 0.08766207844018936,
805
  "learning_rate": 0.0002,
806
- "loss": 0.6015,
807
  "step": 570
808
  },
809
  {
810
- "epoch": 0.7012195121951219,
811
- "grad_norm": 0.0898912101984024,
812
  "learning_rate": 0.0002,
813
- "loss": 0.599,
814
  "step": 575
815
  },
816
  {
817
- "epoch": 0.7073170731707317,
818
- "grad_norm": 0.08568281680345535,
819
  "learning_rate": 0.0002,
820
- "loss": 0.6032,
821
  "step": 580
822
  },
823
  {
824
- "epoch": 0.7134146341463414,
825
- "grad_norm": 0.08536507189273834,
826
  "learning_rate": 0.0002,
827
- "loss": 0.6271,
828
  "step": 585
829
  },
830
  {
831
- "epoch": 0.7195121951219512,
832
- "grad_norm": 0.08574043214321136,
833
  "learning_rate": 0.0002,
834
- "loss": 0.654,
835
  "step": 590
836
  },
837
  {
838
- "epoch": 0.725609756097561,
839
- "grad_norm": 0.08919310569763184,
840
  "learning_rate": 0.0002,
841
- "loss": 0.6216,
842
  "step": 595
843
  },
844
  {
845
- "epoch": 0.7317073170731707,
846
- "grad_norm": 0.08294546604156494,
847
  "learning_rate": 0.0002,
848
- "loss": 0.5901,
849
  "step": 600
850
  },
851
  {
852
- "epoch": 0.7378048780487805,
853
- "grad_norm": 0.08501895517110825,
854
  "learning_rate": 0.0002,
855
- "loss": 0.6032,
856
  "step": 605
857
  },
858
  {
859
- "epoch": 0.7439024390243902,
860
- "grad_norm": 0.0810956135392189,
861
  "learning_rate": 0.0002,
862
- "loss": 0.6315,
863
  "step": 610
864
  },
865
  {
866
- "epoch": 0.75,
867
- "grad_norm": 0.09748444706201553,
868
  "learning_rate": 0.0002,
869
- "loss": 0.6022,
870
  "step": 615
871
  },
872
  {
873
- "epoch": 0.7560975609756098,
874
- "grad_norm": 0.08739516139030457,
875
  "learning_rate": 0.0002,
876
- "loss": 0.6217,
877
  "step": 620
878
  },
879
  {
880
- "epoch": 0.7621951219512195,
881
- "grad_norm": 0.08666383475065231,
882
  "learning_rate": 0.0002,
883
- "loss": 0.6076,
884
  "step": 625
885
  },
886
  {
887
- "epoch": 0.7682926829268293,
888
- "grad_norm": 0.08574234694242477,
889
  "learning_rate": 0.0002,
890
- "loss": 0.595,
891
  "step": 630
892
  },
893
  {
894
- "epoch": 0.774390243902439,
895
- "grad_norm": 0.08672145009040833,
896
  "learning_rate": 0.0002,
897
- "loss": 0.6133,
898
  "step": 635
899
  },
900
  {
901
- "epoch": 0.7804878048780488,
902
- "grad_norm": 0.09276317059993744,
903
  "learning_rate": 0.0002,
904
- "loss": 0.6109,
905
  "step": 640
906
  },
907
  {
908
- "epoch": 0.7865853658536586,
909
- "grad_norm": 0.08828898519277573,
910
  "learning_rate": 0.0002,
911
- "loss": 0.5842,
912
  "step": 645
913
  },
914
  {
915
- "epoch": 0.7926829268292683,
916
- "grad_norm": 0.08746568858623505,
917
  "learning_rate": 0.0002,
918
- "loss": 0.6303,
919
  "step": 650
920
  },
921
  {
922
- "epoch": 0.7987804878048781,
923
- "grad_norm": 0.08017181605100632,
924
  "learning_rate": 0.0002,
925
- "loss": 0.5927,
926
  "step": 655
927
  },
928
  {
929
- "epoch": 0.8048780487804879,
930
- "grad_norm": 0.09101402759552002,
931
  "learning_rate": 0.0002,
932
- "loss": 0.6027,
933
  "step": 660
934
  },
935
  {
936
- "epoch": 0.8109756097560976,
937
- "grad_norm": 0.08576813340187073,
938
  "learning_rate": 0.0002,
939
- "loss": 0.5837,
940
  "step": 665
941
  },
942
  {
943
- "epoch": 0.8170731707317073,
944
- "grad_norm": 0.07945378124713898,
945
  "learning_rate": 0.0002,
946
- "loss": 0.6018,
947
  "step": 670
948
  },
949
  {
950
- "epoch": 0.823170731707317,
951
- "grad_norm": 0.0857338160276413,
952
  "learning_rate": 0.0002,
953
- "loss": 0.5735,
954
  "step": 675
955
  },
956
  {
957
- "epoch": 0.8292682926829268,
958
- "grad_norm": 0.08402560651302338,
959
  "learning_rate": 0.0002,
960
- "loss": 0.606,
961
  "step": 680
962
  },
963
  {
964
- "epoch": 0.8353658536585366,
965
- "grad_norm": 0.08645153790712357,
966
  "learning_rate": 0.0002,
967
- "loss": 0.6124,
968
  "step": 685
969
  },
970
  {
971
- "epoch": 0.8414634146341463,
972
- "grad_norm": 0.08320162445306778,
973
  "learning_rate": 0.0002,
974
- "loss": 0.5604,
975
  "step": 690
976
  },
977
  {
978
- "epoch": 0.8475609756097561,
979
- "grad_norm": 0.10030201077461243,
980
  "learning_rate": 0.0002,
981
- "loss": 0.6139,
982
  "step": 695
983
  },
984
  {
985
- "epoch": 0.8536585365853658,
986
- "grad_norm": 0.09603551030158997,
987
  "learning_rate": 0.0002,
988
- "loss": 0.6529,
989
  "step": 700
990
  },
991
  {
992
- "epoch": 0.8597560975609756,
993
- "grad_norm": 0.08779994398355484,
994
  "learning_rate": 0.0002,
995
- "loss": 0.6365,
996
  "step": 705
997
  },
998
  {
999
- "epoch": 0.8658536585365854,
1000
- "grad_norm": 0.09174113720655441,
1001
  "learning_rate": 0.0002,
1002
- "loss": 0.6072,
1003
  "step": 710
1004
  },
1005
  {
1006
- "epoch": 0.8719512195121951,
1007
- "grad_norm": 0.09346111863851547,
1008
  "learning_rate": 0.0002,
1009
- "loss": 0.6309,
1010
  "step": 715
1011
  },
1012
  {
1013
- "epoch": 0.8780487804878049,
1014
- "grad_norm": 0.09716326743364334,
1015
  "learning_rate": 0.0002,
1016
- "loss": 0.6629,
1017
  "step": 720
1018
  },
1019
  {
1020
- "epoch": 0.8841463414634146,
1021
- "grad_norm": 0.08768226206302643,
1022
  "learning_rate": 0.0002,
1023
- "loss": 0.6286,
1024
  "step": 725
1025
  },
1026
  {
1027
- "epoch": 0.8902439024390244,
1028
- "grad_norm": 0.09532574564218521,
1029
  "learning_rate": 0.0002,
1030
- "loss": 0.6078,
1031
  "step": 730
1032
  },
1033
  {
1034
- "epoch": 0.8963414634146342,
1035
- "grad_norm": 0.0878928005695343,
1036
  "learning_rate": 0.0002,
1037
- "loss": 0.6007,
1038
  "step": 735
1039
  },
1040
  {
1041
- "epoch": 0.9024390243902439,
1042
- "grad_norm": 0.0885293185710907,
1043
  "learning_rate": 0.0002,
1044
- "loss": 0.5646,
1045
  "step": 740
1046
  },
1047
  {
1048
- "epoch": 0.9085365853658537,
1049
- "grad_norm": 0.08721958845853806,
1050
  "learning_rate": 0.0002,
1051
- "loss": 0.5891,
1052
  "step": 745
1053
  },
1054
  {
1055
- "epoch": 0.9146341463414634,
1056
- "grad_norm": 0.0873153954744339,
1057
  "learning_rate": 0.0002,
1058
- "loss": 0.6293,
1059
  "step": 750
1060
  },
1061
  {
1062
- "epoch": 0.9207317073170732,
1063
- "grad_norm": 0.08964630961418152,
1064
  "learning_rate": 0.0002,
1065
- "loss": 0.6194,
1066
  "step": 755
1067
  },
1068
  {
1069
- "epoch": 0.926829268292683,
1070
- "grad_norm": 0.08254794776439667,
1071
  "learning_rate": 0.0002,
1072
- "loss": 0.6601,
1073
  "step": 760
1074
  },
1075
  {
1076
- "epoch": 0.9329268292682927,
1077
- "grad_norm": 0.08083070814609528,
1078
  "learning_rate": 0.0002,
1079
- "loss": 0.5945,
1080
  "step": 765
1081
  },
1082
  {
1083
- "epoch": 0.9390243902439024,
1084
- "grad_norm": 0.09315191954374313,
1085
  "learning_rate": 0.0002,
1086
- "loss": 0.6248,
1087
  "step": 770
1088
  },
1089
  {
1090
- "epoch": 0.9451219512195121,
1091
- "grad_norm": 0.07978613674640656,
1092
  "learning_rate": 0.0002,
1093
- "loss": 0.5855,
1094
  "step": 775
1095
  },
1096
  {
1097
- "epoch": 0.9512195121951219,
1098
- "grad_norm": 0.09159720689058304,
1099
  "learning_rate": 0.0002,
1100
- "loss": 0.6231,
1101
  "step": 780
1102
  },
1103
  {
1104
- "epoch": 0.9573170731707317,
1105
- "grad_norm": 0.08803162723779678,
1106
  "learning_rate": 0.0002,
1107
- "loss": 0.6112,
1108
  "step": 785
1109
  },
1110
  {
1111
- "epoch": 0.9634146341463414,
1112
- "grad_norm": 0.08242770284414291,
1113
  "learning_rate": 0.0002,
1114
- "loss": 0.5894,
1115
  "step": 790
1116
  },
1117
  {
1118
- "epoch": 0.9695121951219512,
1119
- "grad_norm": 0.08758191019296646,
1120
  "learning_rate": 0.0002,
1121
- "loss": 0.6261,
1122
  "step": 795
1123
  },
1124
  {
1125
- "epoch": 0.975609756097561,
1126
- "grad_norm": 0.08972202986478806,
1127
  "learning_rate": 0.0002,
1128
- "loss": 0.5641,
1129
  "step": 800
1130
  },
1131
  {
1132
- "epoch": 0.9817073170731707,
1133
- "grad_norm": 0.08977407217025757,
1134
  "learning_rate": 0.0002,
1135
- "loss": 0.6236,
1136
  "step": 805
1137
  },
1138
  {
1139
- "epoch": 0.9878048780487805,
1140
- "grad_norm": 0.09412374347448349,
1141
  "learning_rate": 0.0002,
1142
- "loss": 0.632,
1143
  "step": 810
1144
  },
1145
  {
1146
- "epoch": 0.9939024390243902,
1147
- "grad_norm": 0.0928845927119255,
1148
  "learning_rate": 0.0002,
1149
- "loss": 0.6074,
1150
  "step": 815
1151
  },
1152
  {
1153
- "epoch": 1.0,
1154
- "grad_norm": 0.09269104152917862,
1155
  "learning_rate": 0.0002,
1156
- "loss": 0.602,
1157
  "step": 820
1158
  },
1159
  {
1160
- "epoch": 1.0,
1161
- "step": 820,
1162
- "total_flos": 1.2589733614033306e+17,
1163
- "train_loss": 0.6479685707790096,
1164
- "train_runtime": 1278.5189,
1165
- "train_samples_per_second": 10.26,
1166
- "train_steps_per_second": 0.641
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1167
  }
1168
  ],
1169
  "logging_steps": 5,
1170
- "max_steps": 820,
1171
  "num_input_tokens_seen": 0,
1172
  "num_train_epochs": 1,
1173
  "save_steps": 500,
@@ -1183,7 +1491,7 @@
1183
  "attributes": {}
1184
  }
1185
  },
1186
- "total_flos": 1.2589733614033306e+17,
1187
  "train_batch_size": 8,
1188
  "trial_name": null,
1189
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9995213020584012,
5
  "eval_steps": 500,
6
+ "global_step": 1044,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.004786979415988511,
13
+ "grad_norm": 0.24613235890865326,
14
  "learning_rate": 0.0002,
15
+ "loss": 1.2771,
16
  "step": 5
17
  },
18
  {
19
+ "epoch": 0.009573958831977022,
20
+ "grad_norm": 0.16961899399757385,
21
  "learning_rate": 0.0002,
22
+ "loss": 1.0713,
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.014360938247965534,
27
+ "grad_norm": 0.15322890877723694,
28
  "learning_rate": 0.0002,
29
+ "loss": 0.9605,
30
  "step": 15
31
  },
32
  {
33
+ "epoch": 0.019147917663954045,
34
+ "grad_norm": 0.1613943725824356,
35
  "learning_rate": 0.0002,
36
+ "loss": 0.7417,
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.023934897079942556,
41
+ "grad_norm": 0.15506355464458466,
42
  "learning_rate": 0.0002,
43
+ "loss": 0.6595,
44
  "step": 25
45
  },
46
  {
47
+ "epoch": 0.028721876495931067,
48
+ "grad_norm": 0.1182127594947815,
49
  "learning_rate": 0.0002,
50
+ "loss": 0.6764,
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.03350885591191958,
55
+ "grad_norm": 0.10777570307254791,
56
  "learning_rate": 0.0002,
57
+ "loss": 0.6284,
58
  "step": 35
59
  },
60
  {
61
+ "epoch": 0.03829583532790809,
62
+ "grad_norm": 0.08354483544826508,
63
  "learning_rate": 0.0002,
64
+ "loss": 0.6242,
65
  "step": 40
66
  },
67
  {
68
+ "epoch": 0.0430828147438966,
69
+ "grad_norm": 0.09534800797700882,
70
  "learning_rate": 0.0002,
71
+ "loss": 0.6214,
72
  "step": 45
73
  },
74
  {
75
+ "epoch": 0.04786979415988511,
76
+ "grad_norm": 0.10349296033382416,
77
  "learning_rate": 0.0002,
78
+ "loss": 0.6159,
79
  "step": 50
80
  },
81
  {
82
+ "epoch": 0.052656773575873624,
83
+ "grad_norm": 0.08007140457630157,
84
  "learning_rate": 0.0002,
85
+ "loss": 0.5848,
86
  "step": 55
87
  },
88
  {
89
+ "epoch": 0.057443752991862135,
90
+ "grad_norm": 0.07333367317914963,
91
  "learning_rate": 0.0002,
92
+ "loss": 0.6728,
93
  "step": 60
94
  },
95
  {
96
+ "epoch": 0.062230732407850646,
97
+ "grad_norm": 0.07483084499835968,
98
  "learning_rate": 0.0002,
99
+ "loss": 0.5873,
100
  "step": 65
101
  },
102
  {
103
+ "epoch": 0.06701771182383916,
104
+ "grad_norm": 0.07768324017524719,
105
  "learning_rate": 0.0002,
106
+ "loss": 0.6175,
107
  "step": 70
108
  },
109
  {
110
+ "epoch": 0.07180469123982768,
111
+ "grad_norm": 0.07919920980930328,
112
  "learning_rate": 0.0002,
113
+ "loss": 0.5256,
114
  "step": 75
115
  },
116
  {
117
+ "epoch": 0.07659167065581618,
118
+ "grad_norm": 0.09047867357730865,
119
  "learning_rate": 0.0002,
120
+ "loss": 0.5968,
121
  "step": 80
122
  },
123
  {
124
+ "epoch": 0.0813786500718047,
125
+ "grad_norm": 0.08032748848199844,
126
  "learning_rate": 0.0002,
127
+ "loss": 0.6136,
128
  "step": 85
129
  },
130
  {
131
+ "epoch": 0.0861656294877932,
132
+ "grad_norm": 0.0846623107790947,
133
  "learning_rate": 0.0002,
134
+ "loss": 0.5491,
135
  "step": 90
136
  },
137
  {
138
+ "epoch": 0.09095260890378172,
139
+ "grad_norm": 0.0998520776629448,
140
  "learning_rate": 0.0002,
141
+ "loss": 0.5526,
142
  "step": 95
143
  },
144
  {
145
+ "epoch": 0.09573958831977022,
146
+ "grad_norm": 0.08374989777803421,
147
  "learning_rate": 0.0002,
148
+ "loss": 0.5713,
149
  "step": 100
150
  },
151
  {
152
+ "epoch": 0.10052656773575874,
153
+ "grad_norm": 0.07846508920192719,
154
  "learning_rate": 0.0002,
155
+ "loss": 0.5801,
156
  "step": 105
157
  },
158
  {
159
+ "epoch": 0.10531354715174725,
160
+ "grad_norm": 0.08031659573316574,
161
  "learning_rate": 0.0002,
162
+ "loss": 0.6147,
163
  "step": 110
164
  },
165
  {
166
+ "epoch": 0.11010052656773577,
167
+ "grad_norm": 0.08738347887992859,
168
  "learning_rate": 0.0002,
169
+ "loss": 0.5208,
170
  "step": 115
171
  },
172
  {
173
+ "epoch": 0.11488750598372427,
174
+ "grad_norm": 0.0916195958852768,
175
  "learning_rate": 0.0002,
176
+ "loss": 0.5689,
177
  "step": 120
178
  },
179
  {
180
+ "epoch": 0.11967448539971279,
181
+ "grad_norm": 0.08580000698566437,
182
  "learning_rate": 0.0002,
183
+ "loss": 0.6158,
184
  "step": 125
185
  },
186
  {
187
+ "epoch": 0.12446146481570129,
188
+ "grad_norm": 0.08231440931558609,
189
  "learning_rate": 0.0002,
190
+ "loss": 0.5989,
191
  "step": 130
192
  },
193
  {
194
+ "epoch": 0.1292484442316898,
195
+ "grad_norm": 0.09185798466205597,
196
  "learning_rate": 0.0002,
197
+ "loss": 0.5117,
198
  "step": 135
199
  },
200
  {
201
+ "epoch": 0.13403542364767831,
202
+ "grad_norm": 0.0896279439330101,
203
  "learning_rate": 0.0002,
204
+ "loss": 0.4921,
205
  "step": 140
206
  },
207
  {
208
+ "epoch": 0.13882240306366683,
209
+ "grad_norm": 0.08800848573446274,
210
  "learning_rate": 0.0002,
211
+ "loss": 0.5737,
212
  "step": 145
213
  },
214
  {
215
+ "epoch": 0.14360938247965535,
216
+ "grad_norm": 0.08985792100429535,
217
  "learning_rate": 0.0002,
218
+ "loss": 0.5814,
219
  "step": 150
220
  },
221
  {
222
+ "epoch": 0.14839636189564384,
223
+ "grad_norm": 0.096456378698349,
224
  "learning_rate": 0.0002,
225
+ "loss": 0.5483,
226
  "step": 155
227
  },
228
  {
229
+ "epoch": 0.15318334131163236,
230
+ "grad_norm": 0.08564233034849167,
231
  "learning_rate": 0.0002,
232
+ "loss": 0.5258,
233
  "step": 160
234
  },
235
  {
236
+ "epoch": 0.15797032072762088,
237
+ "grad_norm": 0.08352309465408325,
238
  "learning_rate": 0.0002,
239
+ "loss": 0.516,
240
  "step": 165
241
  },
242
  {
243
+ "epoch": 0.1627573001436094,
244
+ "grad_norm": 0.08917209506034851,
245
  "learning_rate": 0.0002,
246
+ "loss": 0.5133,
247
  "step": 170
248
  },
249
  {
250
+ "epoch": 0.1675442795595979,
251
+ "grad_norm": 0.08113600313663483,
252
  "learning_rate": 0.0002,
253
+ "loss": 0.556,
254
  "step": 175
255
  },
256
  {
257
+ "epoch": 0.1723312589755864,
258
+ "grad_norm": 0.09329506009817123,
259
  "learning_rate": 0.0002,
260
+ "loss": 0.5211,
261
  "step": 180
262
  },
263
  {
264
+ "epoch": 0.17711823839157492,
265
+ "grad_norm": 0.08815263211727142,
266
  "learning_rate": 0.0002,
267
+ "loss": 0.5338,
268
  "step": 185
269
  },
270
  {
271
+ "epoch": 0.18190521780756344,
272
+ "grad_norm": 0.08324268460273743,
273
  "learning_rate": 0.0002,
274
+ "loss": 0.5066,
275
  "step": 190
276
  },
277
  {
278
+ "epoch": 0.18669219722355193,
279
+ "grad_norm": 0.0860678032040596,
280
  "learning_rate": 0.0002,
281
+ "loss": 0.5745,
282
  "step": 195
283
  },
284
  {
285
+ "epoch": 0.19147917663954045,
286
+ "grad_norm": 0.07750646024942398,
287
  "learning_rate": 0.0002,
288
+ "loss": 0.4632,
289
  "step": 200
290
  },
291
  {
292
+ "epoch": 0.19626615605552897,
293
+ "grad_norm": 0.09053143113851547,
294
  "learning_rate": 0.0002,
295
+ "loss": 0.5797,
296
  "step": 205
297
  },
298
  {
299
+ "epoch": 0.20105313547151749,
300
+ "grad_norm": 0.07899998128414154,
301
  "learning_rate": 0.0002,
302
+ "loss": 0.6043,
303
  "step": 210
304
  },
305
  {
306
+ "epoch": 0.20584011488750598,
307
+ "grad_norm": 0.09660762548446655,
308
  "learning_rate": 0.0002,
309
+ "loss": 0.5559,
310
  "step": 215
311
  },
312
  {
313
+ "epoch": 0.2106270943034945,
314
+ "grad_norm": 0.0966796949505806,
315
  "learning_rate": 0.0002,
316
+ "loss": 0.5965,
317
  "step": 220
318
  },
319
  {
320
+ "epoch": 0.215414073719483,
321
+ "grad_norm": 0.10608462989330292,
322
  "learning_rate": 0.0002,
323
+ "loss": 0.4921,
324
  "step": 225
325
  },
326
  {
327
+ "epoch": 0.22020105313547153,
328
+ "grad_norm": 0.07869511842727661,
329
  "learning_rate": 0.0002,
330
+ "loss": 0.5416,
331
  "step": 230
332
  },
333
  {
334
+ "epoch": 0.22498803255146002,
335
+ "grad_norm": 0.10257625579833984,
336
  "learning_rate": 0.0002,
337
+ "loss": 0.5703,
338
  "step": 235
339
  },
340
  {
341
+ "epoch": 0.22977501196744854,
342
+ "grad_norm": 0.09301017224788666,
343
  "learning_rate": 0.0002,
344
+ "loss": 0.5955,
345
  "step": 240
346
  },
347
  {
348
+ "epoch": 0.23456199138343706,
349
+ "grad_norm": 0.08770053088665009,
350
  "learning_rate": 0.0002,
351
+ "loss": 0.4946,
352
  "step": 245
353
  },
354
  {
355
+ "epoch": 0.23934897079942558,
356
+ "grad_norm": 0.09587664902210236,
357
  "learning_rate": 0.0002,
358
+ "loss": 0.5322,
359
  "step": 250
360
  },
361
  {
362
+ "epoch": 0.24413595021541407,
363
+ "grad_norm": 0.082343190908432,
364
  "learning_rate": 0.0002,
365
+ "loss": 0.6184,
366
  "step": 255
367
  },
368
  {
369
+ "epoch": 0.24892292963140258,
370
+ "grad_norm": 0.09046710282564163,
371
  "learning_rate": 0.0002,
372
+ "loss": 0.5508,
373
  "step": 260
374
  },
375
  {
376
+ "epoch": 0.2537099090473911,
377
+ "grad_norm": 0.09608398377895355,
378
  "learning_rate": 0.0002,
379
+ "loss": 0.5634,
380
  "step": 265
381
  },
382
  {
383
+ "epoch": 0.2584968884633796,
384
+ "grad_norm": 0.08927994221448898,
385
  "learning_rate": 0.0002,
386
+ "loss": 0.5631,
387
  "step": 270
388
  },
389
  {
390
+ "epoch": 0.26328386787936814,
391
+ "grad_norm": 0.115423783659935,
392
  "learning_rate": 0.0002,
393
+ "loss": 0.5898,
394
  "step": 275
395
  },
396
  {
397
+ "epoch": 0.26807084729535663,
398
+ "grad_norm": 0.0849870815873146,
399
  "learning_rate": 0.0002,
400
+ "loss": 0.5095,
401
  "step": 280
402
  },
403
  {
404
+ "epoch": 0.2728578267113451,
405
+ "grad_norm": 0.09704048186540604,
406
  "learning_rate": 0.0002,
407
+ "loss": 0.553,
408
  "step": 285
409
  },
410
  {
411
+ "epoch": 0.27764480612733367,
412
+ "grad_norm": 0.0753026083111763,
413
  "learning_rate": 0.0002,
414
+ "loss": 0.496,
415
  "step": 290
416
  },
417
  {
418
+ "epoch": 0.28243178554332216,
419
+ "grad_norm": 0.09067820757627487,
420
  "learning_rate": 0.0002,
421
+ "loss": 0.5093,
422
  "step": 295
423
  },
424
  {
425
+ "epoch": 0.2872187649593107,
426
+ "grad_norm": 0.09334460645914078,
427
  "learning_rate": 0.0002,
428
+ "loss": 0.5467,
429
  "step": 300
430
  },
431
  {
432
+ "epoch": 0.2920057443752992,
433
+ "grad_norm": 0.09724689275026321,
434
  "learning_rate": 0.0002,
435
+ "loss": 0.5533,
436
  "step": 305
437
  },
438
  {
439
+ "epoch": 0.2967927237912877,
440
+ "grad_norm": 0.09164885431528091,
441
  "learning_rate": 0.0002,
442
+ "loss": 0.5436,
443
  "step": 310
444
  },
445
  {
446
+ "epoch": 0.30157970320727623,
447
+ "grad_norm": 0.09583573043346405,
448
  "learning_rate": 0.0002,
449
+ "loss": 0.5408,
450
  "step": 315
451
  },
452
  {
453
+ "epoch": 0.3063666826232647,
454
+ "grad_norm": 0.0860954225063324,
455
  "learning_rate": 0.0002,
456
+ "loss": 0.4828,
457
  "step": 320
458
  },
459
  {
460
+ "epoch": 0.3111536620392532,
461
+ "grad_norm": 0.08259189128875732,
462
  "learning_rate": 0.0002,
463
+ "loss": 0.582,
464
  "step": 325
465
  },
466
  {
467
+ "epoch": 0.31594064145524176,
468
+ "grad_norm": 0.10501275211572647,
469
  "learning_rate": 0.0002,
470
+ "loss": 0.5176,
471
  "step": 330
472
  },
473
  {
474
+ "epoch": 0.32072762087123025,
475
+ "grad_norm": 0.09174875169992447,
476
  "learning_rate": 0.0002,
477
+ "loss": 0.5393,
478
  "step": 335
479
  },
480
  {
481
+ "epoch": 0.3255146002872188,
482
+ "grad_norm": 0.09675736725330353,
483
  "learning_rate": 0.0002,
484
+ "loss": 0.5495,
485
  "step": 340
486
  },
487
  {
488
+ "epoch": 0.3303015797032073,
489
+ "grad_norm": 0.08207903057336807,
490
  "learning_rate": 0.0002,
491
+ "loss": 0.5252,
492
  "step": 345
493
  },
494
  {
495
+ "epoch": 0.3350885591191958,
496
+ "grad_norm": 0.08642390370368958,
497
  "learning_rate": 0.0002,
498
+ "loss": 0.5688,
499
  "step": 350
500
  },
501
  {
502
+ "epoch": 0.3398755385351843,
503
+ "grad_norm": 0.0861140564084053,
504
  "learning_rate": 0.0002,
505
+ "loss": 0.4866,
506
  "step": 355
507
  },
508
  {
509
+ "epoch": 0.3446625179511728,
510
+ "grad_norm": 0.08826491981744766,
511
  "learning_rate": 0.0002,
512
+ "loss": 0.5392,
513
  "step": 360
514
  },
515
  {
516
+ "epoch": 0.3494494973671613,
517
+ "grad_norm": 0.09024737030267715,
518
  "learning_rate": 0.0002,
519
+ "loss": 0.5554,
520
  "step": 365
521
  },
522
  {
523
+ "epoch": 0.35423647678314985,
524
+ "grad_norm": 0.09096304327249527,
525
  "learning_rate": 0.0002,
526
+ "loss": 0.516,
527
  "step": 370
528
  },
529
  {
530
+ "epoch": 0.35902345619913834,
531
+ "grad_norm": 0.0845038965344429,
532
  "learning_rate": 0.0002,
533
+ "loss": 0.5301,
534
  "step": 375
535
  },
536
  {
537
+ "epoch": 0.3638104356151269,
538
+ "grad_norm": 0.08174905180931091,
539
  "learning_rate": 0.0002,
540
+ "loss": 0.5472,
541
  "step": 380
542
  },
543
  {
544
+ "epoch": 0.36859741503111537,
545
+ "grad_norm": 0.08673607558012009,
546
  "learning_rate": 0.0002,
547
+ "loss": 0.5648,
548
  "step": 385
549
  },
550
  {
551
+ "epoch": 0.37338439444710386,
552
+ "grad_norm": 0.08147840946912766,
553
  "learning_rate": 0.0002,
554
+ "loss": 0.5317,
555
  "step": 390
556
  },
557
  {
558
+ "epoch": 0.3781713738630924,
559
+ "grad_norm": 0.08197998255491257,
560
  "learning_rate": 0.0002,
561
+ "loss": 0.5085,
562
  "step": 395
563
  },
564
  {
565
+ "epoch": 0.3829583532790809,
566
+ "grad_norm": 0.09027797728776932,
567
  "learning_rate": 0.0002,
568
+ "loss": 0.5488,
569
  "step": 400
570
  },
571
  {
572
+ "epoch": 0.3877453326950694,
573
+ "grad_norm": 0.08635086566209793,
574
  "learning_rate": 0.0002,
575
+ "loss": 0.5182,
576
  "step": 405
577
  },
578
  {
579
+ "epoch": 0.39253231211105793,
580
+ "grad_norm": 0.09970038384199142,
581
  "learning_rate": 0.0002,
582
+ "loss": 0.5852,
583
  "step": 410
584
  },
585
  {
586
+ "epoch": 0.3973192915270464,
587
+ "grad_norm": 0.08561892062425613,
588
  "learning_rate": 0.0002,
589
+ "loss": 0.5144,
590
  "step": 415
591
  },
592
  {
593
+ "epoch": 0.40210627094303497,
594
+ "grad_norm": 0.08953725546598434,
595
  "learning_rate": 0.0002,
596
+ "loss": 0.5064,
597
  "step": 420
598
  },
599
  {
600
+ "epoch": 0.40689325035902346,
601
+ "grad_norm": 0.09641014784574509,
602
  "learning_rate": 0.0002,
603
+ "loss": 0.4849,
604
  "step": 425
605
  },
606
  {
607
+ "epoch": 0.41168022977501195,
608
+ "grad_norm": 0.09051619470119476,
609
  "learning_rate": 0.0002,
610
+ "loss": 0.5777,
611
  "step": 430
612
  },
613
  {
614
+ "epoch": 0.4164672091910005,
615
+ "grad_norm": 0.08543870598077774,
616
  "learning_rate": 0.0002,
617
+ "loss": 0.5299,
618
  "step": 435
619
  },
620
  {
621
+ "epoch": 0.421254188606989,
622
+ "grad_norm": 0.08574735373258591,
623
  "learning_rate": 0.0002,
624
+ "loss": 0.5573,
625
  "step": 440
626
  },
627
  {
628
+ "epoch": 0.4260411680229775,
629
+ "grad_norm": 0.09401609748601913,
630
  "learning_rate": 0.0002,
631
+ "loss": 0.5643,
632
  "step": 445
633
  },
634
  {
635
+ "epoch": 0.430828147438966,
636
+ "grad_norm": 0.10760053247213364,
637
  "learning_rate": 0.0002,
638
+ "loss": 0.516,
639
  "step": 450
640
  },
641
  {
642
+ "epoch": 0.4356151268549545,
643
+ "grad_norm": 0.09510120749473572,
644
  "learning_rate": 0.0002,
645
+ "loss": 0.505,
646
  "step": 455
647
  },
648
  {
649
+ "epoch": 0.44040210627094306,
650
+ "grad_norm": 0.09105115383863449,
651
  "learning_rate": 0.0002,
652
+ "loss": 0.5717,
653
  "step": 460
654
  },
655
  {
656
+ "epoch": 0.44518908568693155,
657
+ "grad_norm": 0.0891876295208931,
658
  "learning_rate": 0.0002,
659
+ "loss": 0.5258,
660
  "step": 465
661
  },
662
  {
663
+ "epoch": 0.44997606510292004,
664
+ "grad_norm": 0.08933177590370178,
665
  "learning_rate": 0.0002,
666
+ "loss": 0.4951,
667
  "step": 470
668
  },
669
  {
670
+ "epoch": 0.4547630445189086,
671
+ "grad_norm": 0.09821013361215591,
672
  "learning_rate": 0.0002,
673
+ "loss": 0.5422,
674
  "step": 475
675
  },
676
  {
677
+ "epoch": 0.4595500239348971,
678
+ "grad_norm": 0.090922050178051,
679
  "learning_rate": 0.0002,
680
+ "loss": 0.5286,
681
  "step": 480
682
  },
683
  {
684
+ "epoch": 0.46433700335088557,
685
+ "grad_norm": 0.09325899183750153,
686
  "learning_rate": 0.0002,
687
+ "loss": 0.596,
688
  "step": 485
689
  },
690
  {
691
+ "epoch": 0.4691239827668741,
692
+ "grad_norm": 0.09565772861242294,
693
  "learning_rate": 0.0002,
694
+ "loss": 0.4855,
695
  "step": 490
696
  },
697
  {
698
+ "epoch": 0.4739109621828626,
699
+ "grad_norm": 0.08238258212804794,
700
  "learning_rate": 0.0002,
701
+ "loss": 0.534,
702
  "step": 495
703
  },
704
  {
705
+ "epoch": 0.47869794159885115,
706
+ "grad_norm": 0.10455012321472168,
707
  "learning_rate": 0.0002,
708
+ "loss": 0.5615,
709
  "step": 500
710
  },
711
  {
712
+ "epoch": 0.48348492101483964,
713
+ "grad_norm": 0.07809582352638245,
714
  "learning_rate": 0.0002,
715
+ "loss": 0.5319,
716
  "step": 505
717
  },
718
  {
719
+ "epoch": 0.48827190043082813,
720
+ "grad_norm": 0.09158290922641754,
721
  "learning_rate": 0.0002,
722
+ "loss": 0.5149,
723
  "step": 510
724
  },
725
  {
726
+ "epoch": 0.4930588798468167,
727
+ "grad_norm": 0.09475893527269363,
728
  "learning_rate": 0.0002,
729
+ "loss": 0.5548,
730
  "step": 515
731
  },
732
  {
733
+ "epoch": 0.49784585926280517,
734
+ "grad_norm": 0.08862445503473282,
735
  "learning_rate": 0.0002,
736
+ "loss": 0.551,
737
  "step": 520
738
  },
739
  {
740
+ "epoch": 0.5026328386787937,
741
+ "grad_norm": 0.08608075976371765,
742
  "learning_rate": 0.0002,
743
+ "loss": 0.5032,
744
  "step": 525
745
  },
746
  {
747
+ "epoch": 0.5074198180947822,
748
+ "grad_norm": 0.09171325713396072,
749
  "learning_rate": 0.0002,
750
+ "loss": 0.4872,
751
  "step": 530
752
  },
753
  {
754
+ "epoch": 0.5122067975107707,
755
+ "grad_norm": 0.08891316503286362,
756
  "learning_rate": 0.0002,
757
+ "loss": 0.5381,
758
  "step": 535
759
  },
760
  {
761
+ "epoch": 0.5169937769267592,
762
+ "grad_norm": 0.09202417731285095,
763
  "learning_rate": 0.0002,
764
+ "loss": 0.5642,
765
  "step": 540
766
  },
767
  {
768
+ "epoch": 0.5217807563427477,
769
+ "grad_norm": 0.09024330973625183,
770
  "learning_rate": 0.0002,
771
+ "loss": 0.4638,
772
  "step": 545
773
  },
774
  {
775
+ "epoch": 0.5265677357587363,
776
+ "grad_norm": 0.08484344184398651,
777
  "learning_rate": 0.0002,
778
+ "loss": 0.5171,
779
  "step": 550
780
  },
781
  {
782
+ "epoch": 0.5313547151747248,
783
+ "grad_norm": 0.09126883000135422,
784
  "learning_rate": 0.0002,
785
+ "loss": 0.4778,
786
  "step": 555
787
  },
788
  {
789
+ "epoch": 0.5361416945907133,
790
+ "grad_norm": 0.08565142005681992,
791
  "learning_rate": 0.0002,
792
+ "loss": 0.5264,
793
  "step": 560
794
  },
795
  {
796
+ "epoch": 0.5409286740067017,
797
+ "grad_norm": 0.09363921731710434,
798
  "learning_rate": 0.0002,
799
+ "loss": 0.5261,
800
  "step": 565
801
  },
802
  {
803
+ "epoch": 0.5457156534226902,
804
+ "grad_norm": 0.08321545273065567,
805
  "learning_rate": 0.0002,
806
+ "loss": 0.4724,
807
  "step": 570
808
  },
809
  {
810
+ "epoch": 0.5505026328386788,
811
+ "grad_norm": 0.08636103570461273,
812
  "learning_rate": 0.0002,
813
+ "loss": 0.5383,
814
  "step": 575
815
  },
816
  {
817
+ "epoch": 0.5552896122546673,
818
+ "grad_norm": 0.0867634192109108,
819
  "learning_rate": 0.0002,
820
+ "loss": 0.571,
821
  "step": 580
822
  },
823
  {
824
+ "epoch": 0.5600765916706558,
825
+ "grad_norm": 0.09202156215906143,
826
  "learning_rate": 0.0002,
827
+ "loss": 0.4925,
828
  "step": 585
829
  },
830
  {
831
+ "epoch": 0.5648635710866443,
832
+ "grad_norm": 0.08338255435228348,
833
  "learning_rate": 0.0002,
834
+ "loss": 0.4724,
835
  "step": 590
836
  },
837
  {
838
+ "epoch": 0.5696505505026328,
839
+ "grad_norm": 0.09248416125774384,
840
  "learning_rate": 0.0002,
841
+ "loss": 0.5339,
842
  "step": 595
843
  },
844
  {
845
+ "epoch": 0.5744375299186214,
846
+ "grad_norm": 0.08971364796161652,
847
  "learning_rate": 0.0002,
848
+ "loss": 0.5467,
849
  "step": 600
850
  },
851
  {
852
+ "epoch": 0.5792245093346099,
853
+ "grad_norm": 0.10297700017690659,
854
  "learning_rate": 0.0002,
855
+ "loss": 0.5269,
856
  "step": 605
857
  },
858
  {
859
+ "epoch": 0.5840114887505984,
860
+ "grad_norm": 0.09885570406913757,
861
  "learning_rate": 0.0002,
862
+ "loss": 0.5741,
863
  "step": 610
864
  },
865
  {
866
+ "epoch": 0.5887984681665869,
867
+ "grad_norm": 0.0943949893116951,
868
  "learning_rate": 0.0002,
869
+ "loss": 0.5107,
870
  "step": 615
871
  },
872
  {
873
+ "epoch": 0.5935854475825754,
874
+ "grad_norm": 0.09385235607624054,
875
  "learning_rate": 0.0002,
876
+ "loss": 0.5522,
877
  "step": 620
878
  },
879
  {
880
+ "epoch": 0.5983724269985639,
881
+ "grad_norm": 0.0906907171010971,
882
  "learning_rate": 0.0002,
883
+ "loss": 0.4684,
884
  "step": 625
885
  },
886
  {
887
+ "epoch": 0.6031594064145525,
888
+ "grad_norm": 0.08867505192756653,
889
  "learning_rate": 0.0002,
890
+ "loss": 0.4637,
891
  "step": 630
892
  },
893
  {
894
+ "epoch": 0.607946385830541,
895
+ "grad_norm": 0.0929451733827591,
896
  "learning_rate": 0.0002,
897
+ "loss": 0.5462,
898
  "step": 635
899
  },
900
  {
901
+ "epoch": 0.6127333652465294,
902
+ "grad_norm": 0.08720085769891739,
903
  "learning_rate": 0.0002,
904
+ "loss": 0.497,
905
  "step": 640
906
  },
907
  {
908
+ "epoch": 0.6175203446625179,
909
+ "grad_norm": 0.10713039338588715,
910
  "learning_rate": 0.0002,
911
+ "loss": 0.5448,
912
  "step": 645
913
  },
914
  {
915
+ "epoch": 0.6223073240785064,
916
+ "grad_norm": 0.08213481307029724,
917
  "learning_rate": 0.0002,
918
+ "loss": 0.4816,
919
  "step": 650
920
  },
921
  {
922
+ "epoch": 0.627094303494495,
923
+ "grad_norm": 0.08939921110868454,
924
  "learning_rate": 0.0002,
925
+ "loss": 0.4883,
926
  "step": 655
927
  },
928
  {
929
+ "epoch": 0.6318812829104835,
930
+ "grad_norm": 0.09071970731019974,
931
  "learning_rate": 0.0002,
932
+ "loss": 0.5411,
933
  "step": 660
934
  },
935
  {
936
+ "epoch": 0.636668262326472,
937
+ "grad_norm": 0.09525053203105927,
938
  "learning_rate": 0.0002,
939
+ "loss": 0.4966,
940
  "step": 665
941
  },
942
  {
943
+ "epoch": 0.6414552417424605,
944
+ "grad_norm": 0.08770790696144104,
945
  "learning_rate": 0.0002,
946
+ "loss": 0.4786,
947
  "step": 670
948
  },
949
  {
950
+ "epoch": 0.646242221158449,
951
+ "grad_norm": 0.08054076880216599,
952
  "learning_rate": 0.0002,
953
+ "loss": 0.5051,
954
  "step": 675
955
  },
956
  {
957
+ "epoch": 0.6510292005744376,
958
+ "grad_norm": 0.08313776552677155,
959
  "learning_rate": 0.0002,
960
+ "loss": 0.5547,
961
  "step": 680
962
  },
963
  {
964
+ "epoch": 0.6558161799904261,
965
+ "grad_norm": 0.0805881917476654,
966
  "learning_rate": 0.0002,
967
+ "loss": 0.495,
968
  "step": 685
969
  },
970
  {
971
+ "epoch": 0.6606031594064146,
972
+ "grad_norm": 0.10019008070230484,
973
  "learning_rate": 0.0002,
974
+ "loss": 0.4683,
975
  "step": 690
976
  },
977
  {
978
+ "epoch": 0.665390138822403,
979
+ "grad_norm": 0.08097992837429047,
980
  "learning_rate": 0.0002,
981
+ "loss": 0.5511,
982
  "step": 695
983
  },
984
  {
985
+ "epoch": 0.6701771182383915,
986
+ "grad_norm": 0.08138570934534073,
987
  "learning_rate": 0.0002,
988
+ "loss": 0.5638,
989
  "step": 700
990
  },
991
  {
992
+ "epoch": 0.67496409765438,
993
+ "grad_norm": 0.09005066752433777,
994
  "learning_rate": 0.0002,
995
+ "loss": 0.4591,
996
  "step": 705
997
  },
998
  {
999
+ "epoch": 0.6797510770703686,
1000
+ "grad_norm": 0.09737958759069443,
1001
  "learning_rate": 0.0002,
1002
+ "loss": 0.5003,
1003
  "step": 710
1004
  },
1005
  {
1006
+ "epoch": 0.6845380564863571,
1007
+ "grad_norm": 0.0959305465221405,
1008
  "learning_rate": 0.0002,
1009
+ "loss": 0.5645,
1010
  "step": 715
1011
  },
1012
  {
1013
+ "epoch": 0.6893250359023456,
1014
+ "grad_norm": 0.0876409187912941,
1015
  "learning_rate": 0.0002,
1016
+ "loss": 0.531,
1017
  "step": 720
1018
  },
1019
  {
1020
+ "epoch": 0.6941120153183341,
1021
+ "grad_norm": 0.09579559415578842,
1022
  "learning_rate": 0.0002,
1023
+ "loss": 0.5717,
1024
  "step": 725
1025
  },
1026
  {
1027
+ "epoch": 0.6988989947343226,
1028
+ "grad_norm": 0.08657323569059372,
1029
  "learning_rate": 0.0002,
1030
+ "loss": 0.4846,
1031
  "step": 730
1032
  },
1033
  {
1034
+ "epoch": 0.7036859741503112,
1035
+ "grad_norm": 0.08424372225999832,
1036
  "learning_rate": 0.0002,
1037
+ "loss": 0.5175,
1038
  "step": 735
1039
  },
1040
  {
1041
+ "epoch": 0.7084729535662997,
1042
+ "grad_norm": 0.0895078107714653,
1043
  "learning_rate": 0.0002,
1044
+ "loss": 0.518,
1045
  "step": 740
1046
  },
1047
  {
1048
+ "epoch": 0.7132599329822882,
1049
+ "grad_norm": 0.08580939471721649,
1050
  "learning_rate": 0.0002,
1051
+ "loss": 0.5601,
1052
  "step": 745
1053
  },
1054
  {
1055
+ "epoch": 0.7180469123982767,
1056
+ "grad_norm": 0.0797315239906311,
1057
  "learning_rate": 0.0002,
1058
+ "loss": 0.5354,
1059
  "step": 750
1060
  },
1061
  {
1062
+ "epoch": 0.7228338918142652,
1063
+ "grad_norm": 0.08981385827064514,
1064
  "learning_rate": 0.0002,
1065
+ "loss": 0.5638,
1066
  "step": 755
1067
  },
1068
  {
1069
+ "epoch": 0.7276208712302538,
1070
+ "grad_norm": 0.09025374054908752,
1071
  "learning_rate": 0.0002,
1072
+ "loss": 0.5028,
1073
  "step": 760
1074
  },
1075
  {
1076
+ "epoch": 0.7324078506462423,
1077
+ "grad_norm": 0.09753820300102234,
1078
  "learning_rate": 0.0002,
1079
+ "loss": 0.5237,
1080
  "step": 765
1081
  },
1082
  {
1083
+ "epoch": 0.7371948300622307,
1084
+ "grad_norm": 0.08967633545398712,
1085
  "learning_rate": 0.0002,
1086
+ "loss": 0.4636,
1087
  "step": 770
1088
  },
1089
  {
1090
+ "epoch": 0.7419818094782192,
1091
+ "grad_norm": 0.10074934363365173,
1092
  "learning_rate": 0.0002,
1093
+ "loss": 0.5276,
1094
  "step": 775
1095
  },
1096
  {
1097
+ "epoch": 0.7467687888942077,
1098
+ "grad_norm": 0.0874541625380516,
1099
  "learning_rate": 0.0002,
1100
+ "loss": 0.5085,
1101
  "step": 780
1102
  },
1103
  {
1104
+ "epoch": 0.7515557683101962,
1105
+ "grad_norm": 0.084027960896492,
1106
  "learning_rate": 0.0002,
1107
+ "loss": 0.5062,
1108
  "step": 785
1109
  },
1110
  {
1111
+ "epoch": 0.7563427477261848,
1112
+ "grad_norm": 0.08965150266885757,
1113
  "learning_rate": 0.0002,
1114
+ "loss": 0.5214,
1115
  "step": 790
1116
  },
1117
  {
1118
+ "epoch": 0.7611297271421733,
1119
+ "grad_norm": 0.08234406262636185,
1120
  "learning_rate": 0.0002,
1121
+ "loss": 0.4863,
1122
  "step": 795
1123
  },
1124
  {
1125
+ "epoch": 0.7659167065581618,
1126
+ "grad_norm": 0.08266417682170868,
1127
  "learning_rate": 0.0002,
1128
+ "loss": 0.5149,
1129
  "step": 800
1130
  },
1131
  {
1132
+ "epoch": 0.7707036859741503,
1133
+ "grad_norm": 0.08559945225715637,
1134
  "learning_rate": 0.0002,
1135
+ "loss": 0.528,
1136
  "step": 805
1137
  },
1138
  {
1139
+ "epoch": 0.7754906653901388,
1140
+ "grad_norm": 0.08358705043792725,
1141
  "learning_rate": 0.0002,
1142
+ "loss": 0.5283,
1143
  "step": 810
1144
  },
1145
  {
1146
+ "epoch": 0.7802776448061274,
1147
+ "grad_norm": 0.08530480414628983,
1148
  "learning_rate": 0.0002,
1149
+ "loss": 0.4868,
1150
  "step": 815
1151
  },
1152
  {
1153
+ "epoch": 0.7850646242221159,
1154
+ "grad_norm": 0.08576823025941849,
1155
  "learning_rate": 0.0002,
1156
+ "loss": 0.5277,
1157
  "step": 820
1158
  },
1159
  {
1160
+ "epoch": 0.7898516036381044,
1161
+ "grad_norm": 0.10152282565832138,
1162
+ "learning_rate": 0.0002,
1163
+ "loss": 0.4652,
1164
+ "step": 825
1165
+ },
1166
+ {
1167
+ "epoch": 0.7946385830540929,
1168
+ "grad_norm": 0.08843079209327698,
1169
+ "learning_rate": 0.0002,
1170
+ "loss": 0.5194,
1171
+ "step": 830
1172
+ },
1173
+ {
1174
+ "epoch": 0.7994255624700813,
1175
+ "grad_norm": 0.08835287392139435,
1176
+ "learning_rate": 0.0002,
1177
+ "loss": 0.5352,
1178
+ "step": 835
1179
+ },
1180
+ {
1181
+ "epoch": 0.8042125418860699,
1182
+ "grad_norm": 0.08630600571632385,
1183
+ "learning_rate": 0.0002,
1184
+ "loss": 0.4869,
1185
+ "step": 840
1186
+ },
1187
+ {
1188
+ "epoch": 0.8089995213020584,
1189
+ "grad_norm": 0.08701962232589722,
1190
+ "learning_rate": 0.0002,
1191
+ "loss": 0.5048,
1192
+ "step": 845
1193
+ },
1194
+ {
1195
+ "epoch": 0.8137865007180469,
1196
+ "grad_norm": 0.09896954894065857,
1197
+ "learning_rate": 0.0002,
1198
+ "loss": 0.5398,
1199
+ "step": 850
1200
+ },
1201
+ {
1202
+ "epoch": 0.8185734801340354,
1203
+ "grad_norm": 0.0876292958855629,
1204
+ "learning_rate": 0.0002,
1205
+ "loss": 0.5801,
1206
+ "step": 855
1207
+ },
1208
+ {
1209
+ "epoch": 0.8233604595500239,
1210
+ "grad_norm": 0.08727893233299255,
1211
+ "learning_rate": 0.0002,
1212
+ "loss": 0.5378,
1213
+ "step": 860
1214
+ },
1215
+ {
1216
+ "epoch": 0.8281474389660124,
1217
+ "grad_norm": 0.08662202209234238,
1218
+ "learning_rate": 0.0002,
1219
+ "loss": 0.5804,
1220
+ "step": 865
1221
+ },
1222
+ {
1223
+ "epoch": 0.832934418382001,
1224
+ "grad_norm": 0.08253654092550278,
1225
+ "learning_rate": 0.0002,
1226
+ "loss": 0.4701,
1227
+ "step": 870
1228
+ },
1229
+ {
1230
+ "epoch": 0.8377213977979895,
1231
+ "grad_norm": 0.08907407522201538,
1232
+ "learning_rate": 0.0002,
1233
+ "loss": 0.4918,
1234
+ "step": 875
1235
+ },
1236
+ {
1237
+ "epoch": 0.842508377213978,
1238
+ "grad_norm": 0.09331085532903671,
1239
+ "learning_rate": 0.0002,
1240
+ "loss": 0.5438,
1241
+ "step": 880
1242
+ },
1243
+ {
1244
+ "epoch": 0.8472953566299665,
1245
+ "grad_norm": 0.09129630029201508,
1246
+ "learning_rate": 0.0002,
1247
+ "loss": 0.5327,
1248
+ "step": 885
1249
+ },
1250
+ {
1251
+ "epoch": 0.852082336045955,
1252
+ "grad_norm": 0.09735500812530518,
1253
+ "learning_rate": 0.0002,
1254
+ "loss": 0.5281,
1255
+ "step": 890
1256
+ },
1257
+ {
1258
+ "epoch": 0.8568693154619436,
1259
+ "grad_norm": 0.0904528871178627,
1260
+ "learning_rate": 0.0002,
1261
+ "loss": 0.4964,
1262
+ "step": 895
1263
+ },
1264
+ {
1265
+ "epoch": 0.861656294877932,
1266
+ "grad_norm": 0.08291352540254593,
1267
+ "learning_rate": 0.0002,
1268
+ "loss": 0.5192,
1269
+ "step": 900
1270
+ },
1271
+ {
1272
+ "epoch": 0.8664432742939205,
1273
+ "grad_norm": 0.09108038246631622,
1274
+ "learning_rate": 0.0002,
1275
+ "loss": 0.501,
1276
+ "step": 905
1277
+ },
1278
+ {
1279
+ "epoch": 0.871230253709909,
1280
+ "grad_norm": 0.09137269854545593,
1281
+ "learning_rate": 0.0002,
1282
+ "loss": 0.5012,
1283
+ "step": 910
1284
+ },
1285
+ {
1286
+ "epoch": 0.8760172331258975,
1287
+ "grad_norm": 0.08169892430305481,
1288
+ "learning_rate": 0.0002,
1289
+ "loss": 0.4994,
1290
+ "step": 915
1291
+ },
1292
+ {
1293
+ "epoch": 0.8808042125418861,
1294
+ "grad_norm": 0.08467283844947815,
1295
+ "learning_rate": 0.0002,
1296
+ "loss": 0.4502,
1297
+ "step": 920
1298
+ },
1299
+ {
1300
+ "epoch": 0.8855911919578746,
1301
+ "grad_norm": 0.08680226653814316,
1302
+ "learning_rate": 0.0002,
1303
+ "loss": 0.5508,
1304
+ "step": 925
1305
+ },
1306
+ {
1307
+ "epoch": 0.8903781713738631,
1308
+ "grad_norm": 0.08897334337234497,
1309
+ "learning_rate": 0.0002,
1310
+ "loss": 0.5281,
1311
+ "step": 930
1312
+ },
1313
+ {
1314
+ "epoch": 0.8951651507898516,
1315
+ "grad_norm": 0.09124335646629333,
1316
+ "learning_rate": 0.0002,
1317
+ "loss": 0.5173,
1318
+ "step": 935
1319
+ },
1320
+ {
1321
+ "epoch": 0.8999521302058401,
1322
+ "grad_norm": 0.08976174145936966,
1323
+ "learning_rate": 0.0002,
1324
+ "loss": 0.5519,
1325
+ "step": 940
1326
+ },
1327
+ {
1328
+ "epoch": 0.9047391096218286,
1329
+ "grad_norm": 0.07799748331308365,
1330
+ "learning_rate": 0.0002,
1331
+ "loss": 0.5495,
1332
+ "step": 945
1333
+ },
1334
+ {
1335
+ "epoch": 0.9095260890378172,
1336
+ "grad_norm": 0.08304045349359512,
1337
+ "learning_rate": 0.0002,
1338
+ "loss": 0.5552,
1339
+ "step": 950
1340
+ },
1341
+ {
1342
+ "epoch": 0.9143130684538057,
1343
+ "grad_norm": 0.08134391158819199,
1344
+ "learning_rate": 0.0002,
1345
+ "loss": 0.4953,
1346
+ "step": 955
1347
+ },
1348
+ {
1349
+ "epoch": 0.9191000478697942,
1350
+ "grad_norm": 0.102556973695755,
1351
+ "learning_rate": 0.0002,
1352
+ "loss": 0.5841,
1353
+ "step": 960
1354
+ },
1355
+ {
1356
+ "epoch": 0.9238870272857826,
1357
+ "grad_norm": 0.09310037642717361,
1358
+ "learning_rate": 0.0002,
1359
+ "loss": 0.4977,
1360
+ "step": 965
1361
+ },
1362
+ {
1363
+ "epoch": 0.9286740067017711,
1364
+ "grad_norm": 0.08947998285293579,
1365
+ "learning_rate": 0.0002,
1366
+ "loss": 0.5147,
1367
+ "step": 970
1368
+ },
1369
+ {
1370
+ "epoch": 0.9334609861177597,
1371
+ "grad_norm": 0.0801323875784874,
1372
+ "learning_rate": 0.0002,
1373
+ "loss": 0.5379,
1374
+ "step": 975
1375
+ },
1376
+ {
1377
+ "epoch": 0.9382479655337482,
1378
+ "grad_norm": 0.09458567947149277,
1379
+ "learning_rate": 0.0002,
1380
+ "loss": 0.463,
1381
+ "step": 980
1382
+ },
1383
+ {
1384
+ "epoch": 0.9430349449497367,
1385
+ "grad_norm": 0.08248139917850494,
1386
+ "learning_rate": 0.0002,
1387
+ "loss": 0.4899,
1388
+ "step": 985
1389
+ },
1390
+ {
1391
+ "epoch": 0.9478219243657252,
1392
+ "grad_norm": 0.08913381397724152,
1393
+ "learning_rate": 0.0002,
1394
+ "loss": 0.5455,
1395
+ "step": 990
1396
+ },
1397
+ {
1398
+ "epoch": 0.9526089037817137,
1399
+ "grad_norm": 0.09054595977067947,
1400
+ "learning_rate": 0.0002,
1401
+ "loss": 0.528,
1402
+ "step": 995
1403
+ },
1404
+ {
1405
+ "epoch": 0.9573958831977023,
1406
+ "grad_norm": 0.0929536446928978,
1407
+ "learning_rate": 0.0002,
1408
+ "loss": 0.5595,
1409
+ "step": 1000
1410
+ },
1411
+ {
1412
+ "epoch": 0.9621828626136908,
1413
+ "grad_norm": 0.09117671847343445,
1414
+ "learning_rate": 0.0002,
1415
+ "loss": 0.5212,
1416
+ "step": 1005
1417
+ },
1418
+ {
1419
+ "epoch": 0.9669698420296793,
1420
+ "grad_norm": 0.09163827449083328,
1421
+ "learning_rate": 0.0002,
1422
+ "loss": 0.4587,
1423
+ "step": 1010
1424
+ },
1425
+ {
1426
+ "epoch": 0.9717568214456678,
1427
+ "grad_norm": 0.09541551768779755,
1428
+ "learning_rate": 0.0002,
1429
+ "loss": 0.5352,
1430
+ "step": 1015
1431
+ },
1432
+ {
1433
+ "epoch": 0.9765438008616563,
1434
+ "grad_norm": 0.09220823645591736,
1435
+ "learning_rate": 0.0002,
1436
+ "loss": 0.5599,
1437
+ "step": 1020
1438
+ },
1439
+ {
1440
+ "epoch": 0.9813307802776448,
1441
+ "grad_norm": 0.09834371507167816,
1442
+ "learning_rate": 0.0002,
1443
+ "loss": 0.5605,
1444
+ "step": 1025
1445
+ },
1446
+ {
1447
+ "epoch": 0.9861177596936334,
1448
+ "grad_norm": 0.08727829903364182,
1449
+ "learning_rate": 0.0002,
1450
+ "loss": 0.5427,
1451
+ "step": 1030
1452
+ },
1453
+ {
1454
+ "epoch": 0.9909047391096218,
1455
+ "grad_norm": 0.09128595143556595,
1456
+ "learning_rate": 0.0002,
1457
+ "loss": 0.574,
1458
+ "step": 1035
1459
+ },
1460
+ {
1461
+ "epoch": 0.9956917185256103,
1462
+ "grad_norm": 0.09627512097358704,
1463
+ "learning_rate": 0.0002,
1464
+ "loss": 0.4791,
1465
+ "step": 1040
1466
+ },
1467
+ {
1468
+ "epoch": 0.9995213020584012,
1469
+ "step": 1044,
1470
+ "total_flos": 2.304533815525294e+17,
1471
+ "train_loss": 0.5449674188862359,
1472
+ "train_runtime": 2251.699,
1473
+ "train_samples_per_second": 7.421,
1474
+ "train_steps_per_second": 0.464
1475
  }
1476
  ],
1477
  "logging_steps": 5,
1478
+ "max_steps": 1044,
1479
  "num_input_tokens_seen": 0,
1480
  "num_train_epochs": 1,
1481
  "save_steps": 500,
 
1491
  "attributes": {}
1492
  }
1493
  },
1494
+ "total_flos": 2.304533815525294e+17,
1495
  "train_batch_size": 8,
1496
  "trial_name": null,
1497
  "trial_params": null