yyx123 committed on
Commit
0f23aae
·
verified ·
1 Parent(s): b09e452

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,11 @@
2
  license: other
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
- - generated_from_trainer
7
  - trl
8
  - sft
9
  - generated_from_trainer
10
  datasets:
11
- - zhihu
12
  base_model: 01-ai/Yi-6B
13
  model-index:
14
  - name: Yi-6B-zhihu5
@@ -20,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
20
 
21
  # Yi-6B-zhihu5
22
 
23
- This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the zhihu dataset.
24
  It achieves the following results on the evaluation set:
25
- - Loss: 2.6301
26
 
27
  ## Model description
28
 
@@ -41,9 +39,9 @@ More information needed
41
  ### Training hyperparameters
42
 
43
  The following hyperparameters were used during training:
44
- - learning_rate: 1e-06
45
- - train_batch_size: 8
46
- - eval_batch_size: 4
47
  - seed: 42
48
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
  - lr_scheduler_type: cosine
@@ -54,26 +52,26 @@ The following hyperparameters were used during training:
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
- | 2.6842 | 1.0 | 96 | 2.6359 |
58
- | 2.6736 | 2.0 | 192 | 2.6350 |
59
- | 2.7002 | 3.0 | 288 | 2.6337 |
60
- | 2.6785 | 4.0 | 384 | 2.6323 |
61
- | 2.6275 | 5.0 | 480 | 2.6314 |
62
- | 2.6281 | 6.0 | 576 | 2.6308 |
63
- | 2.7613 | 7.0 | 672 | 2.6305 |
64
- | 2.6652 | 8.0 | 768 | 2.6303 |
65
- | 2.6029 | 9.0 | 864 | 2.6302 |
66
- | 2.6551 | 10.0 | 960 | 2.6301 |
67
- | 2.6734 | 11.0 | 1056 | 2.6301 |
68
- | 2.6927 | 12.0 | 1152 | 2.6301 |
69
- | 2.5663 | 13.0 | 1248 | 2.6301 |
70
- | 2.5786 | 14.0 | 1344 | 2.6301 |
71
- | 2.574 | 15.0 | 1440 | 2.6301 |
72
- | 2.6113 | 16.0 | 1536 | 2.6301 |
73
- | 2.6837 | 17.0 | 1632 | 2.6301 |
74
- | 2.5966 | 18.0 | 1728 | 2.6301 |
75
- | 2.5931 | 19.0 | 1824 | 2.6301 |
76
- | 2.6933 | 20.0 | 1920 | 2.6301 |
77
 
78
 
79
  ### Framework versions
 
2
  license: other
3
  library_name: peft
4
  tags:
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
  datasets:
9
+ - generator
10
  base_model: 01-ai/Yi-6B
11
  model-index:
12
  - name: Yi-6B-zhihu5
 
18
 
19
  # Yi-6B-zhihu5
20
 
21
+ This model is a fine-tuned version of [01-ai/Yi-6B](https://huggingface.co/01-ai/Yi-6B) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 2.5677
24
 
25
  ## Model description
26
 
 
39
  ### Training hyperparameters
40
 
41
  The following hyperparameters were used during training:
42
+ - learning_rate: 5e-06
43
+ - train_batch_size: 16
44
+ - eval_batch_size: 8
45
  - seed: 42
46
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
  - lr_scheduler_type: cosine
 
52
 
53
  | Training Loss | Epoch | Step | Validation Loss |
54
  |:-------------:|:-----:|:----:|:---------------:|
55
+ | 2.6816 | 1.0 | 96 | 2.6328 |
56
+ | 2.6582 | 2.0 | 192 | 2.6169 |
57
+ | 2.6676 | 3.0 | 288 | 2.5983 |
58
+ | 2.6413 | 4.0 | 384 | 2.5891 |
59
+ | 2.581 | 5.0 | 480 | 2.5825 |
60
+ | 2.5884 | 6.0 | 576 | 2.5776 |
61
+ | 2.704 | 7.0 | 672 | 2.5741 |
62
+ | 2.608 | 8.0 | 768 | 2.5715 |
63
+ | 2.5454 | 9.0 | 864 | 2.5698 |
64
+ | 2.5938 | 10.0 | 960 | 2.5688 |
65
+ | 2.6129 | 11.0 | 1056 | 2.5682 |
66
+ | 2.6334 | 12.0 | 1152 | 2.5679 |
67
+ | 2.5013 | 13.0 | 1248 | 2.5678 |
68
+ | 2.519 | 14.0 | 1344 | 2.5677 |
69
+ | 2.5076 | 15.0 | 1440 | 2.5677 |
70
+ | 2.5443 | 16.0 | 1536 | 2.5677 |
71
+ | 2.5972 | 17.0 | 1632 | 2.5677 |
72
+ | 2.5361 | 18.0 | 1728 | 2.5677 |
73
+ | 2.5317 | 19.0 | 1824 | 2.5677 |
74
+ | 2.632 | 20.0 | 1920 | 2.5677 |
75
 
76
 
77
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a1a35d3ba957076616882f4d08bc0010e3e9f4b78dc6fc3dbaa2fdec4383901
3
  size 72673912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09603b655b91df777bd2d1500f9b1e168d1a951b2095fccbcc81e5fe59331c61
3
  size 72673912
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 20.0,
3
- "eval_loss": 2.6300840377807617,
4
- "eval_runtime": 173.3303,
5
  "eval_samples": 2561,
6
- "eval_samples_per_second": 4.419,
7
- "eval_steps_per_second": 1.108,
8
- "train_loss": 1.8769829372564952,
9
- "train_runtime": 10573.7699,
10
  "train_samples": 2561,
11
- "train_samples_per_second": 1.449,
12
- "train_steps_per_second": 0.182
13
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "eval_loss": 2.5677051544189453,
4
+ "eval_runtime": 164.7624,
5
  "eval_samples": 2561,
6
+ "eval_samples_per_second": 4.649,
7
+ "eval_steps_per_second": 0.583,
8
+ "train_loss": 0.428011018037796,
9
+ "train_runtime": 2530.9924,
10
  "train_samples": 2561,
11
+ "train_samples_per_second": 6.053,
12
+ "train_steps_per_second": 0.759
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 20.0,
3
- "eval_loss": 2.6300840377807617,
4
- "eval_runtime": 173.3303,
5
  "eval_samples": 2561,
6
- "eval_samples_per_second": 4.419,
7
- "eval_steps_per_second": 1.108
8
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "eval_loss": 2.5677051544189453,
4
+ "eval_runtime": 164.7624,
5
  "eval_samples": 2561,
6
+ "eval_samples_per_second": 4.649,
7
+ "eval_steps_per_second": 0.583
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 20.0,
3
- "train_loss": 1.8769829372564952,
4
- "train_runtime": 10573.7699,
5
  "train_samples": 2561,
6
- "train_samples_per_second": 1.449,
7
- "train_steps_per_second": 0.182
8
  }
 
1
  {
2
  "epoch": 20.0,
3
+ "train_loss": 0.428011018037796,
4
+ "train_runtime": 2530.9924,
5
  "train_samples": 2561,
6
+ "train_samples_per_second": 6.053,
7
+ "train_steps_per_second": 0.759
8
  }
trainer_state.json CHANGED
@@ -10,2482 +10,2482 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
- "learning_rate": 5.208333333333333e-09,
14
  "loss": 2.3378,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.05,
19
- "learning_rate": 2.6041666666666667e-08,
20
- "loss": 2.6781,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.1,
25
- "learning_rate": 5.208333333333333e-08,
26
- "loss": 2.6964,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.16,
31
- "learning_rate": 7.812499999999999e-08,
32
  "loss": 2.6688,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.21,
37
- "learning_rate": 1.0416666666666667e-07,
38
  "loss": 2.7272,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.26,
43
- "learning_rate": 1.3020833333333334e-07,
44
  "loss": 2.6971,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.31,
49
- "learning_rate": 1.5624999999999999e-07,
50
- "loss": 2.6381,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.36,
55
- "learning_rate": 1.8229166666666666e-07,
56
- "loss": 2.6255,
57
  "step": 35
58
  },
59
  {
60
  "epoch": 0.42,
61
- "learning_rate": 2.0833333333333333e-07,
62
- "loss": 2.6377,
63
  "step": 40
64
  },
65
  {
66
  "epoch": 0.47,
67
- "learning_rate": 2.3437499999999998e-07,
68
- "loss": 2.6106,
69
  "step": 45
70
  },
71
  {
72
  "epoch": 0.52,
73
- "learning_rate": 2.604166666666667e-07,
74
- "loss": 2.5925,
75
  "step": 50
76
  },
77
  {
78
  "epoch": 0.57,
79
- "learning_rate": 2.864583333333333e-07,
80
- "loss": 2.5784,
81
  "step": 55
82
  },
83
  {
84
  "epoch": 0.62,
85
- "learning_rate": 3.1249999999999997e-07,
86
- "loss": 2.6202,
87
  "step": 60
88
  },
89
  {
90
  "epoch": 0.68,
91
- "learning_rate": 3.3854166666666667e-07,
92
- "loss": 2.609,
93
  "step": 65
94
  },
95
  {
96
  "epoch": 0.73,
97
- "learning_rate": 3.645833333333333e-07,
98
- "loss": 2.6473,
99
  "step": 70
100
  },
101
  {
102
  "epoch": 0.78,
103
- "learning_rate": 3.9062499999999997e-07,
104
- "loss": 2.69,
105
  "step": 75
106
  },
107
  {
108
  "epoch": 0.83,
109
- "learning_rate": 4.1666666666666667e-07,
110
- "loss": 2.451,
111
  "step": 80
112
  },
113
  {
114
  "epoch": 0.89,
115
- "learning_rate": 4.427083333333333e-07,
116
- "loss": 2.6445,
117
  "step": 85
118
  },
119
  {
120
  "epoch": 0.94,
121
- "learning_rate": 4.6874999999999996e-07,
122
- "loss": 2.6294,
123
  "step": 90
124
  },
125
  {
126
  "epoch": 0.99,
127
- "learning_rate": 4.947916666666667e-07,
128
- "loss": 2.6842,
129
  "step": 95
130
  },
131
  {
132
  "epoch": 1.0,
133
- "eval_loss": 2.635899543762207,
134
- "eval_runtime": 173.5318,
135
- "eval_samples_per_second": 4.414,
136
  "eval_steps_per_second": 1.106,
137
  "step": 96
138
  },
139
  {
140
  "epoch": 1.04,
141
- "learning_rate": 5.208333333333334e-07,
142
- "loss": 2.7193,
143
  "step": 100
144
  },
145
  {
146
  "epoch": 1.09,
147
- "learning_rate": 5.46875e-07,
148
- "loss": 2.6211,
149
  "step": 105
150
  },
151
  {
152
  "epoch": 1.15,
153
- "learning_rate": 5.729166666666667e-07,
154
- "loss": 2.662,
155
  "step": 110
156
  },
157
  {
158
  "epoch": 1.2,
159
- "learning_rate": 5.989583333333334e-07,
160
- "loss": 2.7073,
161
  "step": 115
162
  },
163
  {
164
  "epoch": 1.25,
165
- "learning_rate": 6.249999999999999e-07,
166
- "loss": 2.7004,
167
  "step": 120
168
  },
169
  {
170
  "epoch": 1.3,
171
- "learning_rate": 6.510416666666666e-07,
172
- "loss": 2.5911,
173
  "step": 125
174
  },
175
  {
176
  "epoch": 1.35,
177
- "learning_rate": 6.770833333333333e-07,
178
- "loss": 2.7241,
179
  "step": 130
180
  },
181
  {
182
  "epoch": 1.41,
183
- "learning_rate": 7.031249999999999e-07,
184
- "loss": 2.6296,
185
  "step": 135
186
  },
187
  {
188
  "epoch": 1.46,
189
- "learning_rate": 7.291666666666666e-07,
190
- "loss": 2.6237,
191
  "step": 140
192
  },
193
  {
194
  "epoch": 1.51,
195
- "learning_rate": 7.552083333333333e-07,
196
- "loss": 2.5253,
197
  "step": 145
198
  },
199
  {
200
  "epoch": 1.56,
201
- "learning_rate": 7.812499999999999e-07,
202
- "loss": 2.6595,
203
  "step": 150
204
  },
205
  {
206
  "epoch": 1.61,
207
- "learning_rate": 8.072916666666666e-07,
208
- "loss": 2.6139,
209
  "step": 155
210
  },
211
  {
212
  "epoch": 1.67,
213
- "learning_rate": 8.333333333333333e-07,
214
- "loss": 2.5513,
215
  "step": 160
216
  },
217
  {
218
  "epoch": 1.72,
219
- "learning_rate": 8.593749999999999e-07,
220
- "loss": 2.6989,
221
  "step": 165
222
  },
223
  {
224
  "epoch": 1.77,
225
- "learning_rate": 8.854166666666666e-07,
226
- "loss": 2.7154,
227
  "step": 170
228
  },
229
  {
230
  "epoch": 1.82,
231
- "learning_rate": 9.114583333333333e-07,
232
- "loss": 2.6658,
233
  "step": 175
234
  },
235
  {
236
  "epoch": 1.88,
237
- "learning_rate": 9.374999999999999e-07,
238
- "loss": 2.5969,
239
  "step": 180
240
  },
241
  {
242
  "epoch": 1.93,
243
- "learning_rate": 9.635416666666665e-07,
244
- "loss": 2.4657,
245
  "step": 185
246
  },
247
  {
248
  "epoch": 1.98,
249
- "learning_rate": 9.895833333333333e-07,
250
- "loss": 2.6736,
251
  "step": 190
252
  },
253
  {
254
  "epoch": 2.0,
255
- "eval_loss": 2.6350057125091553,
256
- "eval_runtime": 173.8992,
257
- "eval_samples_per_second": 4.405,
258
- "eval_steps_per_second": 1.104,
259
  "step": 192
260
  },
261
  {
262
  "epoch": 2.03,
263
- "learning_rate": 9.999925630697108e-07,
264
- "loss": 2.5575,
265
  "step": 195
266
  },
267
  {
268
  "epoch": 2.08,
269
- "learning_rate": 9.999471159635538e-07,
270
- "loss": 2.6949,
271
  "step": 200
272
  },
273
  {
274
  "epoch": 2.14,
275
- "learning_rate": 9.998603571300202e-07,
276
- "loss": 2.6751,
277
  "step": 205
278
  },
279
  {
280
  "epoch": 2.19,
281
- "learning_rate": 9.997322937381827e-07,
282
- "loss": 2.5524,
283
  "step": 210
284
  },
285
  {
286
  "epoch": 2.24,
287
- "learning_rate": 9.995629363702008e-07,
288
- "loss": 2.756,
289
  "step": 215
290
  },
291
  {
292
  "epoch": 2.29,
293
- "learning_rate": 9.993522990204452e-07,
294
- "loss": 2.6097,
295
  "step": 220
296
  },
297
  {
298
  "epoch": 2.34,
299
- "learning_rate": 9.991003990943423e-07,
300
- "loss": 2.6432,
301
  "step": 225
302
  },
303
  {
304
  "epoch": 2.4,
305
- "learning_rate": 9.988072574069363e-07,
306
- "loss": 2.6423,
307
  "step": 230
308
  },
309
  {
310
  "epoch": 2.45,
311
- "learning_rate": 9.984728981811674e-07,
312
- "loss": 2.5638,
313
  "step": 235
314
  },
315
  {
316
  "epoch": 2.5,
317
- "learning_rate": 9.980973490458728e-07,
318
- "loss": 2.569,
319
  "step": 240
320
  },
321
  {
322
  "epoch": 2.55,
323
- "learning_rate": 9.976806410335014e-07,
324
- "loss": 2.646,
325
  "step": 245
326
  },
327
  {
328
  "epoch": 2.6,
329
- "learning_rate": 9.97222808577551e-07,
330
- "loss": 2.5513,
331
  "step": 250
332
  },
333
  {
334
  "epoch": 2.66,
335
- "learning_rate": 9.96723889509722e-07,
336
- "loss": 2.672,
337
  "step": 255
338
  },
339
  {
340
  "epoch": 2.71,
341
- "learning_rate": 9.961839250567922e-07,
342
- "loss": 2.6453,
343
  "step": 260
344
  },
345
  {
346
  "epoch": 2.76,
347
- "learning_rate": 9.956029598372091e-07,
348
- "loss": 2.6943,
349
  "step": 265
350
  },
351
  {
352
  "epoch": 2.81,
353
- "learning_rate": 9.949810418574038e-07,
354
- "loss": 2.6373,
355
  "step": 270
356
  },
357
  {
358
  "epoch": 2.86,
359
- "learning_rate": 9.94318222507824e-07,
360
- "loss": 2.6896,
361
  "step": 275
362
  },
363
  {
364
  "epoch": 2.92,
365
- "learning_rate": 9.93614556558687e-07,
366
- "loss": 2.6051,
367
  "step": 280
368
  },
369
  {
370
  "epoch": 2.97,
371
- "learning_rate": 9.928701021554545e-07,
372
- "loss": 2.7002,
373
  "step": 285
374
  },
375
  {
376
  "epoch": 3.0,
377
- "eval_loss": 2.6337058544158936,
378
- "eval_runtime": 173.7213,
379
  "eval_samples_per_second": 4.409,
380
  "eval_steps_per_second": 1.105,
381
  "step": 288
382
  },
383
  {
384
  "epoch": 3.02,
385
- "learning_rate": 9.920849208140276e-07,
386
- "loss": 2.5634,
387
  "step": 290
388
  },
389
  {
390
  "epoch": 3.07,
391
- "learning_rate": 9.912590774156637e-07,
392
- "loss": 2.6789,
393
  "step": 295
394
  },
395
  {
396
  "epoch": 3.12,
397
- "learning_rate": 9.90392640201615e-07,
398
- "loss": 2.662,
399
  "step": 300
400
  },
401
  {
402
  "epoch": 3.18,
403
- "learning_rate": 9.894856807674906e-07,
404
- "loss": 2.4622,
405
  "step": 305
406
  },
407
  {
408
  "epoch": 3.23,
409
- "learning_rate": 9.885382740573385e-07,
410
- "loss": 2.6982,
411
  "step": 310
412
  },
413
  {
414
  "epoch": 3.28,
415
- "learning_rate": 9.875504983574543e-07,
416
- "loss": 2.5761,
417
  "step": 315
418
  },
419
  {
420
  "epoch": 3.33,
421
- "learning_rate": 9.865224352899118e-07,
422
- "loss": 2.6113,
423
  "step": 320
424
  },
425
  {
426
  "epoch": 3.39,
427
- "learning_rate": 9.854541698058189e-07,
428
- "loss": 2.6054,
429
  "step": 325
430
  },
431
  {
432
  "epoch": 3.44,
433
- "learning_rate": 9.843457901782965e-07,
434
- "loss": 2.694,
435
  "step": 330
436
  },
437
  {
438
  "epoch": 3.49,
439
- "learning_rate": 9.83197387995186e-07,
440
- "loss": 2.5185,
441
  "step": 335
442
  },
443
  {
444
  "epoch": 3.54,
445
- "learning_rate": 9.820090581514797e-07,
446
- "loss": 2.6949,
447
  "step": 340
448
  },
449
  {
450
  "epoch": 3.59,
451
- "learning_rate": 9.80780898841481e-07,
452
- "loss": 2.5847,
453
  "step": 345
454
  },
455
  {
456
  "epoch": 3.65,
457
- "learning_rate": 9.795130115506885e-07,
458
- "loss": 2.6563,
459
  "step": 350
460
  },
461
  {
462
  "epoch": 3.7,
463
- "learning_rate": 9.78205501047412e-07,
464
- "loss": 2.5706,
465
  "step": 355
466
  },
467
  {
468
  "epoch": 3.75,
469
- "learning_rate": 9.768584753741134e-07,
470
- "loss": 2.5576,
471
  "step": 360
472
  },
473
  {
474
  "epoch": 3.8,
475
- "learning_rate": 9.754720458384806e-07,
476
- "loss": 2.6988,
477
  "step": 365
478
  },
479
  {
480
  "epoch": 3.85,
481
- "learning_rate": 9.740463270042287e-07,
482
- "loss": 2.6938,
483
  "step": 370
484
  },
485
  {
486
  "epoch": 3.91,
487
- "learning_rate": 9.72581436681634e-07,
488
- "loss": 2.683,
489
  "step": 375
490
  },
491
  {
492
  "epoch": 3.96,
493
- "learning_rate": 9.710774959177982e-07,
494
- "loss": 2.6785,
495
  "step": 380
496
  },
497
  {
498
  "epoch": 4.0,
499
- "eval_loss": 2.6322734355926514,
500
- "eval_runtime": 173.6398,
501
- "eval_samples_per_second": 4.411,
502
- "eval_steps_per_second": 1.106,
503
  "step": 384
504
  },
505
  {
506
  "epoch": 4.01,
507
- "learning_rate": 9.695346289866477e-07,
508
- "loss": 2.6639,
509
  "step": 385
510
  },
511
  {
512
  "epoch": 4.06,
513
- "learning_rate": 9.67952963378663e-07,
514
- "loss": 2.5853,
515
  "step": 390
516
  },
517
  {
518
  "epoch": 4.11,
519
- "learning_rate": 9.663326297903438e-07,
520
- "loss": 2.61,
521
  "step": 395
522
  },
523
  {
524
  "epoch": 4.17,
525
- "learning_rate": 9.64673762113411e-07,
526
- "loss": 2.68,
527
  "step": 400
528
  },
529
  {
530
  "epoch": 4.22,
531
- "learning_rate": 9.629764974237415e-07,
532
- "loss": 2.6955,
533
  "step": 405
534
  },
535
  {
536
  "epoch": 4.27,
537
- "learning_rate": 9.61240975970041e-07,
538
- "loss": 2.5925,
539
  "step": 410
540
  },
541
  {
542
  "epoch": 4.32,
543
- "learning_rate": 9.594673411622562e-07,
544
- "loss": 2.6925,
545
  "step": 415
546
  },
547
  {
548
  "epoch": 4.38,
549
- "learning_rate": 9.576557395597236e-07,
550
- "loss": 2.5325,
551
  "step": 420
552
  },
553
  {
554
  "epoch": 4.43,
555
- "learning_rate": 9.558063208590592e-07,
556
- "loss": 2.5972,
557
  "step": 425
558
  },
559
  {
560
  "epoch": 4.48,
561
- "learning_rate": 9.539192378817893e-07,
562
- "loss": 2.5576,
563
  "step": 430
564
  },
565
  {
566
  "epoch": 4.53,
567
- "learning_rate": 9.519946465617217e-07,
568
- "loss": 2.6058,
569
  "step": 435
570
  },
571
  {
572
  "epoch": 4.58,
573
- "learning_rate": 9.500327059320605e-07,
574
- "loss": 2.7449,
575
  "step": 440
576
  },
577
  {
578
  "epoch": 4.64,
579
- "learning_rate": 9.48033578112266e-07,
580
- "loss": 2.5301,
581
  "step": 445
582
  },
583
  {
584
  "epoch": 4.69,
585
- "learning_rate": 9.45997428294657e-07,
586
- "loss": 2.7286,
587
  "step": 450
588
  },
589
  {
590
  "epoch": 4.74,
591
- "learning_rate": 9.439244247307616e-07,
592
- "loss": 2.6089,
593
  "step": 455
594
  },
595
  {
596
  "epoch": 4.79,
597
- "learning_rate": 9.418147387174137e-07,
598
- "loss": 2.5838,
599
  "step": 460
600
  },
601
  {
602
  "epoch": 4.84,
603
- "learning_rate": 9.396685445825985e-07,
604
- "loss": 2.704,
605
  "step": 465
606
  },
607
  {
608
  "epoch": 4.9,
609
- "learning_rate": 9.374860196710473e-07,
610
- "loss": 2.6787,
611
  "step": 470
612
  },
613
  {
614
  "epoch": 4.95,
615
- "learning_rate": 9.352673443295834e-07,
616
- "loss": 2.6551,
617
  "step": 475
618
  },
619
  {
620
  "epoch": 5.0,
621
- "learning_rate": 9.330127018922193e-07,
622
- "loss": 2.6275,
623
  "step": 480
624
  },
625
  {
626
  "epoch": 5.0,
627
- "eval_loss": 2.6313512325286865,
628
- "eval_runtime": 173.7421,
629
- "eval_samples_per_second": 4.409,
630
- "eval_steps_per_second": 1.105,
631
  "step": 480
632
  },
633
  {
634
  "epoch": 5.05,
635
- "learning_rate": 9.307222786650078e-07,
636
- "loss": 2.6808,
637
  "step": 485
638
  },
639
  {
640
  "epoch": 5.1,
641
- "learning_rate": 9.283962639106464e-07,
642
- "loss": 2.6823,
643
  "step": 490
644
  },
645
  {
646
  "epoch": 5.16,
647
- "learning_rate": 9.260348498328393e-07,
648
- "loss": 2.5967,
649
  "step": 495
650
  },
651
  {
652
  "epoch": 5.21,
653
- "learning_rate": 9.236382315604139e-07,
654
- "loss": 2.5708,
655
  "step": 500
656
  },
657
  {
658
  "epoch": 5.26,
659
- "learning_rate": 9.212066071311977e-07,
660
- "loss": 2.6208,
661
  "step": 505
662
  },
663
  {
664
  "epoch": 5.31,
665
- "learning_rate": 9.187401774756539e-07,
666
- "loss": 2.7446,
667
  "step": 510
668
  },
669
  {
670
  "epoch": 5.36,
671
- "learning_rate": 9.162391464002774e-07,
672
- "loss": 2.6163,
673
  "step": 515
674
  },
675
  {
676
  "epoch": 5.42,
677
- "learning_rate": 9.137037205707551e-07,
678
- "loss": 2.5948,
679
  "step": 520
680
  },
681
  {
682
  "epoch": 5.47,
683
- "learning_rate": 9.111341094948875e-07,
684
- "loss": 2.5834,
685
  "step": 525
686
  },
687
  {
688
  "epoch": 5.52,
689
- "learning_rate": 9.085305255052769e-07,
690
- "loss": 2.6175,
691
  "step": 530
692
  },
693
  {
694
  "epoch": 5.57,
695
- "learning_rate": 9.058931837417821e-07,
696
- "loss": 2.6837,
697
  "step": 535
698
  },
699
  {
700
  "epoch": 5.62,
701
- "learning_rate": 9.032223021337413e-07,
702
- "loss": 2.5912,
703
  "step": 540
704
  },
705
  {
706
  "epoch": 5.68,
707
- "learning_rate": 9.00518101381963e-07,
708
- "loss": 2.6483,
709
  "step": 545
710
  },
711
  {
712
  "epoch": 5.73,
713
- "learning_rate": 8.977808049404899e-07,
714
- "loss": 2.6406,
715
  "step": 550
716
  },
717
  {
718
  "epoch": 5.78,
719
- "learning_rate": 8.950106389981345e-07,
720
- "loss": 2.6145,
721
  "step": 555
722
  },
723
  {
724
  "epoch": 5.83,
725
- "learning_rate": 8.922078324597878e-07,
726
- "loss": 2.669,
727
  "step": 560
728
  },
729
  {
730
  "epoch": 5.89,
731
- "learning_rate": 8.893726169275053e-07,
732
- "loss": 2.6943,
733
  "step": 565
734
  },
735
  {
736
  "epoch": 5.94,
737
- "learning_rate": 8.865052266813685e-07,
738
- "loss": 2.6667,
739
  "step": 570
740
  },
741
  {
742
  "epoch": 5.99,
743
- "learning_rate": 8.836058986601261e-07,
744
- "loss": 2.6281,
745
  "step": 575
746
  },
747
  {
748
  "epoch": 6.0,
749
- "eval_loss": 2.630817413330078,
750
- "eval_runtime": 173.8406,
751
- "eval_samples_per_second": 4.406,
752
- "eval_steps_per_second": 1.104,
753
  "step": 576
754
  },
755
  {
756
  "epoch": 6.04,
757
- "learning_rate": 8.806748724416156e-07,
758
- "loss": 2.5816,
759
  "step": 580
760
  },
761
  {
762
  "epoch": 6.09,
763
- "learning_rate": 8.777123902229657e-07,
764
- "loss": 2.7035,
765
  "step": 585
766
  },
767
  {
768
  "epoch": 6.15,
769
- "learning_rate": 8.747186968005836e-07,
770
- "loss": 2.6034,
771
  "step": 590
772
  },
773
  {
774
  "epoch": 6.2,
775
- "learning_rate": 8.71694039549927e-07,
776
- "loss": 2.6828,
777
  "step": 595
778
  },
779
  {
780
  "epoch": 6.25,
781
- "learning_rate": 8.68638668405062e-07,
782
- "loss": 2.486,
783
  "step": 600
784
  },
785
  {
786
  "epoch": 6.3,
787
- "learning_rate": 8.65552835838012e-07,
788
- "loss": 2.6201,
789
  "step": 605
790
  },
791
  {
792
  "epoch": 6.35,
793
- "learning_rate": 8.62436796837894e-07,
794
- "loss": 2.7257,
795
  "step": 610
796
  },
797
  {
798
  "epoch": 6.41,
799
- "learning_rate": 8.59290808889849e-07,
800
- "loss": 2.6278,
801
  "step": 615
802
  },
803
  {
804
  "epoch": 6.46,
805
- "learning_rate": 8.561151319537655e-07,
806
- "loss": 2.6432,
807
  "step": 620
808
  },
809
  {
810
  "epoch": 6.51,
811
- "learning_rate": 8.529100284427979e-07,
812
- "loss": 2.624,
813
  "step": 625
814
  },
815
  {
816
  "epoch": 6.56,
817
- "learning_rate": 8.496757632016836e-07,
818
- "loss": 2.7264,
819
  "step": 630
820
  },
821
  {
822
  "epoch": 6.61,
823
- "learning_rate": 8.464126034848568e-07,
824
- "loss": 2.5305,
825
  "step": 635
826
  },
827
  {
828
  "epoch": 6.67,
829
- "learning_rate": 8.431208189343669e-07,
830
- "loss": 2.5706,
831
  "step": 640
832
  },
833
  {
834
  "epoch": 6.72,
835
- "learning_rate": 8.398006815575947e-07,
836
- "loss": 2.6265,
837
  "step": 645
838
  },
839
  {
840
  "epoch": 6.77,
841
- "learning_rate": 8.364524657047787e-07,
842
- "loss": 2.6231,
843
  "step": 650
844
  },
845
  {
846
  "epoch": 6.82,
847
- "learning_rate": 8.330764480463426e-07,
848
- "loss": 2.587,
849
  "step": 655
850
  },
851
  {
852
  "epoch": 6.88,
853
- "learning_rate": 8.296729075500343e-07,
854
- "loss": 2.5851,
855
  "step": 660
856
  },
857
  {
858
  "epoch": 6.93,
859
- "learning_rate": 8.262421254578748e-07,
860
- "loss": 2.6026,
861
  "step": 665
862
  },
863
  {
864
  "epoch": 6.98,
865
- "learning_rate": 8.227843852629174e-07,
866
- "loss": 2.7613,
867
  "step": 670
868
  },
869
  {
870
  "epoch": 7.0,
871
- "eval_loss": 2.6305091381073,
872
- "eval_runtime": 173.7973,
873
- "eval_samples_per_second": 4.407,
874
- "eval_steps_per_second": 1.105,
875
  "step": 672
876
  },
877
  {
878
  "epoch": 7.03,
879
- "learning_rate": 8.192999726858226e-07,
880
- "loss": 2.6184,
881
  "step": 675
882
  },
883
  {
884
  "epoch": 7.08,
885
- "learning_rate": 8.157891756512487e-07,
886
- "loss": 2.6928,
887
  "step": 680
888
  },
889
  {
890
  "epoch": 7.14,
891
- "learning_rate": 8.122522842640595e-07,
892
- "loss": 2.6798,
893
  "step": 685
894
  },
895
  {
896
  "epoch": 7.19,
897
- "learning_rate": 8.086895907853525e-07,
898
- "loss": 2.629,
899
  "step": 690
900
  },
901
  {
902
  "epoch": 7.24,
903
- "learning_rate": 8.051013896083082e-07,
904
- "loss": 2.582,
905
  "step": 695
906
  },
907
  {
908
  "epoch": 7.29,
909
- "learning_rate": 8.014879772338647e-07,
910
- "loss": 2.5739,
911
  "step": 700
912
  },
913
  {
914
  "epoch": 7.34,
915
- "learning_rate": 7.978496522462167e-07,
916
- "loss": 2.5772,
917
  "step": 705
918
  },
919
  {
920
  "epoch": 7.4,
921
- "learning_rate": 7.941867152881422e-07,
922
- "loss": 2.8123,
923
  "step": 710
924
  },
925
  {
926
  "epoch": 7.45,
927
- "learning_rate": 7.904994690361611e-07,
928
- "loss": 2.5865,
929
  "step": 715
930
  },
931
  {
932
  "epoch": 7.5,
933
- "learning_rate": 7.86788218175523e-07,
934
- "loss": 2.6377,
935
  "step": 720
936
  },
937
  {
938
  "epoch": 7.55,
939
- "learning_rate": 7.830532693750313e-07,
940
- "loss": 2.5983,
941
  "step": 725
942
  },
943
  {
944
  "epoch": 7.6,
945
- "learning_rate": 7.792949312617022e-07,
946
- "loss": 2.5955,
947
  "step": 730
948
  },
949
  {
950
  "epoch": 7.66,
951
- "learning_rate": 7.75513514395262e-07,
952
- "loss": 2.5563,
953
  "step": 735
954
  },
955
  {
956
  "epoch": 7.71,
957
- "learning_rate": 7.717093312424849e-07,
958
- "loss": 2.6616,
959
  "step": 740
960
  },
961
  {
962
  "epoch": 7.76,
963
- "learning_rate": 7.678826961513738e-07,
964
- "loss": 2.5939,
965
  "step": 745
966
  },
967
  {
968
  "epoch": 7.81,
969
- "learning_rate": 7.640339253251839e-07,
970
- "loss": 2.6496,
971
  "step": 750
972
  },
973
  {
974
  "epoch": 7.86,
975
- "learning_rate": 7.601633367962954e-07,
976
- "loss": 2.618,
977
  "step": 755
978
  },
979
  {
980
  "epoch": 7.92,
981
- "learning_rate": 7.562712503999326e-07,
982
- "loss": 2.6591,
983
  "step": 760
984
  },
985
  {
986
  "epoch": 7.97,
987
- "learning_rate": 7.52357987747736e-07,
988
- "loss": 2.6652,
989
  "step": 765
990
  },
991
  {
992
  "epoch": 8.0,
993
- "eval_loss": 2.6302928924560547,
994
- "eval_runtime": 174.0132,
995
- "eval_samples_per_second": 4.402,
996
- "eval_steps_per_second": 1.103,
997
  "step": 768
998
  },
999
  {
1000
  "epoch": 8.02,
1001
- "learning_rate": 7.484238722011868e-07,
1002
- "loss": 2.6632,
1003
  "step": 770
1004
  },
1005
  {
1006
  "epoch": 8.07,
1007
- "learning_rate": 7.444692288448862e-07,
1008
- "loss": 2.5898,
1009
  "step": 775
1010
  },
1011
  {
1012
  "epoch": 8.12,
1013
- "learning_rate": 7.404943844596938e-07,
1014
- "loss": 2.5495,
1015
  "step": 780
1016
  },
1017
  {
1018
  "epoch": 8.18,
1019
- "learning_rate": 7.364996674957243e-07,
1020
- "loss": 2.647,
1021
  "step": 785
1022
  },
1023
  {
1024
  "epoch": 8.23,
1025
- "learning_rate": 7.32485408045207e-07,
1026
- "loss": 2.7532,
1027
  "step": 790
1028
  },
1029
  {
1030
  "epoch": 8.28,
1031
- "learning_rate": 7.284519378152103e-07,
1032
- "loss": 2.6293,
1033
  "step": 795
1034
  },
1035
  {
1036
  "epoch": 8.33,
1037
- "learning_rate": 7.243995901002311e-07,
1038
- "loss": 2.6042,
1039
  "step": 800
1040
  },
1041
  {
1042
  "epoch": 8.39,
1043
- "learning_rate": 7.203286997546543e-07,
1044
- "loss": 2.642,
1045
  "step": 805
1046
  },
1047
  {
1048
  "epoch": 8.44,
1049
- "learning_rate": 7.16239603165083e-07,
1050
- "loss": 2.6034,
1051
  "step": 810
1052
  },
1053
  {
1054
  "epoch": 8.49,
1055
- "learning_rate": 7.121326382225428e-07,
1056
- "loss": 2.6535,
1057
  "step": 815
1058
  },
1059
  {
1060
  "epoch": 8.54,
1061
- "learning_rate": 7.080081442945596e-07,
1062
- "loss": 2.6237,
1063
  "step": 820
1064
  },
1065
  {
1066
  "epoch": 8.59,
1067
- "learning_rate": 7.038664621971183e-07,
1068
- "loss": 2.6845,
1069
  "step": 825
1070
  },
1071
  {
1072
  "epoch": 8.65,
1073
- "learning_rate": 6.997079341665002e-07,
1074
- "loss": 2.6151,
1075
  "step": 830
1076
  },
1077
  {
1078
  "epoch": 8.7,
1079
- "learning_rate": 6.955329038310027e-07,
1080
- "loss": 2.563,
1081
  "step": 835
1082
  },
1083
  {
1084
  "epoch": 8.75,
1085
- "learning_rate": 6.913417161825449e-07,
1086
- "loss": 2.599,
1087
  "step": 840
1088
  },
1089
  {
1090
  "epoch": 8.8,
1091
- "learning_rate": 6.871347175481601e-07,
1092
- "loss": 2.5894,
1093
  "step": 845
1094
  },
1095
  {
1096
  "epoch": 8.85,
1097
- "learning_rate": 6.829122555613784e-07,
1098
- "loss": 2.6614,
1099
  "step": 850
1100
  },
1101
  {
1102
  "epoch": 8.91,
1103
- "learning_rate": 6.786746791335e-07,
1104
- "loss": 2.6603,
1105
  "step": 855
1106
  },
1107
  {
1108
  "epoch": 8.96,
1109
- "learning_rate": 6.744223384247654e-07,
1110
- "loss": 2.6029,
1111
  "step": 860
1112
  },
1113
  {
1114
  "epoch": 9.0,
1115
- "eval_loss": 2.630183219909668,
1116
- "eval_runtime": 173.9779,
1117
- "eval_samples_per_second": 4.403,
1118
- "eval_steps_per_second": 1.104,
1119
  "step": 864
1120
  },
1121
  {
1122
  "epoch": 9.01,
1123
- "learning_rate": 6.701555848154192e-07,
1124
- "loss": 2.6899,
1125
  "step": 865
1126
  },
1127
  {
1128
  "epoch": 9.06,
1129
- "learning_rate": 6.658747708766761e-07,
1130
- "loss": 2.6646,
1131
  "step": 870
1132
  },
1133
  {
1134
  "epoch": 9.11,
1135
- "learning_rate": 6.615802503415863e-07,
1136
- "loss": 2.5956,
1137
  "step": 875
1138
  },
1139
  {
1140
  "epoch": 9.17,
1141
- "learning_rate": 6.572723780758068e-07,
1142
- "loss": 2.5993,
1143
  "step": 880
1144
  },
1145
  {
1146
  "epoch": 9.22,
1147
- "learning_rate": 6.529515100482768e-07,
1148
- "loss": 2.6027,
1149
  "step": 885
1150
  },
1151
  {
1152
  "epoch": 9.27,
1153
- "learning_rate": 6.486180033018039e-07,
1154
- "loss": 2.5575,
1155
  "step": 890
1156
  },
1157
  {
1158
  "epoch": 9.32,
1159
- "learning_rate": 6.442722159235607e-07,
1160
- "loss": 2.6163,
1161
  "step": 895
1162
  },
1163
  {
1164
  "epoch": 9.38,
1165
- "learning_rate": 6.39914507015496e-07,
1166
- "loss": 2.6802,
1167
  "step": 900
1168
  },
1169
  {
1170
  "epoch": 9.43,
1171
- "learning_rate": 6.355452366646601e-07,
1172
- "loss": 2.5041,
1173
  "step": 905
1174
  },
1175
  {
1176
  "epoch": 9.48,
1177
- "learning_rate": 6.311647659134508e-07,
1178
- "loss": 2.7211,
1179
  "step": 910
1180
  },
1181
  {
1182
  "epoch": 9.53,
1183
- "learning_rate": 6.267734567297798e-07,
1184
- "loss": 2.6253,
1185
  "step": 915
1186
  },
1187
  {
1188
  "epoch": 9.58,
1189
- "learning_rate": 6.223716719771619e-07,
1190
- "loss": 2.6138,
1191
  "step": 920
1192
  },
1193
  {
1194
  "epoch": 9.64,
1195
- "learning_rate": 6.179597753847315e-07,
1196
- "loss": 2.6438,
1197
  "step": 925
1198
  },
1199
  {
1200
  "epoch": 9.69,
1201
- "learning_rate": 6.135381315171865e-07,
1202
- "loss": 2.6278,
1203
  "step": 930
1204
  },
1205
  {
1206
  "epoch": 9.74,
1207
- "learning_rate": 6.091071057446634e-07,
1208
- "loss": 2.5997,
1209
  "step": 935
1210
  },
1211
  {
1212
  "epoch": 9.79,
1213
- "learning_rate": 6.046670642125459e-07,
1214
- "loss": 2.71,
1215
  "step": 940
1216
  },
1217
  {
1218
  "epoch": 9.84,
1219
- "learning_rate": 6.002183738112102e-07,
1220
- "loss": 2.7388,
1221
  "step": 945
1222
  },
1223
  {
1224
  "epoch": 9.9,
1225
- "learning_rate": 5.957614021457072e-07,
1226
- "loss": 2.6013,
1227
  "step": 950
1228
  },
1229
  {
1230
  "epoch": 9.95,
1231
- "learning_rate": 5.912965175053866e-07,
1232
- "loss": 2.6036,
1233
  "step": 955
1234
  },
1235
  {
1236
  "epoch": 10.0,
1237
- "learning_rate": 5.868240888334652e-07,
1238
- "loss": 2.6551,
1239
  "step": 960
1240
  },
1241
  {
1242
  "epoch": 10.0,
1243
- "eval_loss": 2.6301300525665283,
1244
- "eval_runtime": 174.1155,
1245
- "eval_samples_per_second": 4.399,
1246
- "eval_steps_per_second": 1.103,
1247
  "step": 960
1248
  },
1249
  {
1250
  "epoch": 10.05,
1251
- "learning_rate": 5.823444856965393e-07,
1252
- "loss": 2.6629,
1253
  "step": 965
1254
  },
1255
  {
1256
  "epoch": 10.1,
1257
- "learning_rate": 5.778580782540469e-07,
1258
- "loss": 2.7471,
1259
  "step": 970
1260
  },
1261
  {
1262
  "epoch": 10.16,
1263
- "learning_rate": 5.733652372276809e-07,
1264
- "loss": 2.67,
1265
  "step": 975
1266
  },
1267
  {
1268
  "epoch": 10.21,
1269
- "learning_rate": 5.688663338707553e-07,
1270
- "loss": 2.6761,
1271
  "step": 980
1272
  },
1273
  {
1274
  "epoch": 10.26,
1275
- "learning_rate": 5.64361739937528e-07,
1276
- "loss": 2.6528,
1277
  "step": 985
1278
  },
1279
  {
1280
  "epoch": 10.31,
1281
- "learning_rate": 5.598518276524812e-07,
1282
- "loss": 2.7113,
1283
  "step": 990
1284
  },
1285
  {
1286
  "epoch": 10.36,
1287
- "learning_rate": 5.553369696795646e-07,
1288
- "loss": 2.4484,
1289
  "step": 995
1290
  },
1291
  {
1292
  "epoch": 10.42,
1293
- "learning_rate": 5.50817539091401e-07,
1294
- "loss": 2.7609,
1295
  "step": 1000
1296
  },
1297
  {
1298
  "epoch": 10.47,
1299
- "learning_rate": 5.462939093384578e-07,
1300
- "loss": 2.5864,
1301
  "step": 1005
1302
  },
1303
  {
1304
  "epoch": 10.52,
1305
- "learning_rate": 5.417664542181893e-07,
1306
- "loss": 2.5866,
1307
  "step": 1010
1308
  },
1309
  {
1310
  "epoch": 10.57,
1311
- "learning_rate": 5.372355478441483e-07,
1312
- "loss": 2.5708,
1313
  "step": 1015
1314
  },
1315
  {
1316
  "epoch": 10.62,
1317
- "learning_rate": 5.327015646150716e-07,
1318
- "loss": 2.5576,
1319
  "step": 1020
1320
  },
1321
  {
1322
  "epoch": 10.68,
1323
- "learning_rate": 5.281648791839438e-07,
1324
- "loss": 2.5516,
1325
  "step": 1025
1326
  },
1327
  {
1328
  "epoch": 10.73,
1329
- "learning_rate": 5.236258664270384e-07,
1330
- "loss": 2.6803,
1331
  "step": 1030
1332
  },
1333
  {
1334
  "epoch": 10.78,
1335
- "learning_rate": 5.190849014129408e-07,
1336
- "loss": 2.5449,
1337
  "step": 1035
1338
  },
1339
  {
1340
  "epoch": 10.83,
1341
- "learning_rate": 5.145423593715557e-07,
1342
- "loss": 2.6265,
1343
  "step": 1040
1344
  },
1345
  {
1346
  "epoch": 10.89,
1347
- "learning_rate": 5.09998615663101e-07,
1348
- "loss": 2.587,
1349
  "step": 1045
1350
  },
1351
  {
1352
  "epoch": 10.94,
1353
- "learning_rate": 5.054540457470911e-07,
1354
- "loss": 2.6855,
1355
  "step": 1050
1356
  },
1357
  {
1358
  "epoch": 10.99,
1359
- "learning_rate": 5.009090251513118e-07,
1360
- "loss": 2.6734,
1361
  "step": 1055
1362
  },
1363
  {
1364
  "epoch": 11.0,
1365
- "eval_loss": 2.630112409591675,
1366
- "eval_runtime": 173.1711,
1367
  "eval_samples_per_second": 4.423,
1368
  "eval_steps_per_second": 1.109,
1369
  "step": 1056
1370
  },
1371
  {
1372
  "epoch": 11.04,
1373
- "learning_rate": 4.963639294407892e-07,
1374
- "loss": 2.5494,
1375
  "step": 1060
1376
  },
1377
  {
1378
  "epoch": 11.09,
1379
- "learning_rate": 4.918191341867566e-07,
1380
- "loss": 2.713,
1381
  "step": 1065
1382
  },
1383
  {
1384
  "epoch": 11.15,
1385
- "learning_rate": 4.872750149356199e-07,
1386
- "loss": 2.6899,
1387
  "step": 1070
1388
  },
1389
  {
1390
  "epoch": 11.2,
1391
- "learning_rate": 4.827319471779254e-07,
1392
- "loss": 2.7445,
1393
  "step": 1075
1394
  },
1395
  {
1396
  "epoch": 11.25,
1397
- "learning_rate": 4.78190306317332e-07,
1398
- "loss": 2.5517,
1399
  "step": 1080
1400
  },
1401
  {
1402
  "epoch": 11.3,
1403
- "learning_rate": 4.7365046763959115e-07,
1404
- "loss": 2.5754,
1405
  "step": 1085
1406
  },
1407
  {
1408
  "epoch": 11.35,
1409
- "learning_rate": 4.6911280628153603e-07,
1410
- "loss": 2.6542,
1411
  "step": 1090
1412
  },
1413
  {
1414
  "epoch": 11.41,
1415
- "learning_rate": 4.6457769720008295e-07,
1416
- "loss": 2.6702,
1417
  "step": 1095
1418
  },
1419
  {
1420
  "epoch": 11.46,
1421
- "learning_rate": 4.600455151412481e-07,
1422
- "loss": 2.6873,
1423
  "step": 1100
1424
  },
1425
  {
1426
  "epoch": 11.51,
1427
- "learning_rate": 4.55516634609181e-07,
1428
- "loss": 2.6564,
1429
  "step": 1105
1430
  },
1431
  {
1432
  "epoch": 11.56,
1433
- "learning_rate": 4.5099142983521963e-07,
1434
- "loss": 2.534,
1435
  "step": 1110
1436
  },
1437
  {
1438
  "epoch": 11.61,
1439
- "learning_rate": 4.4647027474696536e-07,
1440
- "loss": 2.458,
1441
  "step": 1115
1442
  },
1443
  {
1444
  "epoch": 11.67,
1445
- "learning_rate": 4.419535429373848e-07,
1446
- "loss": 2.64,
1447
  "step": 1120
1448
  },
1449
  {
1450
  "epoch": 11.72,
1451
- "learning_rate": 4.3744160763394046e-07,
1452
- "loss": 2.663,
1453
  "step": 1125
1454
  },
1455
  {
1456
  "epoch": 11.77,
1457
- "learning_rate": 4.329348416677479e-07,
1458
- "loss": 2.4541,
1459
  "step": 1130
1460
  },
1461
  {
1462
  "epoch": 11.82,
1463
- "learning_rate": 4.284336174427696e-07,
1464
- "loss": 2.6241,
1465
  "step": 1135
1466
  },
1467
  {
1468
  "epoch": 11.88,
1469
- "learning_rate": 4.2393830690504165e-07,
1470
- "loss": 2.6692,
1471
  "step": 1140
1472
  },
1473
  {
1474
  "epoch": 11.93,
1475
- "learning_rate": 4.1944928151193925e-07,
1476
- "loss": 2.6779,
1477
  "step": 1145
1478
  },
1479
  {
1480
  "epoch": 11.98,
1481
- "learning_rate": 4.1496691220148223e-07,
1482
- "loss": 2.6927,
1483
  "step": 1150
1484
  },
1485
  {
1486
  "epoch": 12.0,
1487
- "eval_loss": 2.630101203918457,
1488
- "eval_runtime": 173.6048,
1489
- "eval_samples_per_second": 4.412,
1490
- "eval_steps_per_second": 1.106,
1491
  "step": 1152
1492
  },
1493
  {
1494
  "epoch": 12.03,
1495
- "learning_rate": 4.104915693616837e-07,
1496
- "loss": 2.7295,
1497
  "step": 1155
1498
  },
1499
  {
1500
  "epoch": 12.08,
1501
- "learning_rate": 4.0602362279994405e-07,
1502
- "loss": 2.6081,
1503
  "step": 1160
1504
  },
1505
  {
1506
  "epoch": 12.14,
1507
- "learning_rate": 4.015634417124932e-07,
1508
- "loss": 2.5461,
1509
  "step": 1165
1510
  },
1511
  {
1512
  "epoch": 12.19,
1513
- "learning_rate": 3.9711139465388256e-07,
1514
- "loss": 2.6491,
1515
  "step": 1170
1516
  },
1517
  {
1518
  "epoch": 12.24,
1519
- "learning_rate": 3.926678495065312e-07,
1520
- "loss": 2.663,
1521
  "step": 1175
1522
  },
1523
  {
1524
  "epoch": 12.29,
1525
- "learning_rate": 3.8823317345032623e-07,
1526
- "loss": 2.6422,
1527
  "step": 1180
1528
  },
1529
  {
1530
  "epoch": 12.34,
1531
- "learning_rate": 3.8380773293228276e-07,
1532
- "loss": 2.7531,
1533
  "step": 1185
1534
  },
1535
  {
1536
  "epoch": 12.4,
1537
- "learning_rate": 3.7939189363626277e-07,
1538
- "loss": 2.7157,
1539
  "step": 1190
1540
  },
1541
  {
1542
  "epoch": 12.45,
1543
- "learning_rate": 3.749860204527584e-07,
1544
- "loss": 2.6708,
1545
  "step": 1195
1546
  },
1547
  {
1548
  "epoch": 12.5,
1549
- "learning_rate": 3.7059047744873955e-07,
1550
- "loss": 2.6179,
1551
  "step": 1200
1552
  },
1553
  {
1554
  "epoch": 12.55,
1555
- "learning_rate": 3.662056278375716e-07,
1556
- "loss": 2.5415,
1557
  "step": 1205
1558
  },
1559
  {
1560
  "epoch": 12.6,
1561
- "learning_rate": 3.618318339490009e-07,
1562
- "loss": 2.6207,
1563
  "step": 1210
1564
  },
1565
  {
1566
  "epoch": 12.66,
1567
- "learning_rate": 3.574694571992147e-07,
1568
- "loss": 2.6337,
1569
  "step": 1215
1570
  },
1571
  {
1572
  "epoch": 12.71,
1573
- "learning_rate": 3.5311885806097776e-07,
1574
- "loss": 2.548,
1575
  "step": 1220
1576
  },
1577
  {
1578
  "epoch": 12.76,
1579
- "learning_rate": 3.4878039603384504e-07,
1580
- "loss": 2.7612,
1581
  "step": 1225
1582
  },
1583
  {
1584
  "epoch": 12.81,
1585
- "learning_rate": 3.444544296144546e-07,
1586
- "loss": 2.6004,
1587
  "step": 1230
1588
  },
1589
  {
1590
  "epoch": 12.86,
1591
- "learning_rate": 3.401413162669057e-07,
1592
- "loss": 2.6198,
1593
  "step": 1235
1594
  },
1595
  {
1596
  "epoch": 12.92,
1597
- "learning_rate": 3.358414123932195e-07,
1598
- "loss": 2.5703,
1599
  "step": 1240
1600
  },
1601
  {
1602
  "epoch": 12.97,
1603
- "learning_rate": 3.3155507330388996e-07,
1604
- "loss": 2.5663,
1605
  "step": 1245
1606
  },
1607
  {
1608
  "epoch": 13.0,
1609
- "eval_loss": 2.630080461502075,
1610
- "eval_runtime": 174.4281,
1611
- "eval_samples_per_second": 4.391,
1612
- "eval_steps_per_second": 1.101,
1613
  "step": 1248
1614
  },
1615
  {
1616
  "epoch": 13.02,
1617
- "learning_rate": 3.272826531885229e-07,
1618
- "loss": 2.5574,
1619
  "step": 1250
1620
  },
1621
  {
1622
  "epoch": 13.07,
1623
- "learning_rate": 3.2302450508656833e-07,
1624
- "loss": 2.7233,
1625
  "step": 1255
1626
  },
1627
  {
1628
  "epoch": 13.12,
1629
- "learning_rate": 3.187809808581492e-07,
1630
- "loss": 2.6756,
1631
  "step": 1260
1632
  },
1633
  {
1634
  "epoch": 13.18,
1635
- "learning_rate": 3.145524311549852e-07,
1636
- "loss": 2.5584,
1637
  "step": 1265
1638
  },
1639
  {
1640
  "epoch": 13.23,
1641
- "learning_rate": 3.1033920539141836e-07,
1642
- "loss": 2.6065,
1643
  "step": 1270
1644
  },
1645
  {
1646
  "epoch": 13.28,
1647
- "learning_rate": 3.061416517155396e-07,
1648
- "loss": 2.6843,
1649
  "step": 1275
1650
  },
1651
  {
1652
  "epoch": 13.33,
1653
- "learning_rate": 3.0196011698042156e-07,
1654
- "loss": 2.6118,
1655
  "step": 1280
1656
  },
1657
  {
1658
  "epoch": 13.39,
1659
- "learning_rate": 2.97794946715456e-07,
1660
- "loss": 2.6114,
1661
  "step": 1285
1662
  },
1663
  {
1664
  "epoch": 13.44,
1665
- "learning_rate": 2.9364648509780265e-07,
1666
- "loss": 2.628,
1667
  "step": 1290
1668
  },
1669
  {
1670
  "epoch": 13.49,
1671
- "learning_rate": 2.8951507492394935e-07,
1672
- "loss": 2.6265,
1673
  "step": 1295
1674
  },
1675
  {
1676
  "epoch": 13.54,
1677
- "learning_rate": 2.8540105758138555e-07,
1678
- "loss": 2.5731,
1679
  "step": 1300
1680
  },
1681
  {
1682
  "epoch": 13.59,
1683
- "learning_rate": 2.813047730203929e-07,
1684
- "loss": 2.5962,
1685
  "step": 1305
1686
  },
1687
  {
1688
  "epoch": 13.65,
1689
- "learning_rate": 2.772265597259543e-07,
1690
- "loss": 2.6739,
1691
  "step": 1310
1692
  },
1693
  {
1694
  "epoch": 13.7,
1695
- "learning_rate": 2.7316675468978444e-07,
1696
- "loss": 2.6158,
1697
  "step": 1315
1698
  },
1699
  {
1700
  "epoch": 13.75,
1701
- "learning_rate": 2.6912569338248315e-07,
1702
- "loss": 2.745,
1703
  "step": 1320
1704
  },
1705
  {
1706
  "epoch": 13.8,
1707
- "learning_rate": 2.651037097258145e-07,
1708
- "loss": 2.5617,
1709
  "step": 1325
1710
  },
1711
  {
1712
  "epoch": 13.85,
1713
- "learning_rate": 2.61101136065115e-07,
1714
- "loss": 2.6718,
1715
  "step": 1330
1716
  },
1717
  {
1718
  "epoch": 13.91,
1719
- "learning_rate": 2.571183031418299e-07,
1720
- "loss": 2.6849,
1721
  "step": 1335
1722
  },
1723
  {
1724
  "epoch": 13.96,
1725
- "learning_rate": 2.531555400661848e-07,
1726
- "loss": 2.5786,
1727
  "step": 1340
1728
  },
1729
  {
1730
  "epoch": 14.0,
1731
- "eval_loss": 2.6300783157348633,
1732
- "eval_runtime": 174.7054,
1733
- "eval_samples_per_second": 4.385,
1734
- "eval_steps_per_second": 1.099,
1735
  "step": 1344
1736
  },
1737
  {
1738
  "epoch": 14.01,
1739
- "learning_rate": 2.492131742899892e-07,
1740
- "loss": 2.6208,
1741
  "step": 1345
1742
  },
1743
  {
1744
  "epoch": 14.06,
1745
- "learning_rate": 2.452915315795791e-07,
1746
- "loss": 2.5469,
1747
  "step": 1350
1748
  },
1749
  {
1750
  "epoch": 14.11,
1751
- "learning_rate": 2.41390935988898e-07,
1752
- "loss": 2.5873,
1753
  "step": 1355
1754
  },
1755
  {
1756
  "epoch": 14.17,
1757
- "learning_rate": 2.3751170983271996e-07,
1758
- "loss": 2.6717,
1759
  "step": 1360
1760
  },
1761
  {
1762
  "epoch": 14.22,
1763
- "learning_rate": 2.3365417366001549e-07,
1764
- "loss": 2.7293,
1765
  "step": 1365
1766
  },
1767
  {
1768
  "epoch": 14.27,
1769
- "learning_rate": 2.2981864622746433e-07,
1770
- "loss": 2.7345,
1771
  "step": 1370
1772
  },
1773
  {
1774
  "epoch": 14.32,
1775
- "learning_rate": 2.2600544447311548e-07,
1776
- "loss": 2.5931,
1777
  "step": 1375
1778
  },
1779
  {
1780
  "epoch": 14.38,
1781
- "learning_rate": 2.2221488349019902e-07,
1782
- "loss": 2.7244,
1783
  "step": 1380
1784
  },
1785
  {
1786
  "epoch": 14.43,
1787
- "learning_rate": 2.1844727650108708e-07,
1788
- "loss": 2.5362,
1789
  "step": 1385
1790
  },
1791
  {
1792
  "epoch": 14.48,
1793
- "learning_rate": 2.1470293483141354e-07,
1794
- "loss": 2.6723,
1795
  "step": 1390
1796
  },
1797
  {
1798
  "epoch": 14.53,
1799
- "learning_rate": 2.1098216788434835e-07,
1800
- "loss": 2.6033,
1801
  "step": 1395
1802
  },
1803
  {
1804
  "epoch": 14.58,
1805
- "learning_rate": 2.0728528311502973e-07,
1806
- "loss": 2.6966,
1807
  "step": 1400
1808
  },
1809
  {
1810
  "epoch": 14.64,
1811
- "learning_rate": 2.036125860051594e-07,
1812
- "loss": 2.5426,
1813
  "step": 1405
1814
  },
1815
  {
1816
  "epoch": 14.69,
1817
- "learning_rate": 1.9996438003775957e-07,
1818
- "loss": 2.6649,
1819
  "step": 1410
1820
  },
1821
  {
1822
  "epoch": 14.74,
1823
- "learning_rate": 1.9634096667209577e-07,
1824
- "loss": 2.632,
1825
  "step": 1415
1826
  },
1827
  {
1828
  "epoch": 14.79,
1829
- "learning_rate": 1.9274264531876627e-07,
1830
- "loss": 2.6428,
1831
  "step": 1420
1832
  },
1833
  {
1834
  "epoch": 14.84,
1835
- "learning_rate": 1.891697133149614e-07,
1836
- "loss": 2.6066,
1837
  "step": 1425
1838
  },
1839
  {
1840
  "epoch": 14.9,
1841
- "learning_rate": 1.8562246589989367e-07,
1842
- "loss": 2.6354,
1843
  "step": 1430
1844
  },
1845
  {
1846
  "epoch": 14.95,
1847
- "learning_rate": 1.8210119619040204e-07,
1848
- "loss": 2.5518,
1849
  "step": 1435
1850
  },
1851
  {
1852
  "epoch": 15.0,
1853
- "learning_rate": 1.7860619515673032e-07,
1854
- "loss": 2.574,
1855
  "step": 1440
1856
  },
1857
  {
1858
  "epoch": 15.0,
1859
- "eval_loss": 2.6300854682922363,
1860
- "eval_runtime": 174.5983,
1861
  "eval_samples_per_second": 4.387,
1862
  "eval_steps_per_second": 1.1,
1863
  "step": 1440
1864
  },
1865
  {
1866
  "epoch": 15.05,
1867
- "learning_rate": 1.7513775159848398e-07,
1868
- "loss": 2.5838,
1869
  "step": 1445
1870
  },
1871
  {
1872
  "epoch": 15.1,
1873
- "learning_rate": 1.716961521207661e-07,
1874
- "loss": 2.6134,
1875
  "step": 1450
1876
  },
1877
  {
1878
  "epoch": 15.16,
1879
- "learning_rate": 1.682816811104945e-07,
1880
- "loss": 2.7366,
1881
  "step": 1455
1882
  },
1883
  {
1884
  "epoch": 15.21,
1885
- "learning_rate": 1.6489462071290212e-07,
1886
- "loss": 2.6152,
1887
  "step": 1460
1888
  },
1889
  {
1890
  "epoch": 15.26,
1891
- "learning_rate": 1.6153525080822288e-07,
1892
- "loss": 2.6113,
1893
  "step": 1465
1894
  },
1895
  {
1896
  "epoch": 15.31,
1897
- "learning_rate": 1.5820384898856433e-07,
1898
- "loss": 2.4771,
1899
  "step": 1470
1900
  },
1901
  {
1902
  "epoch": 15.36,
1903
- "learning_rate": 1.5490069053497019e-07,
1904
- "loss": 2.6627,
1905
  "step": 1475
1906
  },
1907
  {
1908
  "epoch": 15.42,
1909
- "learning_rate": 1.5162604839467265e-07,
1910
- "loss": 2.7011,
1911
  "step": 1480
1912
  },
1913
  {
1914
  "epoch": 15.47,
1915
- "learning_rate": 1.4838019315853796e-07,
1916
- "loss": 2.6672,
1917
  "step": 1485
1918
  },
1919
  {
1920
  "epoch": 15.52,
1921
- "learning_rate": 1.4516339303870762e-07,
1922
- "loss": 2.6236,
1923
  "step": 1490
1924
  },
1925
  {
1926
  "epoch": 15.57,
1927
- "learning_rate": 1.4197591384643547e-07,
1928
- "loss": 2.6338,
1929
  "step": 1495
1930
  },
1931
  {
1932
  "epoch": 15.62,
1933
- "learning_rate": 1.3881801897012224e-07,
1934
- "loss": 2.5546,
1935
  "step": 1500
1936
  },
1937
  {
1938
  "epoch": 15.68,
1939
- "learning_rate": 1.3568996935355194e-07,
1940
- "loss": 2.6324,
1941
  "step": 1505
1942
  },
1943
  {
1944
  "epoch": 15.73,
1945
- "learning_rate": 1.3259202347432908e-07,
1946
- "loss": 2.7599,
1947
  "step": 1510
1948
  },
1949
  {
1950
  "epoch": 15.78,
1951
- "learning_rate": 1.2952443732252054e-07,
1952
- "loss": 2.5799,
1953
  "step": 1515
1954
  },
1955
  {
1956
  "epoch": 15.83,
1957
- "learning_rate": 1.2648746437950208e-07,
1958
- "loss": 2.5397,
1959
  "step": 1520
1960
  },
1961
  {
1962
  "epoch": 15.89,
1963
- "learning_rate": 1.234813555970129e-07,
1964
- "loss": 2.6592,
1965
  "step": 1525
1966
  },
1967
  {
1968
  "epoch": 15.94,
1969
- "learning_rate": 1.2050635937641907e-07,
1970
- "loss": 2.6715,
1971
  "step": 1530
1972
  },
1973
  {
1974
  "epoch": 15.99,
1975
- "learning_rate": 1.1756272154818714e-07,
1976
- "loss": 2.6113,
1977
  "step": 1535
1978
  },
1979
  {
1980
  "epoch": 16.0,
1981
- "eval_loss": 2.630089044570923,
1982
- "eval_runtime": 174.5957,
1983
- "eval_samples_per_second": 4.387,
1984
- "eval_steps_per_second": 1.1,
1985
  "step": 1536
1986
  },
1987
  {
1988
  "epoch": 16.04,
1989
- "learning_rate": 1.1465068535157096e-07,
1990
- "loss": 2.6605,
1991
  "step": 1540
1992
  },
1993
  {
1994
  "epoch": 16.09,
1995
- "learning_rate": 1.1177049141451223e-07,
1996
- "loss": 2.6989,
1997
  "step": 1545
1998
  },
1999
  {
2000
  "epoch": 16.15,
2001
- "learning_rate": 1.0892237773375679e-07,
2002
- "loss": 2.6261,
2003
  "step": 1550
2004
  },
2005
  {
2006
  "epoch": 16.2,
2007
- "learning_rate": 1.061065796551886e-07,
2008
- "loss": 2.6557,
2009
  "step": 1555
2010
  },
2011
  {
2012
  "epoch": 16.25,
2013
- "learning_rate": 1.0332332985438247e-07,
2014
- "loss": 2.5237,
2015
  "step": 1560
2016
  },
2017
  {
2018
  "epoch": 16.3,
2019
- "learning_rate": 1.0057285831737738e-07,
2020
- "loss": 2.6447,
2021
  "step": 1565
2022
  },
2023
  {
2024
  "epoch": 16.35,
2025
- "learning_rate": 9.785539232167294e-08,
2026
- "loss": 2.6633,
2027
  "step": 1570
2028
  },
2029
  {
2030
  "epoch": 16.41,
2031
- "learning_rate": 9.517115641744794e-08,
2032
- "loss": 2.4559,
2033
  "step": 1575
2034
  },
2035
  {
2036
  "epoch": 16.46,
2037
- "learning_rate": 9.252037240900617e-08,
2038
- "loss": 2.664,
2039
  "step": 1580
2040
  },
2041
  {
2042
  "epoch": 16.51,
2043
- "learning_rate": 8.990325933644716e-08,
2044
- "loss": 2.6389,
2045
  "step": 1585
2046
  },
2047
  {
2048
  "epoch": 16.56,
2049
- "learning_rate": 8.732003345756812e-08,
2050
- "loss": 2.628,
2051
  "step": 1590
2052
  },
2053
  {
2054
  "epoch": 16.61,
2055
- "learning_rate": 8.477090822999239e-08,
2056
- "loss": 2.5724,
2057
  "step": 1595
2058
  },
2059
  {
2060
  "epoch": 16.67,
2061
- "learning_rate": 8.225609429353186e-08,
2062
- "loss": 2.7386,
2063
  "step": 1600
2064
  },
2065
  {
2066
  "epoch": 16.72,
2067
- "learning_rate": 7.97757994527809e-08,
2068
- "loss": 2.7362,
2069
  "step": 1605
2070
  },
2071
  {
2072
  "epoch": 16.77,
2073
- "learning_rate": 7.733022865994599e-08,
2074
- "loss": 2.5169,
2075
  "step": 1610
2076
  },
2077
  {
2078
  "epoch": 16.82,
2079
- "learning_rate": 7.491958399790826e-08,
2080
- "loss": 2.6433,
2081
  "step": 1615
2082
  },
2083
  {
2084
  "epoch": 16.88,
2085
- "learning_rate": 7.25440646635268e-08,
2086
- "loss": 2.5979,
2087
  "step": 1620
2088
  },
2089
  {
2090
  "epoch": 16.93,
2091
- "learning_rate": 7.02038669511773e-08,
2092
- "loss": 2.6365,
2093
  "step": 1625
2094
  },
2095
  {
2096
  "epoch": 16.98,
2097
- "learning_rate": 6.789918423653285e-08,
2098
- "loss": 2.6837,
2099
  "step": 1630
2100
  },
2101
  {
2102
  "epoch": 17.0,
2103
- "eval_loss": 2.6300783157348633,
2104
- "eval_runtime": 173.8235,
2105
- "eval_samples_per_second": 4.407,
2106
- "eval_steps_per_second": 1.105,
2107
  "step": 1632
2108
  },
2109
  {
2110
  "epoch": 17.03,
2111
- "learning_rate": 6.563020696058386e-08,
2112
- "loss": 2.7281,
2113
  "step": 1635
2114
  },
2115
  {
2116
  "epoch": 17.08,
2117
- "learning_rate": 6.339712261390212e-08,
2118
- "loss": 2.5881,
2119
  "step": 1640
2120
  },
2121
  {
2122
  "epoch": 17.14,
2123
- "learning_rate": 6.120011572114802e-08,
2124
- "loss": 2.7231,
2125
  "step": 1645
2126
  },
2127
  {
2128
  "epoch": 17.19,
2129
- "learning_rate": 5.9039367825822526e-08,
2130
- "loss": 2.5787,
2131
  "step": 1650
2132
  },
2133
  {
2134
  "epoch": 17.24,
2135
- "learning_rate": 5.6915057475266314e-08,
2136
- "loss": 2.6824,
2137
  "step": 1655
2138
  },
2139
  {
2140
  "epoch": 17.29,
2141
- "learning_rate": 5.4827360205905505e-08,
2142
- "loss": 2.6002,
2143
  "step": 1660
2144
  },
2145
  {
2146
  "epoch": 17.34,
2147
- "learning_rate": 5.2776448528747406e-08,
2148
- "loss": 2.6811,
2149
  "step": 1665
2150
  },
2151
  {
2152
  "epoch": 17.4,
2153
- "learning_rate": 5.0762491915124605e-08,
2154
- "loss": 2.6052,
2155
  "step": 1670
2156
  },
2157
  {
2158
  "epoch": 17.45,
2159
- "learning_rate": 4.8785656782692034e-08,
2160
- "loss": 2.4643,
2161
  "step": 1675
2162
  },
2163
  {
2164
  "epoch": 17.5,
2165
- "learning_rate": 4.684610648167503e-08,
2166
- "loss": 2.6742,
2167
  "step": 1680
2168
  },
2169
  {
2170
  "epoch": 17.55,
2171
- "learning_rate": 4.4944001281371433e-08,
2172
- "loss": 2.6602,
2173
  "step": 1685
2174
  },
2175
  {
2176
  "epoch": 17.6,
2177
- "learning_rate": 4.307949835690844e-08,
2178
- "loss": 2.6529,
2179
  "step": 1690
2180
  },
2181
  {
2182
  "epoch": 17.66,
2183
- "learning_rate": 4.125275177625437e-08,
2184
- "loss": 2.7053,
2185
  "step": 1695
2186
  },
2187
  {
2188
  "epoch": 17.71,
2189
- "learning_rate": 3.946391248748821e-08,
2190
- "loss": 2.6967,
2191
  "step": 1700
2192
  },
2193
  {
2194
  "epoch": 17.76,
2195
- "learning_rate": 3.771312830632628e-08,
2196
- "loss": 2.5528,
2197
  "step": 1705
2198
  },
2199
  {
2200
  "epoch": 17.81,
2201
- "learning_rate": 3.600054390390778e-08,
2202
- "loss": 2.5321,
2203
  "step": 1710
2204
  },
2205
  {
2206
  "epoch": 17.86,
2207
- "learning_rate": 3.432630079484017e-08,
2208
- "loss": 2.6202,
2209
  "step": 1715
2210
  },
2211
  {
2212
  "epoch": 17.92,
2213
- "learning_rate": 3.269053732550581e-08,
2214
- "loss": 2.6669,
2215
  "step": 1720
2216
  },
2217
  {
2218
  "epoch": 17.97,
2219
- "learning_rate": 3.109338866263017e-08,
2220
- "loss": 2.5966,
2221
  "step": 1725
2222
  },
2223
  {
2224
  "epoch": 18.0,
2225
- "eval_loss": 2.6300864219665527,
2226
- "eval_runtime": 173.981,
2227
- "eval_samples_per_second": 4.403,
2228
- "eval_steps_per_second": 1.104,
2229
  "step": 1728
2230
  },
2231
  {
2232
  "epoch": 18.02,
2233
- "learning_rate": 2.9534986782112304e-08,
2234
- "loss": 2.675,
2235
  "step": 1730
2236
  },
2237
  {
2238
  "epoch": 18.07,
2239
- "learning_rate": 2.80154604581197e-08,
2240
- "loss": 2.6138,
2241
  "step": 1735
2242
  },
2243
  {
2244
  "epoch": 18.12,
2245
- "learning_rate": 2.653493525244721e-08,
2246
- "loss": 2.7089,
2247
  "step": 1740
2248
  },
2249
  {
2250
  "epoch": 18.18,
2251
- "learning_rate": 2.5093533504141784e-08,
2252
- "loss": 2.6545,
2253
  "step": 1745
2254
  },
2255
  {
2256
  "epoch": 18.23,
2257
- "learning_rate": 2.3691374319393165e-08,
2258
- "loss": 2.6073,
2259
  "step": 1750
2260
  },
2261
  {
2262
  "epoch": 18.28,
2263
- "learning_rate": 2.232857356169199e-08,
2264
- "loss": 2.6539,
2265
  "step": 1755
2266
  },
2267
  {
2268
  "epoch": 18.33,
2269
- "learning_rate": 2.100524384225555e-08,
2270
- "loss": 2.6538,
2271
  "step": 1760
2272
  },
2273
  {
2274
  "epoch": 18.39,
2275
- "learning_rate": 1.972149451072297e-08,
2276
- "loss": 2.6003,
2277
  "step": 1765
2278
  },
2279
  {
2280
  "epoch": 18.44,
2281
- "learning_rate": 1.8477431646118647e-08,
2282
- "loss": 2.7242,
2283
  "step": 1770
2284
  },
2285
  {
2286
  "epoch": 18.49,
2287
- "learning_rate": 1.7273158048087433e-08,
2288
- "loss": 2.6556,
2289
  "step": 1775
2290
  },
2291
  {
2292
  "epoch": 18.54,
2293
- "learning_rate": 1.6108773228399542e-08,
2294
- "loss": 2.5692,
2295
  "step": 1780
2296
  },
2297
  {
2298
  "epoch": 18.59,
2299
- "learning_rate": 1.4984373402728012e-08,
2300
- "loss": 2.6181,
2301
  "step": 1785
2302
  },
2303
  {
2304
  "epoch": 18.65,
2305
- "learning_rate": 1.3900051482698072e-08,
2306
- "loss": 2.6276,
2307
  "step": 1790
2308
  },
2309
  {
2310
  "epoch": 18.7,
2311
- "learning_rate": 1.2855897068209553e-08,
2312
- "loss": 2.6578,
2313
  "step": 1795
2314
  },
2315
  {
2316
  "epoch": 18.75,
2317
- "learning_rate": 1.1851996440033318e-08,
2318
- "loss": 2.5378,
2319
  "step": 1800
2320
  },
2321
  {
2322
  "epoch": 18.8,
2323
- "learning_rate": 1.0888432552681403e-08,
2324
- "loss": 2.7221,
2325
  "step": 1805
2326
  },
2327
  {
2328
  "epoch": 18.85,
2329
- "learning_rate": 9.96528502755245e-09,
2330
- "loss": 2.586,
2331
  "step": 1810
2332
  },
2333
  {
2334
  "epoch": 18.91,
2335
- "learning_rate": 9.082630146352355e-09,
2336
- "loss": 2.5015,
2337
  "step": 1815
2338
  },
2339
  {
2340
  "epoch": 18.96,
2341
- "learning_rate": 8.240540844791144e-09,
2342
- "loss": 2.5931,
2343
  "step": 1820
2344
  },
2345
  {
2346
  "epoch": 19.0,
2347
- "eval_loss": 2.6300864219665527,
2348
- "eval_runtime": 173.6034,
2349
- "eval_samples_per_second": 4.412,
2350
- "eval_steps_per_second": 1.106,
2351
  "step": 1824
2352
  },
2353
  {
2354
  "epoch": 19.01,
2355
- "learning_rate": 7.439086706555741e-09,
2356
- "loss": 2.6478,
2357
  "step": 1825
2358
  },
2359
  {
2360
  "epoch": 19.06,
2361
- "learning_rate": 6.678333957560511e-09,
2362
- "loss": 2.6916,
2363
  "step": 1830
2364
  },
2365
  {
2366
  "epoch": 19.11,
2367
- "learning_rate": 5.958345460474634e-09,
2368
- "loss": 2.5967,
2369
  "step": 1835
2370
  },
2371
  {
2372
  "epoch": 19.17,
2373
- "learning_rate": 5.279180709527764e-09,
2374
- "loss": 2.6543,
2375
  "step": 1840
2376
  },
2377
  {
2378
  "epoch": 19.22,
2379
- "learning_rate": 4.640895825593683e-09,
2380
- "loss": 2.6649,
2381
  "step": 1845
2382
  },
2383
  {
2384
  "epoch": 19.27,
2385
- "learning_rate": 4.04354355155323e-09,
2386
- "loss": 2.6998,
2387
  "step": 1850
2388
  },
2389
  {
2390
  "epoch": 19.32,
2391
- "learning_rate": 3.4871732479356263e-09,
2392
- "loss": 2.6116,
2393
  "step": 1855
2394
  },
2395
  {
2396
  "epoch": 19.38,
2397
- "learning_rate": 2.9718308888401767e-09,
2398
- "loss": 2.6608,
2399
  "step": 1860
2400
  },
2401
  {
2402
  "epoch": 19.43,
2403
- "learning_rate": 2.4975590581369777e-09,
2404
- "loss": 2.7199,
2405
  "step": 1865
2406
  },
2407
  {
2408
  "epoch": 19.48,
2409
- "learning_rate": 2.0643969459482323e-09,
2410
- "loss": 2.6137,
2411
  "step": 1870
2412
  },
2413
  {
2414
  "epoch": 19.53,
2415
- "learning_rate": 1.6723803454098407e-09,
2416
- "loss": 2.6616,
2417
  "step": 1875
2418
  },
2419
  {
2420
  "epoch": 19.58,
2421
- "learning_rate": 1.3215416497138754e-09,
2422
- "loss": 2.6161,
2423
  "step": 1880
2424
  },
2425
  {
2426
  "epoch": 19.64,
2427
- "learning_rate": 1.011909849431669e-09,
2428
- "loss": 2.5045,
2429
  "step": 1885
2430
  },
2431
  {
2432
  "epoch": 19.69,
2433
- "learning_rate": 7.435105301184519e-10,
2434
- "loss": 2.5974,
2435
  "step": 1890
2436
  },
2437
  {
2438
  "epoch": 19.74,
2439
- "learning_rate": 5.163658701989315e-10,
2440
- "loss": 2.5197,
2441
  "step": 1895
2442
  },
2443
  {
2444
  "epoch": 19.79,
2445
- "learning_rate": 3.3049463913498167e-10,
2446
- "loss": 2.6404,
2447
  "step": 1900
2448
  },
2449
  {
2450
  "epoch": 19.84,
2451
- "learning_rate": 1.859121958741605e-10,
2452
- "loss": 2.5237,
2453
  "step": 1905
2454
  },
2455
  {
2456
  "epoch": 19.9,
2457
- "learning_rate": 8.26304875812256e-11,
2458
- "loss": 2.6946,
2459
  "step": 1910
2460
  },
2461
  {
2462
  "epoch": 19.95,
2463
- "learning_rate": 2.065804865025722e-11,
2464
- "loss": 2.6112,
2465
  "step": 1915
2466
  },
2467
  {
2468
  "epoch": 20.0,
2469
  "learning_rate": 0.0,
2470
- "loss": 2.6933,
2471
  "step": 1920
2472
  },
2473
  {
2474
  "epoch": 20.0,
2475
- "eval_loss": 2.6300840377807617,
2476
- "eval_runtime": 173.2911,
2477
- "eval_samples_per_second": 4.42,
2478
- "eval_steps_per_second": 1.108,
2479
  "step": 1920
2480
  },
2481
  {
2482
  "epoch": 20.0,
2483
  "step": 1920,
2484
  "total_flos": 1.0984887148766822e+18,
2485
- "train_loss": 1.8769829372564952,
2486
- "train_runtime": 10573.7699,
2487
- "train_samples_per_second": 1.449,
2488
- "train_steps_per_second": 0.182
2489
  }
2490
  ],
2491
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.01,
13
+ "learning_rate": 5.208333333333333e-08,
14
  "loss": 2.3378,
15
  "step": 1
16
  },
17
  {
18
  "epoch": 0.05,
19
+ "learning_rate": 2.604166666666667e-07,
20
+ "loss": 2.6783,
21
  "step": 5
22
  },
23
  {
24
  "epoch": 0.1,
25
+ "learning_rate": 5.208333333333334e-07,
26
+ "loss": 2.6965,
27
  "step": 10
28
  },
29
  {
30
  "epoch": 0.16,
31
+ "learning_rate": 7.8125e-07,
32
  "loss": 2.6688,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.21,
37
+ "learning_rate": 1.0416666666666667e-06,
38
  "loss": 2.7272,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.26,
43
+ "learning_rate": 1.3020833333333335e-06,
44
  "loss": 2.6971,
45
  "step": 25
46
  },
47
  {
48
  "epoch": 0.31,
49
+ "learning_rate": 1.5625e-06,
50
+ "loss": 2.638,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.36,
55
+ "learning_rate": 1.8229166666666666e-06,
56
+ "loss": 2.6253,
57
  "step": 35
58
  },
59
  {
60
  "epoch": 0.42,
61
+ "learning_rate": 2.0833333333333334e-06,
62
+ "loss": 2.6373,
63
  "step": 40
64
  },
65
  {
66
  "epoch": 0.47,
67
+ "learning_rate": 2.3437500000000002e-06,
68
+ "loss": 2.6102,
69
  "step": 45
70
  },
71
  {
72
  "epoch": 0.52,
73
+ "learning_rate": 2.604166666666667e-06,
74
+ "loss": 2.592,
75
  "step": 50
76
  },
77
  {
78
  "epoch": 0.57,
79
+ "learning_rate": 2.8645833333333334e-06,
80
+ "loss": 2.5777,
81
  "step": 55
82
  },
83
  {
84
  "epoch": 0.62,
85
+ "learning_rate": 3.125e-06,
86
+ "loss": 2.6194,
87
  "step": 60
88
  },
89
  {
90
  "epoch": 0.68,
91
+ "learning_rate": 3.385416666666667e-06,
92
+ "loss": 2.608,
93
  "step": 65
94
  },
95
  {
96
  "epoch": 0.73,
97
+ "learning_rate": 3.6458333333333333e-06,
98
+ "loss": 2.6459,
99
  "step": 70
100
  },
101
  {
102
  "epoch": 0.78,
103
+ "learning_rate": 3.90625e-06,
104
+ "loss": 2.6886,
105
  "step": 75
106
  },
107
  {
108
  "epoch": 0.83,
109
+ "learning_rate": 4.166666666666667e-06,
110
+ "loss": 2.4494,
111
  "step": 80
112
  },
113
  {
114
  "epoch": 0.89,
115
+ "learning_rate": 4.427083333333334e-06,
116
+ "loss": 2.6428,
117
  "step": 85
118
  },
119
  {
120
  "epoch": 0.94,
121
+ "learning_rate": 4.6875000000000004e-06,
122
+ "loss": 2.6275,
123
  "step": 90
124
  },
125
  {
126
  "epoch": 0.99,
127
+ "learning_rate": 4.947916666666667e-06,
128
+ "loss": 2.6816,
129
  "step": 95
130
  },
131
  {
132
  "epoch": 1.0,
133
+ "eval_loss": 2.6328012943267822,
134
+ "eval_runtime": 173.5828,
135
+ "eval_samples_per_second": 4.413,
136
  "eval_steps_per_second": 1.106,
137
  "step": 96
138
  },
139
  {
140
  "epoch": 1.04,
141
+ "learning_rate": 5.208333333333334e-06,
142
+ "loss": 2.716,
143
  "step": 100
144
  },
145
  {
146
  "epoch": 1.09,
147
+ "learning_rate": 5.468750000000001e-06,
148
+ "loss": 2.6176,
149
  "step": 105
150
  },
151
  {
152
  "epoch": 1.15,
153
+ "learning_rate": 5.729166666666667e-06,
154
+ "loss": 2.6579,
155
  "step": 110
156
  },
157
  {
158
  "epoch": 1.2,
159
+ "learning_rate": 5.989583333333334e-06,
160
+ "loss": 2.7026,
161
  "step": 115
162
  },
163
  {
164
  "epoch": 1.25,
165
+ "learning_rate": 6.25e-06,
166
+ "loss": 2.6952,
167
  "step": 120
168
  },
169
  {
170
  "epoch": 1.3,
171
+ "learning_rate": 6.510416666666667e-06,
172
+ "loss": 2.5857,
173
  "step": 125
174
  },
175
  {
176
  "epoch": 1.35,
177
+ "learning_rate": 6.770833333333334e-06,
178
+ "loss": 2.718,
179
  "step": 130
180
  },
181
  {
182
  "epoch": 1.41,
183
+ "learning_rate": 7.031250000000001e-06,
184
+ "loss": 2.6229,
185
  "step": 135
186
  },
187
  {
188
  "epoch": 1.46,
189
+ "learning_rate": 7.291666666666667e-06,
190
+ "loss": 2.6168,
191
  "step": 140
192
  },
193
  {
194
  "epoch": 1.51,
195
+ "learning_rate": 7.552083333333334e-06,
196
+ "loss": 2.5181,
197
  "step": 145
198
  },
199
  {
200
  "epoch": 1.56,
201
+ "learning_rate": 7.8125e-06,
202
+ "loss": 2.6514,
203
  "step": 150
204
  },
205
  {
206
  "epoch": 1.61,
207
+ "learning_rate": 8.072916666666667e-06,
208
+ "loss": 2.6047,
209
  "step": 155
210
  },
211
  {
212
  "epoch": 1.67,
213
+ "learning_rate": 8.333333333333334e-06,
214
+ "loss": 2.5415,
215
  "step": 160
216
  },
217
  {
218
  "epoch": 1.72,
219
+ "learning_rate": 8.59375e-06,
220
+ "loss": 2.6892,
221
  "step": 165
222
  },
223
  {
224
  "epoch": 1.77,
225
+ "learning_rate": 8.854166666666667e-06,
226
+ "loss": 2.7038,
227
  "step": 170
228
  },
229
  {
230
  "epoch": 1.82,
231
+ "learning_rate": 9.114583333333334e-06,
232
+ "loss": 2.6528,
233
  "step": 175
234
  },
235
  {
236
  "epoch": 1.88,
237
+ "learning_rate": 9.375000000000001e-06,
238
+ "loss": 2.5846,
239
  "step": 180
240
  },
241
  {
242
  "epoch": 1.93,
243
+ "learning_rate": 9.635416666666668e-06,
244
+ "loss": 2.4526,
245
  "step": 185
246
  },
247
  {
248
  "epoch": 1.98,
249
+ "learning_rate": 9.895833333333334e-06,
250
+ "loss": 2.6582,
251
  "step": 190
252
  },
253
  {
254
  "epoch": 2.0,
255
+ "eval_loss": 2.6169474124908447,
256
+ "eval_runtime": 173.6466,
257
+ "eval_samples_per_second": 4.411,
258
+ "eval_steps_per_second": 1.106,
259
  "step": 192
260
  },
261
  {
262
  "epoch": 2.03,
263
+ "learning_rate": 9.99992563069711e-06,
264
+ "loss": 2.5407,
265
  "step": 195
266
  },
267
  {
268
  "epoch": 2.08,
269
+ "learning_rate": 9.999471159635538e-06,
270
+ "loss": 2.6746,
271
  "step": 200
272
  },
273
  {
274
  "epoch": 2.14,
275
+ "learning_rate": 9.998603571300204e-06,
276
+ "loss": 2.653,
277
  "step": 205
278
  },
279
  {
280
  "epoch": 2.19,
281
+ "learning_rate": 9.997322937381829e-06,
282
+ "loss": 2.5313,
283
  "step": 210
284
  },
285
  {
286
  "epoch": 2.24,
287
+ "learning_rate": 9.995629363702008e-06,
288
+ "loss": 2.7327,
289
  "step": 215
290
  },
291
  {
292
  "epoch": 2.29,
293
+ "learning_rate": 9.993522990204453e-06,
294
+ "loss": 2.5849,
295
  "step": 220
296
  },
297
  {
298
  "epoch": 2.34,
299
+ "learning_rate": 9.991003990943424e-06,
300
+ "loss": 2.6192,
301
  "step": 225
302
  },
303
  {
304
  "epoch": 2.4,
305
+ "learning_rate": 9.988072574069363e-06,
306
+ "loss": 2.6161,
307
  "step": 230
308
  },
309
  {
310
  "epoch": 2.45,
311
+ "learning_rate": 9.984728981811676e-06,
312
+ "loss": 2.536,
313
  "step": 235
314
  },
315
  {
316
  "epoch": 2.5,
317
+ "learning_rate": 9.980973490458728e-06,
318
+ "loss": 2.5393,
319
  "step": 240
320
  },
321
  {
322
  "epoch": 2.55,
323
+ "learning_rate": 9.976806410335015e-06,
324
+ "loss": 2.6175,
325
  "step": 245
326
  },
327
  {
328
  "epoch": 2.6,
329
+ "learning_rate": 9.972228085775512e-06,
330
+ "loss": 2.5245,
331
  "step": 250
332
  },
333
  {
334
  "epoch": 2.66,
335
+ "learning_rate": 9.967238895097223e-06,
336
+ "loss": 2.6436,
337
  "step": 255
338
  },
339
  {
340
  "epoch": 2.71,
341
+ "learning_rate": 9.961839250567925e-06,
342
+ "loss": 2.6141,
343
  "step": 260
344
  },
345
  {
346
  "epoch": 2.76,
347
+ "learning_rate": 9.956029598372092e-06,
348
+ "loss": 2.6642,
349
  "step": 265
350
  },
351
  {
352
  "epoch": 2.81,
353
+ "learning_rate": 9.94981041857404e-06,
354
+ "loss": 2.6052,
355
  "step": 270
356
  },
357
  {
358
  "epoch": 2.86,
359
+ "learning_rate": 9.943182225078242e-06,
360
+ "loss": 2.6589,
361
  "step": 275
362
  },
363
  {
364
  "epoch": 2.92,
365
+ "learning_rate": 9.936145565586871e-06,
366
+ "loss": 2.5745,
367
  "step": 280
368
  },
369
  {
370
  "epoch": 2.97,
371
+ "learning_rate": 9.928701021554545e-06,
372
+ "loss": 2.6676,
373
  "step": 285
374
  },
375
  {
376
  "epoch": 3.0,
377
+ "eval_loss": 2.598314046859741,
378
+ "eval_runtime": 173.726,
379
  "eval_samples_per_second": 4.409,
380
  "eval_steps_per_second": 1.105,
381
  "step": 288
382
  },
383
  {
384
  "epoch": 3.02,
385
+ "learning_rate": 9.920849208140277e-06,
386
+ "loss": 2.5292,
387
  "step": 290
388
  },
389
  {
390
  "epoch": 3.07,
391
+ "learning_rate": 9.912590774156638e-06,
392
+ "loss": 2.6445,
393
  "step": 295
394
  },
395
  {
396
  "epoch": 3.12,
397
+ "learning_rate": 9.903926402016153e-06,
398
+ "loss": 2.6219,
399
  "step": 300
400
  },
401
  {
402
  "epoch": 3.18,
403
+ "learning_rate": 9.894856807674908e-06,
404
+ "loss": 2.4258,
405
  "step": 305
406
  },
407
  {
408
  "epoch": 3.23,
409
+ "learning_rate": 9.885382740573385e-06,
410
+ "loss": 2.6602,
411
  "step": 310
412
  },
413
  {
414
  "epoch": 3.28,
415
+ "learning_rate": 9.875504983574545e-06,
416
+ "loss": 2.539,
417
  "step": 315
418
  },
419
  {
420
  "epoch": 3.33,
421
+ "learning_rate": 9.86522435289912e-06,
422
+ "loss": 2.5752,
423
  "step": 320
424
  },
425
  {
426
  "epoch": 3.39,
427
+ "learning_rate": 9.85454169805819e-06,
428
+ "loss": 2.5685,
429
  "step": 325
430
  },
431
  {
432
  "epoch": 3.44,
433
+ "learning_rate": 9.843457901782967e-06,
434
+ "loss": 2.6553,
435
  "step": 330
436
  },
437
  {
438
  "epoch": 3.49,
439
+ "learning_rate": 9.83197387995186e-06,
440
+ "loss": 2.4846,
441
  "step": 335
442
  },
443
  {
444
  "epoch": 3.54,
445
+ "learning_rate": 9.820090581514799e-06,
446
+ "loss": 2.6538,
447
  "step": 340
448
  },
449
  {
450
  "epoch": 3.59,
451
+ "learning_rate": 9.807808988414811e-06,
452
+ "loss": 2.5469,
453
  "step": 345
454
  },
455
  {
456
  "epoch": 3.65,
457
+ "learning_rate": 9.795130115506887e-06,
458
+ "loss": 2.6193,
459
  "step": 350
460
  },
461
  {
462
  "epoch": 3.7,
463
+ "learning_rate": 9.78205501047412e-06,
464
+ "loss": 2.5287,
465
  "step": 355
466
  },
467
  {
468
  "epoch": 3.75,
469
+ "learning_rate": 9.768584753741134e-06,
470
+ "loss": 2.5179,
471
  "step": 360
472
  },
473
  {
474
  "epoch": 3.8,
475
+ "learning_rate": 9.754720458384808e-06,
476
+ "loss": 2.6582,
477
  "step": 365
478
  },
479
  {
480
  "epoch": 3.85,
481
+ "learning_rate": 9.740463270042289e-06,
482
+ "loss": 2.6538,
483
  "step": 370
484
  },
485
  {
486
  "epoch": 3.91,
487
+ "learning_rate": 9.72581436681634e-06,
488
+ "loss": 2.641,
489
  "step": 375
490
  },
491
  {
492
  "epoch": 3.96,
493
+ "learning_rate": 9.710774959177983e-06,
494
+ "loss": 2.6413,
495
  "step": 380
496
  },
497
  {
498
  "epoch": 4.0,
499
+ "eval_loss": 2.589087724685669,
500
+ "eval_runtime": 173.3132,
501
+ "eval_samples_per_second": 4.42,
502
+ "eval_steps_per_second": 1.108,
503
  "step": 384
504
  },
505
  {
506
  "epoch": 4.01,
507
+ "learning_rate": 9.695346289866478e-06,
508
+ "loss": 2.6235,
509
  "step": 385
510
  },
511
  {
512
  "epoch": 4.06,
513
+ "learning_rate": 9.67952963378663e-06,
514
+ "loss": 2.5417,
515
  "step": 390
516
  },
517
  {
518
  "epoch": 4.11,
519
+ "learning_rate": 9.66332629790344e-06,
520
+ "loss": 2.5655,
521
  "step": 395
522
  },
523
  {
524
  "epoch": 4.17,
525
+ "learning_rate": 9.646737621134112e-06,
526
+ "loss": 2.6339,
527
  "step": 400
528
  },
529
  {
530
  "epoch": 4.22,
531
+ "learning_rate": 9.629764974237416e-06,
532
+ "loss": 2.6498,
533
  "step": 405
534
  },
535
  {
536
  "epoch": 4.27,
537
+ "learning_rate": 9.612409759700412e-06,
538
+ "loss": 2.5508,
539
  "step": 410
540
  },
541
  {
542
  "epoch": 4.32,
543
+ "learning_rate": 9.594673411622563e-06,
544
+ "loss": 2.6466,
545
  "step": 415
546
  },
547
  {
548
  "epoch": 4.38,
549
+ "learning_rate": 9.576557395597237e-06,
550
+ "loss": 2.4891,
551
  "step": 420
552
  },
553
  {
554
  "epoch": 4.43,
555
+ "learning_rate": 9.558063208590594e-06,
556
+ "loss": 2.5521,
557
  "step": 425
558
  },
559
  {
560
  "epoch": 4.48,
561
+ "learning_rate": 9.539192378817894e-06,
562
+ "loss": 2.517,
563
  "step": 430
564
  },
565
  {
566
  "epoch": 4.53,
567
+ "learning_rate": 9.519946465617217e-06,
568
+ "loss": 2.562,
569
  "step": 435
570
  },
571
  {
572
  "epoch": 4.58,
573
+ "learning_rate": 9.500327059320606e-06,
574
+ "loss": 2.7011,
575
  "step": 440
576
  },
577
  {
578
  "epoch": 4.64,
579
+ "learning_rate": 9.480335781122661e-06,
580
+ "loss": 2.4814,
581
  "step": 445
582
  },
583
  {
584
  "epoch": 4.69,
585
+ "learning_rate": 9.459974282946572e-06,
586
+ "loss": 2.6825,
587
  "step": 450
588
  },
589
  {
590
  "epoch": 4.74,
591
+ "learning_rate": 9.439244247307618e-06,
592
+ "loss": 2.5627,
593
  "step": 455
594
  },
595
  {
596
  "epoch": 4.79,
597
+ "learning_rate": 9.41814738717414e-06,
598
+ "loss": 2.5377,
599
  "step": 460
600
  },
601
  {
602
  "epoch": 4.84,
603
+ "learning_rate": 9.396685445825987e-06,
604
+ "loss": 2.6597,
605
  "step": 465
606
  },
607
  {
608
  "epoch": 4.9,
609
+ "learning_rate": 9.374860196710474e-06,
610
+ "loss": 2.6323,
611
  "step": 470
612
  },
613
  {
614
  "epoch": 4.95,
615
+ "learning_rate": 9.352673443295834e-06,
616
+ "loss": 2.609,
617
  "step": 475
618
  },
619
  {
620
  "epoch": 5.0,
621
+ "learning_rate": 9.330127018922195e-06,
622
+ "loss": 2.581,
623
  "step": 480
624
  },
625
  {
626
  "epoch": 5.0,
627
+ "eval_loss": 2.5825417041778564,
628
+ "eval_runtime": 173.2578,
629
+ "eval_samples_per_second": 4.421,
630
+ "eval_steps_per_second": 1.108,
631
  "step": 480
632
  },
633
  {
634
  "epoch": 5.05,
635
+ "learning_rate": 9.307222786650079e-06,
636
+ "loss": 2.6336,
637
  "step": 485
638
  },
639
  {
640
  "epoch": 5.1,
641
+ "learning_rate": 9.283962639106464e-06,
642
+ "loss": 2.6305,
643
  "step": 490
644
  },
645
  {
646
  "epoch": 5.16,
647
+ "learning_rate": 9.260348498328393e-06,
648
+ "loss": 2.5438,
649
  "step": 495
650
  },
651
  {
652
  "epoch": 5.21,
653
+ "learning_rate": 9.23638231560414e-06,
654
+ "loss": 2.5235,
655
  "step": 500
656
  },
657
  {
658
  "epoch": 5.26,
659
+ "learning_rate": 9.212066071311978e-06,
660
+ "loss": 2.5714,
661
  "step": 505
662
  },
663
  {
664
  "epoch": 5.31,
665
+ "learning_rate": 9.18740177475654e-06,
666
+ "loss": 2.6968,
667
  "step": 510
668
  },
669
  {
670
  "epoch": 5.36,
671
+ "learning_rate": 9.162391464002776e-06,
672
+ "loss": 2.5705,
673
  "step": 515
674
  },
675
  {
676
  "epoch": 5.42,
677
+ "learning_rate": 9.137037205707552e-06,
678
+ "loss": 2.5459,
679
  "step": 520
680
  },
681
  {
682
  "epoch": 5.47,
683
+ "learning_rate": 9.111341094948876e-06,
684
+ "loss": 2.5294,
685
  "step": 525
686
  },
687
  {
688
  "epoch": 5.52,
689
+ "learning_rate": 9.08530525505277e-06,
690
+ "loss": 2.5656,
691
  "step": 530
692
  },
693
  {
694
  "epoch": 5.57,
695
+ "learning_rate": 9.058931837417823e-06,
696
+ "loss": 2.6372,
697
  "step": 535
698
  },
699
  {
700
  "epoch": 5.62,
701
+ "learning_rate": 9.032223021337415e-06,
702
+ "loss": 2.539,
703
  "step": 540
704
  },
705
  {
706
  "epoch": 5.68,
707
+ "learning_rate": 9.00518101381963e-06,
708
+ "loss": 2.6005,
709
  "step": 545
710
  },
711
  {
712
  "epoch": 5.73,
713
+ "learning_rate": 8.9778080494049e-06,
714
+ "loss": 2.5895,
715
  "step": 550
716
  },
717
  {
718
  "epoch": 5.78,
719
+ "learning_rate": 8.950106389981346e-06,
720
+ "loss": 2.5937,
721
  "step": 555
722
  },
723
  {
724
  "epoch": 5.83,
725
+ "learning_rate": 8.92207832459788e-06,
726
+ "loss": 2.5635,
727
  "step": 560
728
  },
729
  {
730
  "epoch": 5.89,
731
+ "learning_rate": 8.893726169275054e-06,
732
+ "loss": 2.5176,
733
  "step": 565
734
  },
735
  {
736
  "epoch": 5.94,
737
+ "learning_rate": 8.865052266813686e-06,
738
+ "loss": 2.6101,
739
  "step": 570
740
  },
741
  {
742
  "epoch": 5.99,
743
+ "learning_rate": 8.836058986601263e-06,
744
+ "loss": 2.5884,
745
  "step": 575
746
  },
747
  {
748
  "epoch": 6.0,
749
+ "eval_loss": 2.577636957168579,
750
+ "eval_runtime": 173.3959,
751
+ "eval_samples_per_second": 4.418,
752
+ "eval_steps_per_second": 1.107,
753
  "step": 576
754
  },
755
  {
756
  "epoch": 6.04,
757
+ "learning_rate": 8.806748724416156e-06,
758
+ "loss": 2.5728,
759
  "step": 580
760
  },
761
  {
762
  "epoch": 6.09,
763
+ "learning_rate": 8.777123902229658e-06,
764
+ "loss": 2.652,
765
  "step": 585
766
  },
767
  {
768
  "epoch": 6.15,
769
+ "learning_rate": 8.747186968005837e-06,
770
+ "loss": 2.5489,
771
  "step": 590
772
  },
773
  {
774
  "epoch": 6.2,
775
+ "learning_rate": 8.71694039549927e-06,
776
+ "loss": 2.6273,
777
  "step": 595
778
  },
779
  {
780
  "epoch": 6.25,
781
+ "learning_rate": 8.68638668405062e-06,
782
+ "loss": 2.4337,
783
  "step": 600
784
  },
785
  {
786
  "epoch": 6.3,
787
+ "learning_rate": 8.655528358380121e-06,
788
+ "loss": 2.5657,
789
  "step": 605
790
  },
791
  {
792
  "epoch": 6.35,
793
+ "learning_rate": 8.624367968378941e-06,
794
+ "loss": 2.6715,
795
  "step": 610
796
  },
797
  {
798
  "epoch": 6.41,
799
+ "learning_rate": 8.59290808889849e-06,
800
+ "loss": 2.5736,
801
  "step": 615
802
  },
803
  {
804
  "epoch": 6.46,
805
+ "learning_rate": 8.561151319537656e-06,
806
+ "loss": 2.5889,
807
  "step": 620
808
  },
809
  {
810
  "epoch": 6.51,
811
+ "learning_rate": 8.52910028442798e-06,
812
+ "loss": 2.5689,
813
  "step": 625
814
  },
815
  {
816
  "epoch": 6.56,
817
+ "learning_rate": 8.496757632016836e-06,
818
+ "loss": 2.6729,
819
  "step": 630
820
  },
821
  {
822
  "epoch": 6.61,
823
+ "learning_rate": 8.46412603484857e-06,
824
+ "loss": 2.4771,
825
  "step": 635
826
  },
827
  {
828
  "epoch": 6.67,
829
+ "learning_rate": 8.43120818934367e-06,
830
+ "loss": 2.5163,
831
  "step": 640
832
  },
833
  {
834
  "epoch": 6.72,
835
+ "learning_rate": 8.398006815575949e-06,
836
+ "loss": 2.5749,
837
  "step": 645
838
  },
839
  {
840
  "epoch": 6.77,
841
+ "learning_rate": 8.364524657047789e-06,
842
+ "loss": 2.5716,
843
  "step": 650
844
  },
845
  {
846
  "epoch": 6.82,
847
+ "learning_rate": 8.330764480463427e-06,
848
+ "loss": 2.5333,
849
  "step": 655
850
  },
851
  {
852
  "epoch": 6.88,
853
+ "learning_rate": 8.296729075500345e-06,
854
+ "loss": 2.5299,
855
  "step": 660
856
  },
857
  {
858
  "epoch": 6.93,
859
+ "learning_rate": 8.262421254578749e-06,
860
+ "loss": 2.5526,
861
  "step": 665
862
  },
863
  {
864
  "epoch": 6.98,
865
+ "learning_rate": 8.227843852629174e-06,
866
+ "loss": 2.704,
867
  "step": 670
868
  },
869
  {
870
  "epoch": 7.0,
871
+ "eval_loss": 2.57405686378479,
872
+ "eval_runtime": 173.1315,
873
+ "eval_samples_per_second": 4.424,
874
+ "eval_steps_per_second": 1.109,
875
  "step": 672
876
  },
877
  {
878
  "epoch": 7.03,
879
+ "learning_rate": 8.192999726858227e-06,
880
+ "loss": 2.5605,
881
  "step": 675
882
  },
883
  {
884
  "epoch": 7.08,
885
+ "learning_rate": 8.157891756512488e-06,
886
+ "loss": 2.6373,
887
  "step": 680
888
  },
889
  {
890
  "epoch": 7.14,
891
+ "learning_rate": 8.122522842640596e-06,
892
+ "loss": 2.6236,
893
  "step": 685
894
  },
895
  {
896
  "epoch": 7.19,
897
+ "learning_rate": 8.086895907853526e-06,
898
+ "loss": 2.5717,
899
  "step": 690
900
  },
901
  {
902
  "epoch": 7.24,
903
+ "learning_rate": 8.051013896083084e-06,
904
+ "loss": 2.5228,
905
  "step": 695
906
  },
907
  {
908
  "epoch": 7.29,
909
+ "learning_rate": 8.014879772338649e-06,
910
+ "loss": 2.5166,
911
  "step": 700
912
  },
913
  {
914
  "epoch": 7.34,
915
+ "learning_rate": 7.978496522462167e-06,
916
+ "loss": 2.5228,
917
  "step": 705
918
  },
919
  {
920
  "epoch": 7.4,
921
+ "learning_rate": 7.941867152881423e-06,
922
+ "loss": 2.7549,
923
  "step": 710
924
  },
925
  {
926
  "epoch": 7.45,
927
+ "learning_rate": 7.904994690361612e-06,
928
+ "loss": 2.5288,
929
  "step": 715
930
  },
931
  {
932
  "epoch": 7.5,
933
+ "learning_rate": 7.86788218175523e-06,
934
+ "loss": 2.582,
935
  "step": 720
936
  },
937
  {
938
  "epoch": 7.55,
939
+ "learning_rate": 7.830532693750314e-06,
940
+ "loss": 2.5402,
941
  "step": 725
942
  },
943
  {
944
  "epoch": 7.6,
945
+ "learning_rate": 7.792949312617023e-06,
946
+ "loss": 2.5406,
947
  "step": 730
948
  },
949
  {
950
  "epoch": 7.66,
951
+ "learning_rate": 7.755135143952621e-06,
952
+ "loss": 2.4985,
953
  "step": 735
954
  },
955
  {
956
  "epoch": 7.71,
957
+ "learning_rate": 7.71709331242485e-06,
958
+ "loss": 2.6041,
959
  "step": 740
960
  },
961
  {
962
  "epoch": 7.76,
963
+ "learning_rate": 7.678826961513739e-06,
964
+ "loss": 2.544,
965
  "step": 745
966
  },
967
  {
968
  "epoch": 7.81,
969
+ "learning_rate": 7.64033925325184e-06,
970
+ "loss": 2.5934,
971
  "step": 750
972
  },
973
  {
974
  "epoch": 7.86,
975
+ "learning_rate": 7.601633367962955e-06,
976
+ "loss": 2.5599,
977
  "step": 755
978
  },
979
  {
980
  "epoch": 7.92,
981
+ "learning_rate": 7.562712503999327e-06,
982
+ "loss": 2.6012,
983
  "step": 760
984
  },
985
  {
986
  "epoch": 7.97,
987
+ "learning_rate": 7.523579877477361e-06,
988
+ "loss": 2.608,
989
  "step": 765
990
  },
991
  {
992
  "epoch": 8.0,
993
+ "eval_loss": 2.571471691131592,
994
+ "eval_runtime": 173.1738,
995
+ "eval_samples_per_second": 4.423,
996
+ "eval_steps_per_second": 1.109,
997
  "step": 768
998
  },
999
  {
1000
  "epoch": 8.02,
1001
+ "learning_rate": 7.484238722011869e-06,
1002
+ "loss": 2.6066,
1003
  "step": 770
1004
  },
1005
  {
1006
  "epoch": 8.07,
1007
+ "learning_rate": 7.444692288448864e-06,
1008
+ "loss": 2.5357,
1009
  "step": 775
1010
  },
1011
  {
1012
  "epoch": 8.12,
1013
+ "learning_rate": 7.404943844596939e-06,
1014
+ "loss": 2.487,
1015
  "step": 780
1016
  },
1017
  {
1018
  "epoch": 8.18,
1019
+ "learning_rate": 7.364996674957243e-06,
1020
+ "loss": 2.5873,
1021
  "step": 785
1022
  },
1023
  {
1024
  "epoch": 8.23,
1025
+ "learning_rate": 7.324854080452071e-06,
1026
+ "loss": 2.6906,
1027
  "step": 790
1028
  },
1029
  {
1030
  "epoch": 8.28,
1031
+ "learning_rate": 7.284519378152104e-06,
1032
+ "loss": 2.575,
1033
  "step": 795
1034
  },
1035
  {
1036
  "epoch": 8.33,
1037
+ "learning_rate": 7.243995901002312e-06,
1038
+ "loss": 2.5426,
1039
  "step": 800
1040
  },
1041
  {
1042
  "epoch": 8.39,
1043
+ "learning_rate": 7.203286997546543e-06,
1044
+ "loss": 2.5829,
1045
  "step": 805
1046
  },
1047
  {
1048
  "epoch": 8.44,
1049
+ "learning_rate": 7.162396031650831e-06,
1050
+ "loss": 2.5422,
1051
  "step": 810
1052
  },
1053
  {
1054
  "epoch": 8.49,
1055
+ "learning_rate": 7.121326382225429e-06,
1056
+ "loss": 2.5977,
1057
  "step": 815
1058
  },
1059
  {
1060
  "epoch": 8.54,
1061
+ "learning_rate": 7.080081442945597e-06,
1062
+ "loss": 2.5636,
1063
  "step": 820
1064
  },
1065
  {
1066
  "epoch": 8.59,
1067
+ "learning_rate": 7.038664621971184e-06,
1068
+ "loss": 2.6223,
1069
  "step": 825
1070
  },
1071
  {
1072
  "epoch": 8.65,
1073
+ "learning_rate": 6.997079341665003e-06,
1074
+ "loss": 2.5567,
1075
  "step": 830
1076
  },
1077
  {
1078
  "epoch": 8.7,
1079
+ "learning_rate": 6.955329038310028e-06,
1080
+ "loss": 2.5051,
1081
  "step": 835
1082
  },
1083
  {
1084
  "epoch": 8.75,
1085
+ "learning_rate": 6.913417161825449e-06,
1086
+ "loss": 2.5383,
1087
  "step": 840
1088
  },
1089
  {
1090
  "epoch": 8.8,
1091
+ "learning_rate": 6.871347175481602e-06,
1092
+ "loss": 2.5363,
1093
  "step": 845
1094
  },
1095
  {
1096
  "epoch": 8.85,
1097
+ "learning_rate": 6.829122555613786e-06,
1098
+ "loss": 2.6013,
1099
  "step": 850
1100
  },
1101
  {
1102
  "epoch": 8.91,
1103
+ "learning_rate": 6.786746791335001e-06,
1104
+ "loss": 2.6019,
1105
  "step": 855
1106
  },
1107
  {
1108
  "epoch": 8.96,
1109
+ "learning_rate": 6.7442233842476545e-06,
1110
+ "loss": 2.5454,
1111
  "step": 860
1112
  },
1113
  {
1114
  "epoch": 9.0,
1115
+ "eval_loss": 2.5697662830352783,
1116
+ "eval_runtime": 173.2483,
1117
+ "eval_samples_per_second": 4.421,
1118
+ "eval_steps_per_second": 1.108,
1119
  "step": 864
1120
  },
1121
  {
1122
  "epoch": 9.01,
1123
+ "learning_rate": 6.701555848154193e-06,
1124
+ "loss": 2.6289,
1125
  "step": 865
1126
  },
1127
  {
1128
  "epoch": 9.06,
1129
+ "learning_rate": 6.6587477087667615e-06,
1130
+ "loss": 2.6078,
1131
  "step": 870
1132
  },
1133
  {
1134
  "epoch": 9.11,
1135
+ "learning_rate": 6.615802503415865e-06,
1136
+ "loss": 2.5328,
1137
  "step": 875
1138
  },
1139
  {
1140
  "epoch": 9.17,
1141
+ "learning_rate": 6.572723780758069e-06,
1142
+ "loss": 2.537,
1143
  "step": 880
1144
  },
1145
  {
1146
  "epoch": 9.22,
1147
+ "learning_rate": 6.529515100482768e-06,
1148
+ "loss": 2.5444,
1149
  "step": 885
1150
  },
1151
  {
1152
  "epoch": 9.27,
1153
+ "learning_rate": 6.486180033018039e-06,
1154
+ "loss": 2.4951,
1155
  "step": 890
1156
  },
1157
  {
1158
  "epoch": 9.32,
1159
+ "learning_rate": 6.442722159235608e-06,
1160
+ "loss": 2.5551,
1161
  "step": 895
1162
  },
1163
  {
1164
  "epoch": 9.38,
1165
+ "learning_rate": 6.399145070154962e-06,
1166
+ "loss": 2.6229,
1167
  "step": 900
1168
  },
1169
  {
1170
  "epoch": 9.43,
1171
+ "learning_rate": 6.355452366646602e-06,
1172
+ "loss": 2.4483,
1173
  "step": 905
1174
  },
1175
  {
1176
  "epoch": 9.48,
1177
+ "learning_rate": 6.311647659134509e-06,
1178
+ "loss": 2.659,
1179
  "step": 910
1180
  },
1181
  {
1182
  "epoch": 9.53,
1183
+ "learning_rate": 6.267734567297799e-06,
1184
+ "loss": 2.565,
1185
  "step": 915
1186
  },
1187
  {
1188
  "epoch": 9.58,
1189
+ "learning_rate": 6.2237167197716195e-06,
1190
+ "loss": 2.5539,
1191
  "step": 920
1192
  },
1193
  {
1194
  "epoch": 9.64,
1195
+ "learning_rate": 6.179597753847317e-06,
1196
+ "loss": 2.5881,
1197
  "step": 925
1198
  },
1199
  {
1200
  "epoch": 9.69,
1201
+ "learning_rate": 6.135381315171867e-06,
1202
+ "loss": 2.5665,
1203
  "step": 930
1204
  },
1205
  {
1206
  "epoch": 9.74,
1207
+ "learning_rate": 6.091071057446635e-06,
1208
+ "loss": 2.5365,
1209
  "step": 935
1210
  },
1211
  {
1212
  "epoch": 9.79,
1213
+ "learning_rate": 6.046670642125461e-06,
1214
+ "loss": 2.6475,
1215
  "step": 940
1216
  },
1217
  {
1218
  "epoch": 9.84,
1219
+ "learning_rate": 6.002183738112103e-06,
1220
+ "loss": 2.6788,
1221
  "step": 945
1222
  },
1223
  {
1224
  "epoch": 9.9,
1225
+ "learning_rate": 5.957614021457072e-06,
1226
+ "loss": 2.5368,
1227
  "step": 950
1228
  },
1229
  {
1230
  "epoch": 9.95,
1231
+ "learning_rate": 5.912965175053867e-06,
1232
+ "loss": 2.547,
1233
  "step": 955
1234
  },
1235
  {
1236
  "epoch": 10.0,
1237
+ "learning_rate": 5.8682408883346535e-06,
1238
+ "loss": 2.5938,
1239
  "step": 960
1240
  },
1241
  {
1242
  "epoch": 10.0,
1243
+ "eval_loss": 2.5687718391418457,
1244
+ "eval_runtime": 173.1474,
1245
+ "eval_samples_per_second": 4.424,
1246
+ "eval_steps_per_second": 1.109,
1247
  "step": 960
1248
  },
1249
  {
1250
  "epoch": 10.05,
1251
+ "learning_rate": 5.823444856965393e-06,
1252
+ "loss": 2.6,
1253
  "step": 965
1254
  },
1255
  {
1256
  "epoch": 10.1,
1257
+ "learning_rate": 5.77858078254047e-06,
1258
+ "loss": 2.6841,
1259
  "step": 970
1260
  },
1261
  {
1262
  "epoch": 10.16,
1263
+ "learning_rate": 5.733652372276809e-06,
1264
+ "loss": 2.6076,
1265
  "step": 975
1266
  },
1267
  {
1268
  "epoch": 10.21,
1269
+ "learning_rate": 5.688663338707554e-06,
1270
+ "loss": 2.6151,
1271
  "step": 980
1272
  },
1273
  {
1274
  "epoch": 10.26,
1275
+ "learning_rate": 5.643617399375281e-06,
1276
+ "loss": 2.5886,
1277
  "step": 985
1278
  },
1279
  {
1280
  "epoch": 10.31,
1281
+ "learning_rate": 5.598518276524813e-06,
1282
+ "loss": 2.6533,
1283
  "step": 990
1284
  },
1285
  {
1286
  "epoch": 10.36,
1287
+ "learning_rate": 5.553369696795647e-06,
1288
+ "loss": 2.3891,
1289
  "step": 995
1290
  },
1291
  {
1292
  "epoch": 10.42,
1293
+ "learning_rate": 5.50817539091401e-06,
1294
+ "loss": 2.697,
1295
  "step": 1000
1296
  },
1297
  {
1298
  "epoch": 10.47,
1299
+ "learning_rate": 5.462939093384579e-06,
1300
+ "loss": 2.5268,
1301
  "step": 1005
1302
  },
1303
  {
1304
  "epoch": 10.52,
1305
+ "learning_rate": 5.417664542181894e-06,
1306
+ "loss": 2.5242,
1307
  "step": 1010
1308
  },
1309
  {
1310
  "epoch": 10.57,
1311
+ "learning_rate": 5.372355478441483e-06,
1312
+ "loss": 2.5093,
1313
  "step": 1015
1314
  },
1315
  {
1316
  "epoch": 10.62,
1317
+ "learning_rate": 5.327015646150716e-06,
1318
+ "loss": 2.5013,
1319
  "step": 1020
1320
  },
1321
  {
1322
  "epoch": 10.68,
1323
+ "learning_rate": 5.2816487918394385e-06,
1324
+ "loss": 2.4911,
1325
  "step": 1025
1326
  },
1327
  {
1328
  "epoch": 10.73,
1329
+ "learning_rate": 5.236258664270385e-06,
1330
+ "loss": 2.6194,
1331
  "step": 1030
1332
  },
1333
  {
1334
  "epoch": 10.78,
1335
+ "learning_rate": 5.1908490141294085e-06,
1336
+ "loss": 2.4813,
1337
  "step": 1035
1338
  },
1339
  {
1340
  "epoch": 10.83,
1341
+ "learning_rate": 5.145423593715558e-06,
1342
+ "loss": 2.5679,
1343
  "step": 1040
1344
  },
1345
  {
1346
  "epoch": 10.89,
1347
+ "learning_rate": 5.09998615663101e-06,
1348
+ "loss": 2.5244,
1349
  "step": 1045
1350
  },
1351
  {
1352
  "epoch": 10.94,
1353
+ "learning_rate": 5.054540457470912e-06,
1354
+ "loss": 2.6256,
1355
  "step": 1050
1356
  },
1357
  {
1358
  "epoch": 10.99,
1359
+ "learning_rate": 5.009090251513119e-06,
1360
+ "loss": 2.6129,
1361
  "step": 1055
1362
  },
1363
  {
1364
  "epoch": 11.0,
1365
+ "eval_loss": 2.568225145339966,
1366
+ "eval_runtime": 173.1937,
1367
  "eval_samples_per_second": 4.423,
1368
  "eval_steps_per_second": 1.109,
1369
  "step": 1056
1370
  },
1371
  {
1372
  "epoch": 11.04,
1373
+ "learning_rate": 4.963639294407893e-06,
1374
+ "loss": 2.4814,
1375
  "step": 1060
1376
  },
1377
  {
1378
  "epoch": 11.09,
1379
+ "learning_rate": 4.918191341867566e-06,
1380
+ "loss": 2.6482,
1381
  "step": 1065
1382
  },
1383
  {
1384
  "epoch": 11.15,
1385
+ "learning_rate": 4.8727501493562e-06,
1386
+ "loss": 2.6263,
1387
  "step": 1070
1388
  },
1389
  {
1390
  "epoch": 11.2,
1391
+ "learning_rate": 4.827319471779255e-06,
1392
+ "loss": 2.6835,
1393
  "step": 1075
1394
  },
1395
  {
1396
  "epoch": 11.25,
1397
+ "learning_rate": 4.781903063173321e-06,
1398
+ "loss": 2.4925,
1399
  "step": 1080
1400
  },
1401
  {
1402
  "epoch": 11.3,
1403
+ "learning_rate": 4.736504676395912e-06,
1404
+ "loss": 2.51,
1405
  "step": 1085
1406
  },
1407
  {
1408
  "epoch": 11.35,
1409
+ "learning_rate": 4.691128062815361e-06,
1410
+ "loss": 2.5933,
1411
  "step": 1090
1412
  },
1413
  {
1414
  "epoch": 11.41,
1415
+ "learning_rate": 4.64577697200083e-06,
1416
+ "loss": 2.6073,
1417
  "step": 1095
1418
  },
1419
  {
1420
  "epoch": 11.46,
1421
+ "learning_rate": 4.600455151412482e-06,
1422
+ "loss": 2.6271,
1423
  "step": 1100
1424
  },
1425
  {
1426
  "epoch": 11.51,
1427
+ "learning_rate": 4.555166346091811e-06,
1428
+ "loss": 2.5913,
1429
  "step": 1105
1430
  },
1431
  {
1432
  "epoch": 11.56,
1433
+ "learning_rate": 4.509914298352197e-06,
1434
+ "loss": 2.4734,
1435
  "step": 1110
1436
  },
1437
  {
1438
  "epoch": 11.61,
1439
+ "learning_rate": 4.464702747469654e-06,
1440
+ "loss": 2.4006,
1441
  "step": 1115
1442
  },
1443
  {
1444
  "epoch": 11.67,
1445
+ "learning_rate": 4.4195354293738484e-06,
1446
+ "loss": 2.5788,
1447
  "step": 1120
1448
  },
1449
  {
1450
  "epoch": 11.72,
1451
+ "learning_rate": 4.374416076339405e-06,
1452
+ "loss": 2.6018,
1453
  "step": 1125
1454
  },
1455
  {
1456
  "epoch": 11.77,
1457
+ "learning_rate": 4.3293484166774795e-06,
1458
+ "loss": 2.3919,
1459
  "step": 1130
1460
  },
1461
  {
1462
  "epoch": 11.82,
1463
+ "learning_rate": 4.2843361744276965e-06,
1464
+ "loss": 2.5624,
1465
  "step": 1135
1466
  },
1467
  {
1468
  "epoch": 11.88,
1469
+ "learning_rate": 4.239383069050417e-06,
1470
+ "loss": 2.6089,
1471
  "step": 1140
1472
  },
1473
  {
1474
  "epoch": 11.93,
1475
+ "learning_rate": 4.194492815119393e-06,
1476
+ "loss": 2.6184,
1477
  "step": 1145
1478
  },
1479
  {
1480
  "epoch": 11.98,
1481
+ "learning_rate": 4.149669122014823e-06,
1482
+ "loss": 2.6334,
1483
  "step": 1150
1484
  },
1485
  {
1486
  "epoch": 12.0,
1487
+ "eval_loss": 2.5679450035095215,
1488
+ "eval_runtime": 173.1818,
1489
+ "eval_samples_per_second": 4.423,
1490
+ "eval_steps_per_second": 1.109,
1491
  "step": 1152
1492
  },
1493
  {
1494
  "epoch": 12.03,
1495
+ "learning_rate": 4.104915693616838e-06,
1496
+ "loss": 2.6666,
1497
  "step": 1155
1498
  },
1499
  {
1500
  "epoch": 12.08,
1501
+ "learning_rate": 4.060236227999441e-06,
1502
+ "loss": 2.5513,
1503
  "step": 1160
1504
  },
1505
  {
1506
  "epoch": 12.14,
1507
+ "learning_rate": 4.015634417124932e-06,
1508
+ "loss": 2.4863,
1509
  "step": 1165
1510
  },
1511
  {
1512
  "epoch": 12.19,
1513
+ "learning_rate": 3.971113946538826e-06,
1514
+ "loss": 2.5857,
1515
  "step": 1170
1516
  },
1517
  {
1518
  "epoch": 12.24,
1519
+ "learning_rate": 3.926678495065313e-06,
1520
+ "loss": 2.6008,
1521
  "step": 1175
1522
  },
1523
  {
1524
  "epoch": 12.29,
1525
+ "learning_rate": 3.882331734503263e-06,
1526
+ "loss": 2.5744,
1527
  "step": 1180
1528
  },
1529
  {
1530
  "epoch": 12.34,
1531
+ "learning_rate": 3.838077329322828e-06,
1532
+ "loss": 2.6885,
1533
  "step": 1185
1534
  },
1535
  {
1536
  "epoch": 12.4,
1537
+ "learning_rate": 3.7939189363626282e-06,
1538
+ "loss": 2.6531,
1539
  "step": 1190
1540
  },
1541
  {
1542
  "epoch": 12.45,
1543
+ "learning_rate": 3.7498602045275846e-06,
1544
+ "loss": 2.6082,
1545
  "step": 1195
1546
  },
1547
  {
1548
  "epoch": 12.5,
1549
+ "learning_rate": 3.705904774487396e-06,
1550
+ "loss": 2.5587,
1551
  "step": 1200
1552
  },
1553
  {
1554
  "epoch": 12.55,
1555
+ "learning_rate": 3.6620562783757163e-06,
1556
+ "loss": 2.4835,
1557
  "step": 1205
1558
  },
1559
  {
1560
  "epoch": 12.6,
1561
+ "learning_rate": 3.618318339490009e-06,
1562
+ "loss": 2.5595,
1563
  "step": 1210
1564
  },
1565
  {
1566
  "epoch": 12.66,
1567
+ "learning_rate": 3.5746945719921476e-06,
1568
+ "loss": 2.5684,
1569
  "step": 1215
1570
  },
1571
  {
1572
  "epoch": 12.71,
1573
+ "learning_rate": 3.531188580609778e-06,
1574
+ "loss": 2.4877,
1575
  "step": 1220
1576
  },
1577
  {
1578
  "epoch": 12.76,
1579
+ "learning_rate": 3.4878039603384505e-06,
1580
+ "loss": 2.7007,
1581
  "step": 1225
1582
  },
1583
  {
1584
  "epoch": 12.81,
1585
+ "learning_rate": 3.444544296144546e-06,
1586
+ "loss": 2.5391,
1587
  "step": 1230
1588
  },
1589
  {
1590
  "epoch": 12.86,
1591
+ "learning_rate": 3.401413162669057e-06,
1592
+ "loss": 2.5575,
1593
  "step": 1235
1594
  },
1595
  {
1596
  "epoch": 12.92,
1597
+ "learning_rate": 3.3584141239321953e-06,
1598
+ "loss": 2.5095,
1599
  "step": 1240
1600
  },
1601
  {
1602
  "epoch": 12.97,
1603
+ "learning_rate": 3.3155507330389004e-06,
1604
+ "loss": 2.5013,
1605
  "step": 1245
1606
  },
1607
  {
1608
  "epoch": 13.0,
1609
+ "eval_loss": 2.567814350128174,
1610
+ "eval_runtime": 173.1218,
1611
+ "eval_samples_per_second": 4.425,
1612
+ "eval_steps_per_second": 1.109,
1613
  "step": 1248
1614
  },
1615
  {
1616
  "epoch": 13.02,
1617
+ "learning_rate": 3.272826531885229e-06,
1618
+ "loss": 2.4931,
1619
  "step": 1250
1620
  },
1621
  {
1622
  "epoch": 13.07,
1623
+ "learning_rate": 3.2302450508656835e-06,
1624
+ "loss": 2.6623,
1625
  "step": 1255
1626
  },
1627
  {
1628
  "epoch": 13.12,
1629
+ "learning_rate": 3.1878098085814926e-06,
1630
+ "loss": 2.6147,
1631
  "step": 1260
1632
  },
1633
  {
1634
  "epoch": 13.18,
1635
+ "learning_rate": 3.1455243115498523e-06,
1636
+ "loss": 2.4961,
1637
  "step": 1265
1638
  },
1639
  {
1640
  "epoch": 13.23,
1641
+ "learning_rate": 3.1033920539141837e-06,
1642
+ "loss": 2.5426,
1643
  "step": 1270
1644
  },
1645
  {
1646
  "epoch": 13.28,
1647
+ "learning_rate": 3.061416517155397e-06,
1648
+ "loss": 2.6229,
1649
  "step": 1275
1650
  },
1651
  {
1652
  "epoch": 13.33,
1653
+ "learning_rate": 3.019601169804216e-06,
1654
+ "loss": 2.5512,
1655
  "step": 1280
1656
  },
1657
  {
1658
  "epoch": 13.39,
1659
+ "learning_rate": 2.97794946715456e-06,
1660
+ "loss": 2.5486,
1661
  "step": 1285
1662
  },
1663
  {
1664
  "epoch": 13.44,
1665
+ "learning_rate": 2.936464850978027e-06,
1666
+ "loss": 2.5661,
1667
  "step": 1290
1668
  },
1669
  {
1670
  "epoch": 13.49,
1671
+ "learning_rate": 2.8951507492394937e-06,
1672
+ "loss": 2.5613,
1673
  "step": 1295
1674
  },
1675
  {
1676
  "epoch": 13.54,
1677
+ "learning_rate": 2.854010575813856e-06,
1678
+ "loss": 2.5139,
1679
  "step": 1300
1680
  },
1681
  {
1682
  "epoch": 13.59,
1683
+ "learning_rate": 2.8130477302039292e-06,
1684
+ "loss": 2.5344,
1685
  "step": 1305
1686
  },
1687
  {
1688
  "epoch": 13.65,
1689
+ "learning_rate": 2.7722655972595438e-06,
1690
+ "loss": 2.6115,
1691
  "step": 1310
1692
  },
1693
  {
1694
  "epoch": 13.7,
1695
+ "learning_rate": 2.731667546897845e-06,
1696
+ "loss": 2.551,
1697
  "step": 1315
1698
  },
1699
  {
1700
  "epoch": 13.75,
1701
+ "learning_rate": 2.6912569338248317e-06,
1702
+ "loss": 2.6828,
1703
  "step": 1320
1704
  },
1705
  {
1706
  "epoch": 13.8,
1707
+ "learning_rate": 2.6510370972581455e-06,
1708
+ "loss": 2.502,
1709
  "step": 1325
1710
  },
1711
  {
1712
  "epoch": 13.85,
1713
+ "learning_rate": 2.61101136065115e-06,
1714
+ "loss": 2.6054,
1715
  "step": 1330
1716
  },
1717
  {
1718
  "epoch": 13.91,
1719
+ "learning_rate": 2.5711830314182996e-06,
1720
+ "loss": 2.6214,
1721
  "step": 1335
1722
  },
1723
  {
1724
  "epoch": 13.96,
1725
+ "learning_rate": 2.5315554006618487e-06,
1726
+ "loss": 2.519,
1727
  "step": 1340
1728
  },
1729
  {
1730
  "epoch": 14.0,
1731
+ "eval_loss": 2.567744255065918,
1732
+ "eval_runtime": 174.5513,
1733
+ "eval_samples_per_second": 4.388,
1734
+ "eval_steps_per_second": 1.1,
1735
  "step": 1344
1736
  },
1737
  {
1738
  "epoch": 14.01,
1739
+ "learning_rate": 2.4921317428998924e-06,
1740
+ "loss": 2.5582,
1741
  "step": 1345
1742
  },
1743
  {
1744
  "epoch": 14.06,
1745
+ "learning_rate": 2.4529153157957913e-06,
1746
+ "loss": 2.4842,
1747
  "step": 1350
1748
  },
1749
  {
1750
  "epoch": 14.11,
1751
+ "learning_rate": 2.4139093598889806e-06,
1752
+ "loss": 2.5287,
1753
  "step": 1355
1754
  },
1755
  {
1756
  "epoch": 14.17,
1757
+ "learning_rate": 2.3751170983272e-06,
1758
+ "loss": 2.6123,
1759
  "step": 1360
1760
  },
1761
  {
1762
  "epoch": 14.22,
1763
+ "learning_rate": 2.3365417366001552e-06,
1764
+ "loss": 2.6674,
1765
  "step": 1365
1766
  },
1767
  {
1768
  "epoch": 14.27,
1769
+ "learning_rate": 2.2981864622746438e-06,
1770
+ "loss": 2.6728,
1771
  "step": 1370
1772
  },
1773
  {
1774
  "epoch": 14.32,
1775
+ "learning_rate": 2.260054444731155e-06,
1776
+ "loss": 2.5313,
1777
  "step": 1375
1778
  },
1779
  {
1780
  "epoch": 14.38,
1781
+ "learning_rate": 2.2221488349019903e-06,
1782
+ "loss": 2.664,
1783
  "step": 1380
1784
  },
1785
  {
1786
  "epoch": 14.43,
1787
+ "learning_rate": 2.184472765010871e-06,
1788
+ "loss": 2.4754,
1789
  "step": 1385
1790
  },
1791
  {
1792
  "epoch": 14.48,
1793
+ "learning_rate": 2.147029348314136e-06,
1794
+ "loss": 2.6064,
1795
  "step": 1390
1796
  },
1797
  {
1798
  "epoch": 14.53,
1799
+ "learning_rate": 2.109821678843484e-06,
1800
+ "loss": 2.5408,
1801
  "step": 1395
1802
  },
1803
  {
1804
  "epoch": 14.58,
1805
+ "learning_rate": 2.0728528311502977e-06,
1806
+ "loss": 2.6351,
1807
  "step": 1400
1808
  },
1809
  {
1810
  "epoch": 14.64,
1811
+ "learning_rate": 2.036125860051594e-06,
1812
+ "loss": 2.4797,
1813
  "step": 1405
1814
  },
1815
  {
1816
  "epoch": 14.69,
1817
+ "learning_rate": 1.999643800377596e-06,
1818
+ "loss": 2.6001,
1819
  "step": 1410
1820
  },
1821
  {
1822
  "epoch": 14.74,
1823
+ "learning_rate": 1.963409666720958e-06,
1824
+ "loss": 2.5746,
1825
  "step": 1415
1826
  },
1827
  {
1828
  "epoch": 14.79,
1829
+ "learning_rate": 1.927426453187663e-06,
1830
+ "loss": 2.5829,
1831
  "step": 1420
1832
  },
1833
  {
1834
  "epoch": 14.84,
1835
+ "learning_rate": 1.8916971331496143e-06,
1836
+ "loss": 2.5435,
1837
  "step": 1425
1838
  },
1839
  {
1840
  "epoch": 14.9,
1841
+ "learning_rate": 1.8562246589989369e-06,
1842
+ "loss": 2.5677,
1843
  "step": 1430
1844
  },
1845
  {
1846
  "epoch": 14.95,
1847
+ "learning_rate": 1.8210119619040206e-06,
1848
+ "loss": 2.4879,
1849
  "step": 1435
1850
  },
1851
  {
1852
  "epoch": 15.0,
1853
+ "learning_rate": 1.7860619515673034e-06,
1854
+ "loss": 2.5076,
1855
  "step": 1440
1856
  },
1857
  {
1858
  "epoch": 15.0,
1859
+ "eval_loss": 2.5677287578582764,
1860
+ "eval_runtime": 174.6013,
1861
  "eval_samples_per_second": 4.387,
1862
  "eval_steps_per_second": 1.1,
1863
  "step": 1440
1864
  },
1865
  {
1866
  "epoch": 15.05,
1867
+ "learning_rate": 1.75137751598484e-06,
1868
+ "loss": 2.522,
1869
  "step": 1445
1870
  },
1871
  {
1872
  "epoch": 15.1,
1873
+ "learning_rate": 1.7169615212076612e-06,
1874
+ "loss": 2.5531,
1875
  "step": 1450
1876
  },
1877
  {
1878
  "epoch": 15.16,
1879
+ "learning_rate": 1.6828168111049454e-06,
1880
+ "loss": 2.6733,
1881
  "step": 1455
1882
  },
1883
  {
1884
  "epoch": 15.21,
1885
+ "learning_rate": 1.6489462071290213e-06,
1886
+ "loss": 2.5552,
1887
  "step": 1460
1888
  },
1889
  {
1890
  "epoch": 15.26,
1891
+ "learning_rate": 1.615352508082229e-06,
1892
+ "loss": 2.5467,
1893
  "step": 1465
1894
  },
1895
  {
1896
  "epoch": 15.31,
1897
+ "learning_rate": 1.5820384898856433e-06,
1898
+ "loss": 2.4187,
1899
  "step": 1470
1900
  },
1901
  {
1902
  "epoch": 15.36,
1903
+ "learning_rate": 1.549006905349702e-06,
1904
+ "loss": 2.5954,
1905
  "step": 1475
1906
  },
1907
  {
1908
  "epoch": 15.42,
1909
+ "learning_rate": 1.5162604839467265e-06,
1910
+ "loss": 2.6387,
1911
  "step": 1480
1912
  },
1913
  {
1914
  "epoch": 15.47,
1915
+ "learning_rate": 1.4838019315853796e-06,
1916
+ "loss": 2.6038,
1917
  "step": 1485
1918
  },
1919
  {
1920
  "epoch": 15.52,
1921
+ "learning_rate": 1.4516339303870763e-06,
1922
+ "loss": 2.5639,
1923
  "step": 1490
1924
  },
1925
  {
1926
  "epoch": 15.57,
1927
+ "learning_rate": 1.419759138464355e-06,
1928
+ "loss": 2.5704,
1929
  "step": 1495
1930
  },
1931
  {
1932
  "epoch": 15.62,
1933
+ "learning_rate": 1.3881801897012225e-06,
1934
+ "loss": 2.4915,
1935
  "step": 1500
1936
  },
1937
  {
1938
  "epoch": 15.68,
1939
+ "learning_rate": 1.3568996935355194e-06,
1940
+ "loss": 2.5739,
1941
  "step": 1505
1942
  },
1943
  {
1944
  "epoch": 15.73,
1945
+ "learning_rate": 1.325920234743291e-06,
1946
+ "loss": 2.6971,
1947
  "step": 1510
1948
  },
1949
  {
1950
  "epoch": 15.78,
1951
+ "learning_rate": 1.2952443732252058e-06,
1952
+ "loss": 2.5222,
1953
  "step": 1515
1954
  },
1955
  {
1956
  "epoch": 15.83,
1957
+ "learning_rate": 1.264874643795021e-06,
1958
+ "loss": 2.4783,
1959
  "step": 1520
1960
  },
1961
  {
1962
  "epoch": 15.89,
1963
+ "learning_rate": 1.234813555970129e-06,
1964
+ "loss": 2.5954,
1965
  "step": 1525
1966
  },
1967
  {
1968
  "epoch": 15.94,
1969
+ "learning_rate": 1.2050635937641909e-06,
1970
+ "loss": 2.6067,
1971
  "step": 1530
1972
  },
1973
  {
1974
  "epoch": 15.99,
1975
+ "learning_rate": 1.1756272154818715e-06,
1976
+ "loss": 2.5443,
1977
  "step": 1535
1978
  },
1979
  {
1980
  "epoch": 16.0,
1981
+ "eval_loss": 2.567720413208008,
1982
+ "eval_runtime": 174.877,
1983
+ "eval_samples_per_second": 4.38,
1984
+ "eval_steps_per_second": 1.098,
1985
  "step": 1536
1986
  },
1987
  {
1988
  "epoch": 16.04,
1989
+ "learning_rate": 1.1465068535157098e-06,
1990
+ "loss": 2.5967,
1991
  "step": 1540
1992
  },
1993
  {
1994
  "epoch": 16.09,
1995
+ "learning_rate": 1.1177049141451223e-06,
1996
+ "loss": 2.6367,
1997
  "step": 1545
1998
  },
1999
  {
2000
  "epoch": 16.15,
2001
+ "learning_rate": 1.089223777337568e-06,
2002
+ "loss": 2.5624,
2003
  "step": 1550
2004
  },
2005
  {
2006
  "epoch": 16.2,
2007
+ "learning_rate": 1.0610657965518861e-06,
2008
+ "loss": 2.5973,
2009
  "step": 1555
2010
  },
2011
  {
2012
  "epoch": 16.25,
2013
+ "learning_rate": 1.0332332985438248e-06,
2014
+ "loss": 2.4671,
2015
  "step": 1560
2016
  },
2017
  {
2018
  "epoch": 16.3,
2019
+ "learning_rate": 1.0057285831737739e-06,
2020
+ "loss": 2.5794,
2021
  "step": 1565
2022
  },
2023
  {
2024
  "epoch": 16.35,
2025
+ "learning_rate": 9.785539232167296e-07,
2026
+ "loss": 2.6019,
2027
  "step": 1570
2028
  },
2029
  {
2030
  "epoch": 16.41,
2031
+ "learning_rate": 9.517115641744795e-07,
2032
+ "loss": 2.3944,
2033
  "step": 1575
2034
  },
2035
  {
2036
  "epoch": 16.46,
2037
+ "learning_rate": 9.252037240900618e-07,
2038
+ "loss": 2.6003,
2039
  "step": 1580
2040
  },
2041
  {
2042
  "epoch": 16.51,
2043
+ "learning_rate": 8.990325933644717e-07,
2044
+ "loss": 2.5742,
2045
  "step": 1585
2046
  },
2047
  {
2048
  "epoch": 16.56,
2049
+ "learning_rate": 8.732003345756812e-07,
2050
+ "loss": 2.5666,
2051
  "step": 1590
2052
  },
2053
  {
2054
  "epoch": 16.61,
2055
+ "learning_rate": 8.47709082299924e-07,
2056
+ "loss": 2.5089,
2057
  "step": 1595
2058
  },
2059
  {
2060
  "epoch": 16.67,
2061
+ "learning_rate": 8.225609429353187e-07,
2062
+ "loss": 2.6752,
2063
  "step": 1600
2064
  },
2065
  {
2066
  "epoch": 16.72,
2067
+ "learning_rate": 7.977579945278091e-07,
2068
+ "loss": 2.6726,
2069
  "step": 1605
2070
  },
2071
  {
2072
  "epoch": 16.77,
2073
+ "learning_rate": 7.733022865994599e-07,
2074
+ "loss": 2.5055,
2075
  "step": 1610
2076
  },
2077
  {
2078
  "epoch": 16.82,
2079
+ "learning_rate": 7.491958399790827e-07,
2080
+ "loss": 2.564,
2081
  "step": 1615
2082
  },
2083
  {
2084
  "epoch": 16.88,
2085
+ "learning_rate": 7.254406466352682e-07,
2086
+ "loss": 2.5455,
2087
  "step": 1620
2088
  },
2089
  {
2090
  "epoch": 16.93,
2091
+ "learning_rate": 7.020386695117732e-07,
2092
+ "loss": 2.5649,
2093
  "step": 1625
2094
  },
2095
  {
2096
  "epoch": 16.98,
2097
+ "learning_rate": 6.789918423653285e-07,
2098
+ "loss": 2.5972,
2099
  "step": 1630
2100
  },
2101
  {
2102
  "epoch": 17.0,
2103
+ "eval_loss": 2.5676934719085693,
2104
+ "eval_runtime": 164.8301,
2105
+ "eval_samples_per_second": 4.647,
2106
+ "eval_steps_per_second": 0.582,
2107
  "step": 1632
2108
  },
2109
  {
2110
  "epoch": 17.03,
2111
+ "learning_rate": 6.563020696058387e-07,
2112
+ "loss": 2.5949,
2113
  "step": 1635
2114
  },
2115
  {
2116
  "epoch": 17.08,
2117
+ "learning_rate": 6.339712261390213e-07,
2118
+ "loss": 2.5258,
2119
  "step": 1640
2120
  },
2121
  {
2122
  "epoch": 17.14,
2123
+ "learning_rate": 6.120011572114803e-07,
2124
+ "loss": 2.6606,
2125
  "step": 1645
2126
  },
2127
  {
2128
  "epoch": 17.19,
2129
+ "learning_rate": 5.903936782582253e-07,
2130
+ "loss": 2.5174,
2131
  "step": 1650
2132
  },
2133
  {
2134
  "epoch": 17.24,
2135
+ "learning_rate": 5.691505747526633e-07,
2136
+ "loss": 2.6218,
2137
  "step": 1655
2138
  },
2139
  {
2140
  "epoch": 17.29,
2141
+ "learning_rate": 5.482736020590551e-07,
2142
+ "loss": 2.5367,
2143
  "step": 1660
2144
  },
2145
  {
2146
  "epoch": 17.34,
2147
+ "learning_rate": 5.277644852874742e-07,
2148
+ "loss": 2.6172,
2149
  "step": 1665
2150
  },
2151
  {
2152
  "epoch": 17.4,
2153
+ "learning_rate": 5.076249191512461e-07,
2154
+ "loss": 2.5443,
2155
  "step": 1670
2156
  },
2157
  {
2158
  "epoch": 17.45,
2159
+ "learning_rate": 4.878565678269204e-07,
2160
+ "loss": 2.406,
2161
  "step": 1675
2162
  },
2163
  {
2164
  "epoch": 17.5,
2165
+ "learning_rate": 4.6846106481675035e-07,
2166
+ "loss": 2.6111,
2167
  "step": 1680
2168
  },
2169
  {
2170
  "epoch": 17.55,
2171
+ "learning_rate": 4.494400128137144e-07,
2172
+ "loss": 2.5944,
2173
  "step": 1685
2174
  },
2175
  {
2176
  "epoch": 17.6,
2177
+ "learning_rate": 4.3079498356908446e-07,
2178
+ "loss": 2.589,
2179
  "step": 1690
2180
  },
2181
  {
2182
  "epoch": 17.66,
2183
+ "learning_rate": 4.1252751776254373e-07,
2184
+ "loss": 2.6432,
2185
  "step": 1695
2186
  },
2187
  {
2188
  "epoch": 17.71,
2189
+ "learning_rate": 3.946391248748821e-07,
2190
+ "loss": 2.6341,
2191
  "step": 1700
2192
  },
2193
  {
2194
  "epoch": 17.76,
2195
+ "learning_rate": 3.7713128306326286e-07,
2196
+ "loss": 2.4919,
2197
  "step": 1705
2198
  },
2199
  {
2200
  "epoch": 17.81,
2201
+ "learning_rate": 3.600054390390778e-07,
2202
+ "loss": 2.4727,
2203
  "step": 1710
2204
  },
2205
  {
2206
  "epoch": 17.86,
2207
+ "learning_rate": 3.4326300794840174e-07,
2208
+ "loss": 2.5534,
2209
  "step": 1715
2210
  },
2211
  {
2212
  "epoch": 17.92,
2213
+ "learning_rate": 3.269053732550581e-07,
2214
+ "loss": 2.6066,
2215
  "step": 1720
2216
  },
2217
  {
2218
  "epoch": 17.97,
2219
+ "learning_rate": 3.1093388662630173e-07,
2220
+ "loss": 2.5361,
2221
  "step": 1725
2222
  },
2223
  {
2224
  "epoch": 18.0,
2225
+ "eval_loss": 2.567695140838623,
2226
+ "eval_runtime": 164.9524,
2227
+ "eval_samples_per_second": 4.644,
2228
+ "eval_steps_per_second": 0.582,
2229
  "step": 1728
2230
  },
2231
  {
2232
  "epoch": 18.02,
2233
+ "learning_rate": 2.9534986782112306e-07,
2234
+ "loss": 2.608,
2235
  "step": 1730
2236
  },
2237
  {
2238
  "epoch": 18.07,
2239
+ "learning_rate": 2.80154604581197e-07,
2240
+ "loss": 2.5554,
2241
  "step": 1735
2242
  },
2243
  {
2244
  "epoch": 18.12,
2245
+ "learning_rate": 2.653493525244721e-07,
2246
+ "loss": 2.6435,
2247
  "step": 1740
2248
  },
2249
  {
2250
  "epoch": 18.18,
2251
+ "learning_rate": 2.5093533504141786e-07,
2252
+ "loss": 2.5905,
2253
  "step": 1745
2254
  },
2255
  {
2256
  "epoch": 18.23,
2257
+ "learning_rate": 2.3691374319393168e-07,
2258
+ "loss": 2.5416,
2259
  "step": 1750
2260
  },
2261
  {
2262
  "epoch": 18.28,
2263
+ "learning_rate": 2.232857356169199e-07,
2264
+ "loss": 2.5946,
2265
  "step": 1755
2266
  },
2267
  {
2268
  "epoch": 18.33,
2269
+ "learning_rate": 2.1005243842255552e-07,
2270
+ "loss": 2.5916,
2271
  "step": 1760
2272
  },
2273
  {
2274
  "epoch": 18.39,
2275
+ "learning_rate": 1.972149451072297e-07,
2276
+ "loss": 2.5382,
2277
  "step": 1765
2278
  },
2279
  {
2280
  "epoch": 18.44,
2281
+ "learning_rate": 1.8477431646118648e-07,
2282
+ "loss": 2.6638,
2283
  "step": 1770
2284
  },
2285
  {
2286
  "epoch": 18.49,
2287
+ "learning_rate": 1.7273158048087434e-07,
2288
+ "loss": 2.5931,
2289
  "step": 1775
2290
  },
2291
  {
2292
  "epoch": 18.54,
2293
+ "learning_rate": 1.6108773228399543e-07,
2294
+ "loss": 2.5053,
2295
  "step": 1780
2296
  },
2297
  {
2298
  "epoch": 18.59,
2299
+ "learning_rate": 1.4984373402728014e-07,
2300
+ "loss": 2.5557,
2301
  "step": 1785
2302
  },
2303
  {
2304
  "epoch": 18.65,
2305
+ "learning_rate": 1.3900051482698074e-07,
2306
+ "loss": 2.5686,
2307
  "step": 1790
2308
  },
2309
  {
2310
  "epoch": 18.7,
2311
+ "learning_rate": 1.2855897068209555e-07,
2312
+ "loss": 2.594,
2313
  "step": 1795
2314
  },
2315
  {
2316
  "epoch": 18.75,
2317
+ "learning_rate": 1.185199644003332e-07,
2318
+ "loss": 2.4774,
2319
  "step": 1800
2320
  },
2321
  {
2322
  "epoch": 18.8,
2323
+ "learning_rate": 1.0888432552681405e-07,
2324
+ "loss": 2.6601,
2325
  "step": 1805
2326
  },
2327
  {
2328
  "epoch": 18.85,
2329
+ "learning_rate": 9.965285027552452e-08,
2330
+ "loss": 2.5226,
2331
  "step": 1810
2332
  },
2333
  {
2334
  "epoch": 18.91,
2335
+ "learning_rate": 9.082630146352356e-08,
2336
+ "loss": 2.4359,
2337
  "step": 1815
2338
  },
2339
  {
2340
  "epoch": 18.96,
2341
+ "learning_rate": 8.240540844791145e-08,
2342
+ "loss": 2.5317,
2343
  "step": 1820
2344
  },
2345
  {
2346
  "epoch": 19.0,
2347
+ "eval_loss": 2.567706346511841,
2348
+ "eval_runtime": 164.8443,
2349
+ "eval_samples_per_second": 4.647,
2350
+ "eval_steps_per_second": 0.582,
2351
  "step": 1824
2352
  },
2353
  {
2354
  "epoch": 19.01,
2355
+ "learning_rate": 7.439086706555743e-08,
2356
+ "loss": 2.5861,
2357
  "step": 1825
2358
  },
2359
  {
2360
  "epoch": 19.06,
2361
+ "learning_rate": 6.678333957560513e-08,
2362
+ "loss": 2.6277,
2363
  "step": 1830
2364
  },
2365
  {
2366
  "epoch": 19.11,
2367
+ "learning_rate": 5.958345460474635e-08,
2368
+ "loss": 2.533,
2369
  "step": 1835
2370
  },
2371
  {
2372
  "epoch": 19.17,
2373
+ "learning_rate": 5.279180709527765e-08,
2374
+ "loss": 2.5901,
2375
  "step": 1840
2376
  },
2377
  {
2378
  "epoch": 19.22,
2379
+ "learning_rate": 4.640895825593683e-08,
2380
+ "loss": 2.6011,
2381
  "step": 1845
2382
  },
2383
  {
2384
  "epoch": 19.27,
2385
+ "learning_rate": 4.0435435515532304e-08,
2386
+ "loss": 2.6357,
2387
  "step": 1850
2388
  },
2389
  {
2390
  "epoch": 19.32,
2391
+ "learning_rate": 3.487173247935627e-08,
2392
+ "loss": 2.5523,
2393
  "step": 1855
2394
  },
2395
  {
2396
  "epoch": 19.38,
2397
+ "learning_rate": 2.971830888840177e-08,
2398
+ "loss": 2.5988,
2399
  "step": 1860
2400
  },
2401
  {
2402
  "epoch": 19.43,
2403
+ "learning_rate": 2.4975590581369778e-08,
2404
+ "loss": 2.6604,
2405
  "step": 1865
2406
  },
2407
  {
2408
  "epoch": 19.48,
2409
+ "learning_rate": 2.0643969459482326e-08,
2410
+ "loss": 2.5515,
2411
  "step": 1870
2412
  },
2413
  {
2414
  "epoch": 19.53,
2415
+ "learning_rate": 1.6723803454098408e-08,
2416
+ "loss": 2.6021,
2417
  "step": 1875
2418
  },
2419
  {
2420
  "epoch": 19.58,
2421
+ "learning_rate": 1.3215416497138756e-08,
2422
+ "loss": 2.5515,
2423
  "step": 1880
2424
  },
2425
  {
2426
  "epoch": 19.64,
2427
+ "learning_rate": 1.0119098494316693e-08,
2428
+ "loss": 2.4397,
2429
  "step": 1885
2430
  },
2431
  {
2432
  "epoch": 19.69,
2433
+ "learning_rate": 7.43510530118452e-09,
2434
+ "loss": 2.5337,
2435
  "step": 1890
2436
  },
2437
  {
2438
  "epoch": 19.74,
2439
+ "learning_rate": 5.163658701989316e-09,
2440
+ "loss": 2.4587,
2441
  "step": 1895
2442
  },
2443
  {
2444
  "epoch": 19.79,
2445
+ "learning_rate": 3.304946391349817e-09,
2446
+ "loss": 2.5766,
2447
  "step": 1900
2448
  },
2449
  {
2450
  "epoch": 19.84,
2451
+ "learning_rate": 1.8591219587416053e-09,
2452
+ "loss": 2.4665,
2453
  "step": 1905
2454
  },
2455
  {
2456
  "epoch": 19.9,
2457
+ "learning_rate": 8.26304875812256e-10,
2458
+ "loss": 2.6275,
2459
  "step": 1910
2460
  },
2461
  {
2462
  "epoch": 19.95,
2463
+ "learning_rate": 2.0658048650257223e-10,
2464
+ "loss": 2.5522,
2465
  "step": 1915
2466
  },
2467
  {
2468
  "epoch": 20.0,
2469
  "learning_rate": 0.0,
2470
+ "loss": 2.632,
2471
  "step": 1920
2472
  },
2473
  {
2474
  "epoch": 20.0,
2475
+ "eval_loss": 2.5677051544189453,
2476
+ "eval_runtime": 164.798,
2477
+ "eval_samples_per_second": 4.648,
2478
+ "eval_steps_per_second": 0.583,
2479
  "step": 1920
2480
  },
2481
  {
2482
  "epoch": 20.0,
2483
  "step": 1920,
2484
  "total_flos": 1.0984887148766822e+18,
2485
+ "train_loss": 0.428011018037796,
2486
+ "train_runtime": 2530.9924,
2487
+ "train_samples_per_second": 6.053,
2488
+ "train_steps_per_second": 0.759
2489
  }
2490
  ],
2491
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b43b3577f076f1aebc279513d44b66365d46750508d7189a2c78835d05f6f6c
3
  size 4728
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fcd2d34fa2a07b0718ff1754f0d2b0590e8818ecf133c21ffda74cf3db65e8b
3
  size 4728