roy commited on
Commit
f63bba1
·
1 Parent(s): 59e423f

gogamza: summarization

Browse files
all_results.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
  "epoch": 10.0,
3
  "eval_gen_len": 20.0,
4
- "eval_loss": 1.933449387550354,
5
- "eval_rouge1": 22.3137,
6
- "eval_rouge2": 5.8376,
7
- "eval_rougeL": 22.1264,
8
- "eval_rougeLsum": 22.1274,
9
- "eval_runtime": 163.5249,
10
  "eval_samples": 7008,
11
- "eval_samples_per_second": 42.856,
12
- "eval_steps_per_second": 2.678,
13
- "train_loss": 1.5038130920392954,
14
- "train_runtime": 13804.3068,
15
  "train_samples": 56760,
16
- "train_samples_per_second": 41.118,
17
- "train_steps_per_second": 2.57
18
  }
 
1
  {
2
  "epoch": 10.0,
3
  "eval_gen_len": 20.0,
4
+ "eval_loss": 1.9633301496505737,
5
+ "eval_rouge1": 22.6551,
6
+ "eval_rouge2": 6.1328,
7
+ "eval_rougeL": 22.457,
8
+ "eval_rougeLsum": 22.4619,
9
+ "eval_runtime": 163.8551,
10
  "eval_samples": 7008,
11
+ "eval_samples_per_second": 42.77,
12
+ "eval_steps_per_second": 2.673,
13
+ "train_loss": 1.581618417182931,
14
+ "train_runtime": 14068.2834,
15
  "train_samples": 56760,
16
+ "train_samples_per_second": 40.346,
17
+ "train_steps_per_second": 2.522
18
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "gogamza/kobart-base-v2",
3
  "activation_dropout": 0.0,
4
  "activation_function": "gelu",
5
  "add_bias_logits": false,
@@ -9,7 +9,7 @@
9
  ],
10
  "attention_dropout": 0.0,
11
  "author": "Heewon Jeon([email protected])",
12
- "bos_token_id": 1,
13
  "classif_dropout": 0.1,
14
  "classifier_dropout": 0.1,
15
  "d_model": 768,
@@ -17,7 +17,7 @@
17
  "decoder_ffn_dim": 3072,
18
  "decoder_layerdrop": 0.0,
19
  "decoder_layers": 6,
20
- "decoder_start_token_id": 1,
21
  "do_blenderbot_90_layernorm": false,
22
  "dropout": 0.1,
23
  "encoder_attention_heads": 16,
@@ -27,15 +27,13 @@
27
  "eos_token_id": 1,
28
  "extra_pos_embeddings": 2,
29
  "force_bos_token_to_be_generated": false,
30
- "forced_eos_token_id": 1,
31
- "gradient_checkpointing": false,
32
  "id2label": {
33
  "0": "NEGATIVE",
34
  "1": "POSITIVE"
35
  },
36
  "init_std": 0.02,
37
  "is_encoder_decoder": true,
38
- "kobart_version": 2.0,
39
  "label2id": {
40
  "NEGATIVE": 0,
41
  "POSITIVE": 1
@@ -48,7 +46,6 @@
48
  "pad_token_id": 3,
49
  "scale_embedding": false,
50
  "static_position_embeddings": false,
51
- "tokenizer_class": "PreTrainedTokenizerFast",
52
  "torch_dtype": "float32",
53
  "transformers_version": "4.25.0",
54
  "use_cache": true,
 
1
  {
2
+ "_name_or_path": "gogamza/kobart-summarization",
3
  "activation_dropout": 0.0,
4
  "activation_function": "gelu",
5
  "add_bias_logits": false,
 
9
  ],
10
  "attention_dropout": 0.0,
11
  "author": "Heewon Jeon([email protected])",
12
+ "bos_token_id": 0,
13
  "classif_dropout": 0.1,
14
  "classifier_dropout": 0.1,
15
  "d_model": 768,
 
17
  "decoder_ffn_dim": 3072,
18
  "decoder_layerdrop": 0.0,
19
  "decoder_layers": 6,
20
+ "decoder_start_token_id": 2,
21
  "do_blenderbot_90_layernorm": false,
22
  "dropout": 0.1,
23
  "encoder_attention_heads": 16,
 
27
  "eos_token_id": 1,
28
  "extra_pos_embeddings": 2,
29
  "force_bos_token_to_be_generated": false,
30
+ "forced_eos_token_id": 2,
 
31
  "id2label": {
32
  "0": "NEGATIVE",
33
  "1": "POSITIVE"
34
  },
35
  "init_std": 0.02,
36
  "is_encoder_decoder": true,
 
37
  "label2id": {
38
  "NEGATIVE": 0,
39
  "POSITIVE": 1
 
46
  "pad_token_id": 3,
47
  "scale_embedding": false,
48
  "static_position_embeddings": false,
 
49
  "torch_dtype": "float32",
50
  "transformers_version": "4.25.0",
51
  "use_cache": true,
eval_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
  "eval_gen_len": 20.0,
4
- "eval_loss": 1.933449387550354,
5
- "eval_rouge1": 22.3137,
6
- "eval_rouge2": 5.8376,
7
- "eval_rougeL": 22.1264,
8
- "eval_rougeLsum": 22.1274,
9
- "eval_runtime": 163.5249,
10
  "eval_samples": 7008,
11
- "eval_samples_per_second": 42.856,
12
- "eval_steps_per_second": 2.678
13
  }
 
1
  {
2
  "epoch": 10.0,
3
  "eval_gen_len": 20.0,
4
+ "eval_loss": 1.9633301496505737,
5
+ "eval_rouge1": 22.6551,
6
+ "eval_rouge2": 6.1328,
7
+ "eval_rougeL": 22.457,
8
+ "eval_rougeLsum": 22.4619,
9
+ "eval_runtime": 163.8551,
10
  "eval_samples": 7008,
11
+ "eval_samples_per_second": 42.77,
12
+ "eval_steps_per_second": 2.673
13
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50b60bc88908f3b3fab51f1e924033f49a8c9055d01c6148a53ad4cf0dc47454
3
  size 495652819
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adeb720df850151ba963293c94b7cf8baa672095f397cf43b7a36c00d531d665
3
  size 495652819
special_tokens_map.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "bos_token": "</s>",
3
  "eos_token": "</s>",
4
  "mask_token": "<mask>",
5
  "pad_token": "<pad>",
 
1
  {
2
+ "bos_token": "<s>",
3
  "eos_token": "</s>",
4
  "mask_token": "<mask>",
5
  "pad_token": "<pad>",
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "model_max_length": 1000000000000000019884624838656,
3
- "name_or_path": "gogamza/kobart-base-v2",
4
- "special_tokens_map_file": "/opt/ml/input/Summarization/checkpoint-34000/models--gogamza--kobart-base-v2/snapshots/f9f2ec35d3c32a1ecc7a3281f9626b7ec1913fed/special_tokens_map.json",
5
  "tokenizer_class": "PreTrainedTokenizerFast",
6
  "use_fast": true
7
  }
 
1
  {
2
  "model_max_length": 1000000000000000019884624838656,
3
+ "name_or_path": "gogamza/kobart-summarization",
4
+ "special_tokens_map_file": "/opt/ml/.cache/huggingface/hub/models--gogamza--kobart-summarization/snapshots/8a63d6913edc0e16a902e3fa8b688a134f0dd776/special_tokens_map.json",
5
  "tokenizer_class": "PreTrainedTokenizerFast",
6
  "use_fast": true
7
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "train_loss": 1.5038130920392954,
4
- "train_runtime": 13804.3068,
5
  "train_samples": 56760,
6
- "train_samples_per_second": 41.118,
7
- "train_steps_per_second": 2.57
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "train_loss": 1.581618417182931,
4
+ "train_runtime": 14068.2834,
5
  "train_samples": 56760,
6
+ "train_samples_per_second": 40.346,
7
+ "train_steps_per_second": 2.522
8
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 1.933449387550354,
3
- "best_model_checkpoint": "/opt/ml/input/Summarization/checkpoint-10000",
4
  "epoch": 10.0,
5
  "global_step": 35480,
6
  "is_hyper_param_search": false,
@@ -9,2357 +9,2357 @@
9
  "log_history": [
10
  {
11
  "epoch": 0.03,
12
- "learning_rate": 9.603605964719648e-06,
13
- "loss": 2.9628,
14
  "step": 100
15
  },
16
  {
17
  "epoch": 0.06,
18
- "learning_rate": 2.651882655109716e-05,
19
- "loss": 2.3592,
20
  "step": 200
21
  },
22
  {
23
  "epoch": 0.08,
24
- "learning_rate": 4.9653319145399344e-05,
25
- "loss": 2.2458,
26
  "step": 300
27
  },
28
  {
29
  "epoch": 0.11,
30
- "learning_rate": 3.5272614557174996e-05,
31
- "loss": 2.2573,
32
  "step": 400
33
  },
34
  {
35
  "epoch": 0.14,
36
- "learning_rate": 1.1805441557799242e-05,
37
- "loss": 2.2079,
38
  "step": 500
39
  },
40
  {
41
  "epoch": 0.17,
42
- "learning_rate": 1.1082060287457063e-07,
43
- "loss": 2.1759,
44
  "step": 600
45
  },
46
  {
47
  "epoch": 0.2,
48
- "learning_rate": 4.722474358076239e-05,
49
- "loss": 2.1902,
50
  "step": 700
51
  },
52
  {
53
  "epoch": 0.23,
54
- "learning_rate": 3.921546108807772e-05,
55
- "loss": 2.1833,
56
  "step": 800
57
  },
58
  {
59
  "epoch": 0.25,
60
- "learning_rate": 2.786089912498746e-05,
61
- "loss": 2.1655,
62
  "step": 900
63
  },
64
  {
65
  "epoch": 0.28,
66
- "learning_rate": 1.5842522332451548e-05,
67
- "loss": 2.1736,
68
  "step": 1000
69
  },
70
  {
71
  "epoch": 0.31,
72
- "learning_rate": 5.998560182965376e-06,
73
- "loss": 2.1269,
74
  "step": 1100
75
  },
76
  {
77
  "epoch": 0.34,
78
- "learning_rate": 6.537378853569256e-07,
79
- "loss": 2.1282,
80
  "step": 1200
81
  },
82
  {
83
  "epoch": 0.37,
84
- "learning_rate": 4.975624045467085e-05,
85
- "loss": 2.1398,
86
  "step": 1300
87
  },
88
  {
89
  "epoch": 0.39,
90
- "learning_rate": 4.817047144722464e-05,
91
- "loss": 2.1608,
92
  "step": 1400
93
  },
94
  {
95
  "epoch": 0.42,
96
- "learning_rate": 4.519891933012913e-05,
97
- "loss": 2.1441,
98
  "step": 1500
99
  },
100
  {
101
  "epoch": 0.45,
102
- "learning_rate": 4.1019691489417214e-05,
103
- "loss": 2.1726,
104
  "step": 1600
105
  },
106
  {
107
  "epoch": 0.48,
108
- "learning_rate": 3.588328036709013e-05,
109
- "loss": 2.08,
110
  "step": 1700
111
  },
112
  {
113
  "epoch": 0.51,
114
- "learning_rate": 3.009754957197535e-05,
115
- "loss": 2.1441,
116
  "step": 1800
117
  },
118
  {
119
  "epoch": 0.54,
120
- "learning_rate": 2.4009281306639542e-05,
121
- "loss": 2.121,
122
  "step": 1900
123
  },
124
  {
125
  "epoch": 0.56,
126
- "learning_rate": 1.798339111133779e-05,
127
- "loss": 2.115,
128
  "step": 2000
129
  },
130
  {
131
  "epoch": 0.56,
132
  "eval_gen_len": 20.0,
133
- "eval_loss": 1.985213279724121,
134
- "eval_rouge1": 21.6401,
135
- "eval_rouge2": 5.6588,
136
- "eval_rougeL": 21.4503,
137
- "eval_rougeLsum": 21.4438,
138
- "eval_runtime": 165.0934,
139
- "eval_samples_per_second": 42.449,
140
- "eval_steps_per_second": 2.653,
141
  "step": 2000
142
  },
143
  {
144
  "epoch": 0.59,
145
- "learning_rate": 1.2381055741151043e-05,
146
- "loss": 2.0857,
147
  "step": 2100
148
  },
149
  {
150
  "epoch": 0.62,
151
- "learning_rate": 7.538065136536039e-06,
152
- "loss": 2.0659,
153
  "step": 2200
154
  },
155
  {
156
  "epoch": 0.65,
157
- "learning_rate": 3.7446960159146312e-06,
158
- "loss": 2.0782,
159
  "step": 2300
160
  },
161
  {
162
  "epoch": 0.68,
163
- "learning_rate": 1.2283134167178716e-06,
164
- "loss": 2.0432,
165
  "step": 2400
166
  },
167
  {
168
  "epoch": 0.7,
169
- "learning_rate": 1.397430049077326e-07,
170
- "loss": 2.0695,
171
  "step": 2500
172
  },
173
  {
174
  "epoch": 0.73,
175
- "learning_rate": 4.9888693982206995e-05,
176
- "loss": 2.1082,
177
  "step": 2600
178
  },
179
  {
180
  "epoch": 0.76,
181
- "learning_rate": 4.9413728617356034e-05,
182
- "loss": 2.1176,
183
  "step": 2700
184
  },
185
  {
186
  "epoch": 0.79,
187
- "learning_rate": 4.857231009627857e-05,
188
- "loss": 2.1424,
189
  "step": 2800
190
  },
191
  {
192
  "epoch": 0.82,
193
- "learning_rate": 4.737709413672616e-05,
194
- "loss": 2.1241,
195
  "step": 2900
196
  },
197
  {
198
  "epoch": 0.85,
199
- "learning_rate": 4.5846057899240565e-05,
200
- "loss": 2.115,
201
  "step": 3000
202
  },
203
  {
204
  "epoch": 0.87,
205
- "learning_rate": 4.400222959392605e-05,
206
- "loss": 2.0883,
207
  "step": 3100
208
  },
209
  {
210
  "epoch": 0.9,
211
- "learning_rate": 4.187334211473467e-05,
212
- "loss": 2.1088,
213
  "step": 3200
214
  },
215
  {
216
  "epoch": 0.93,
217
- "learning_rate": 3.949141591092745e-05,
218
- "loss": 2.0855,
219
  "step": 3300
220
  },
221
  {
222
  "epoch": 0.96,
223
- "learning_rate": 3.6892277369712686e-05,
224
- "loss": 2.0811,
225
  "step": 3400
226
  },
227
  {
228
  "epoch": 0.99,
229
- "learning_rate": 3.411501995403511e-05,
230
- "loss": 2.0632,
231
  "step": 3500
232
  },
233
  {
234
  "epoch": 1.01,
235
- "learning_rate": 3.1201416200504895e-05,
236
- "loss": 1.9746,
237
  "step": 3600
238
  },
239
  {
240
  "epoch": 1.04,
241
- "learning_rate": 2.8195289421565e-05,
242
- "loss": 1.8776,
243
  "step": 3700
244
  },
245
  {
246
  "epoch": 1.07,
247
- "learning_rate": 2.5141854562081277e-05,
248
- "loss": 1.878,
249
  "step": 3800
250
  },
251
  {
252
  "epoch": 1.1,
253
- "learning_rate": 2.208703812448537e-05,
254
- "loss": 1.9068,
255
  "step": 3900
256
  },
257
  {
258
  "epoch": 1.13,
259
- "learning_rate": 1.9076787391429783e-05,
260
- "loss": 1.8943,
261
  "step": 4000
262
  },
263
  {
264
  "epoch": 1.13,
265
  "eval_gen_len": 20.0,
266
- "eval_loss": 1.964289665222168,
267
- "eval_rouge1": 21.3418,
268
- "eval_rouge2": 5.6223,
269
- "eval_rougeL": 21.1288,
270
- "eval_rougeLsum": 21.1143,
271
- "eval_runtime": 162.0352,
272
- "eval_samples_per_second": 43.25,
273
- "eval_steps_per_second": 2.703,
274
  "step": 4000
275
  },
276
  {
277
  "epoch": 1.16,
278
- "learning_rate": 1.6156379335888783e-05,
279
- "loss": 1.8808,
280
  "step": 4100
281
  },
282
  {
283
  "epoch": 1.18,
284
- "learning_rate": 1.3369739613341079e-05,
285
- "loss": 1.8741,
286
  "step": 4200
287
  },
288
  {
289
  "epoch": 1.21,
290
- "learning_rate": 1.0758781879025238e-05,
291
- "loss": 1.86,
292
  "step": 4300
293
  },
294
  {
295
  "epoch": 1.24,
296
- "learning_rate": 8.362777367551548e-06,
297
- "loss": 1.8857,
298
  "step": 4400
299
  },
300
  {
301
  "epoch": 1.27,
302
- "learning_rate": 6.2177642169797586e-06,
303
- "loss": 1.8757,
304
  "step": 4500
305
  },
306
  {
307
  "epoch": 1.3,
308
- "learning_rate": 4.356005421678249e-06,
309
- "loss": 1.8509,
310
  "step": 4600
311
  },
312
  {
313
  "epoch": 1.32,
314
- "learning_rate": 2.8055035668582452e-06,
315
- "loss": 1.8304,
316
  "step": 4700
317
  },
318
  {
319
  "epoch": 1.35,
320
- "learning_rate": 1.589579643627363e-06,
321
- "loss": 1.8435,
322
  "step": 4800
323
  },
324
  {
325
  "epoch": 1.38,
326
- "learning_rate": 7.265222795761219e-07,
327
- "loss": 1.888,
328
  "step": 4900
329
  },
330
  {
331
  "epoch": 1.41,
332
- "learning_rate": 2.2931266079587017e-07,
333
- "loss": 1.8392,
334
  "step": 5000
335
  },
336
  {
337
  "epoch": 1.44,
338
- "learning_rate": 4.99986426423885e-05,
339
- "loss": 1.8821,
340
  "step": 5100
341
  },
342
  {
343
  "epoch": 1.47,
344
- "learning_rate": 4.9935733363756125e-05,
345
- "loss": 1.9121,
346
  "step": 5200
347
  },
348
  {
349
  "epoch": 1.49,
350
- "learning_rate": 4.9779159803948603e-05,
351
- "loss": 1.8862,
352
  "step": 5300
353
  },
354
  {
355
  "epoch": 1.52,
356
- "learning_rate": 4.952951127049067e-05,
357
- "loss": 1.9367,
358
  "step": 5400
359
  },
360
  {
361
  "epoch": 1.55,
362
- "learning_rate": 4.9187727384091566e-05,
363
- "loss": 1.9389,
364
  "step": 5500
365
  },
366
  {
367
  "epoch": 1.58,
368
- "learning_rate": 4.875509454212479e-05,
369
- "loss": 1.917,
370
  "step": 5600
371
  },
372
  {
373
  "epoch": 1.61,
374
- "learning_rate": 4.823324107691887e-05,
375
- "loss": 1.9325,
376
  "step": 5700
377
  },
378
  {
379
  "epoch": 1.63,
380
- "learning_rate": 4.762413112708219e-05,
381
- "loss": 1.939,
382
  "step": 5800
383
  },
384
  {
385
  "epoch": 1.66,
386
- "learning_rate": 4.6930057244928707e-05,
387
- "loss": 1.9489,
388
  "step": 5900
389
  },
390
  {
391
  "epoch": 1.69,
392
- "learning_rate": 4.6153631767828776e-05,
393
- "loss": 1.9567,
394
  "step": 6000
395
  },
396
  {
397
  "epoch": 1.69,
398
  "eval_gen_len": 20.0,
399
- "eval_loss": 1.9752851724624634,
400
- "eval_rouge1": 21.8045,
401
- "eval_rouge2": 5.6422,
402
- "eval_rougeL": 21.628,
403
- "eval_rougeLsum": 21.6131,
404
- "eval_runtime": 162.1464,
405
- "eval_samples_per_second": 43.22,
406
- "eval_steps_per_second": 2.701,
407
  "step": 6000
408
  },
409
  {
410
  "epoch": 1.72,
411
- "learning_rate": 4.529777698596109e-05,
412
- "loss": 1.9347,
413
  "step": 6100
414
  },
415
  {
416
  "epoch": 1.75,
417
- "learning_rate": 4.4365714143472324e-05,
418
- "loss": 1.9325,
419
  "step": 6200
420
  },
421
  {
422
  "epoch": 1.78,
423
- "learning_rate": 4.3360951314441554e-05,
424
- "loss": 1.9184,
425
  "step": 6300
426
  },
427
  {
428
  "epoch": 1.8,
429
- "learning_rate": 4.228727019928149e-05,
430
- "loss": 1.919,
431
  "step": 6400
432
  },
433
  {
434
  "epoch": 1.83,
435
- "learning_rate": 4.114871189127209e-05,
436
- "loss": 1.9375,
437
  "step": 6500
438
  },
439
  {
440
  "epoch": 1.86,
441
- "learning_rate": 3.994956166679789e-05,
442
- "loss": 1.9609,
443
  "step": 6600
444
  },
445
  {
446
  "epoch": 1.89,
447
- "learning_rate": 3.869433285653533e-05,
448
- "loss": 1.9409,
449
  "step": 6700
450
  },
451
  {
452
  "epoch": 1.92,
453
- "learning_rate": 3.73877498582952e-05,
454
- "loss": 1.9153,
455
  "step": 6800
456
  },
457
  {
458
  "epoch": 1.94,
459
- "learning_rate": 3.603473035545605e-05,
460
- "loss": 1.918,
461
  "step": 6900
462
  },
463
  {
464
  "epoch": 1.97,
465
- "learning_rate": 3.4640366807914365e-05,
466
- "loss": 1.9425,
467
  "step": 7000
468
  },
469
  {
470
  "epoch": 2.0,
471
- "learning_rate": 3.320990728521537e-05,
472
- "loss": 1.918,
473
  "step": 7100
474
  },
475
  {
476
  "epoch": 2.03,
477
- "learning_rate": 3.174873571400432e-05,
478
- "loss": 1.7143,
479
  "step": 7200
480
  },
481
  {
482
  "epoch": 2.06,
483
- "learning_rate": 3.0262351614142278e-05,
484
- "loss": 1.7251,
485
  "step": 7300
486
  },
487
  {
488
  "epoch": 2.09,
489
- "learning_rate": 2.875634939975517e-05,
490
- "loss": 1.7188,
491
  "step": 7400
492
  },
493
  {
494
  "epoch": 2.11,
495
- "learning_rate": 2.7236397323122172e-05,
496
- "loss": 1.732,
497
  "step": 7500
498
  },
499
  {
500
  "epoch": 2.14,
501
- "learning_rate": 2.570821614065411e-05,
502
- "loss": 1.7565,
503
  "step": 7600
504
  },
505
  {
506
  "epoch": 2.17,
507
- "learning_rate": 2.4177557581258026e-05,
508
- "loss": 1.7159,
509
  "step": 7700
510
  },
511
  {
512
  "epoch": 2.2,
513
- "learning_rate": 2.2650182698128346e-05,
514
- "loss": 1.7191,
515
  "step": 7800
516
  },
517
  {
518
  "epoch": 2.23,
519
- "learning_rate": 2.1131840185443426e-05,
520
- "loss": 1.7215,
521
  "step": 7900
522
  },
523
  {
524
  "epoch": 2.25,
525
- "learning_rate": 1.962824474157885e-05,
526
- "loss": 1.7129,
527
  "step": 8000
528
  },
529
  {
530
  "epoch": 2.25,
531
  "eval_gen_len": 20.0,
532
- "eval_loss": 1.9584242105484009,
533
- "eval_rouge1": 22.117,
534
- "eval_rouge2": 5.7798,
535
- "eval_rougeL": 21.9114,
536
- "eval_rougeLsum": 21.8886,
537
- "eval_runtime": 162.6289,
538
- "eval_samples_per_second": 43.092,
539
- "eval_steps_per_second": 2.693,
540
  "step": 8000
541
  },
542
  {
543
  "epoch": 2.28,
544
- "learning_rate": 1.8145055560273016e-05,
545
- "loss": 1.6991,
546
  "step": 8100
547
  },
548
  {
549
  "epoch": 2.31,
550
- "learning_rate": 1.6687855030699798e-05,
551
- "loss": 1.7131,
552
  "step": 8200
553
  },
554
  {
555
  "epoch": 2.34,
556
- "learning_rate": 1.52621277266162e-05,
557
- "loss": 1.7091,
558
  "step": 8300
559
  },
560
  {
561
  "epoch": 2.37,
562
- "learning_rate": 1.3873239763665138e-05,
563
- "loss": 1.7208,
564
  "step": 8400
565
  },
566
  {
567
  "epoch": 2.4,
568
- "learning_rate": 1.2526418602527839e-05,
569
- "loss": 1.7373,
570
  "step": 8500
571
  },
572
  {
573
  "epoch": 2.42,
574
- "learning_rate": 1.1226733373942214e-05,
575
- "loss": 1.6905,
576
  "step": 8600
577
  },
578
  {
579
  "epoch": 2.45,
580
- "learning_rate": 9.979075799639335e-06,
581
- "loss": 1.7103,
582
  "step": 8700
583
  },
584
  {
585
  "epoch": 2.48,
586
- "learning_rate": 8.788141781007439e-06,
587
- "loss": 1.7381,
588
  "step": 8800
589
  },
590
  {
591
  "epoch": 2.51,
592
- "learning_rate": 7.658413724779386e-06,
593
- "loss": 1.7005,
594
  "step": 8900
595
  },
596
  {
597
  "epoch": 2.54,
598
- "learning_rate": 6.594143672265889e-06,
599
- "loss": 1.692,
600
  "step": 9000
601
  },
602
  {
603
  "epoch": 2.56,
604
- "learning_rate": 5.599337295632133e-06,
605
- "loss": 1.6929,
606
  "step": 9100
607
  },
608
  {
609
  "epoch": 2.59,
610
- "learning_rate": 4.677738821452162e-06,
611
- "loss": 1.7245,
612
  "step": 9200
613
  },
614
  {
615
  "epoch": 2.62,
616
- "learning_rate": 3.832816938285592e-06,
617
- "loss": 1.6818,
618
  "step": 9300
619
  },
620
  {
621
  "epoch": 2.65,
622
- "learning_rate": 3.0677517413170523e-06,
623
- "loss": 1.7,
624
  "step": 9400
625
  },
626
  {
627
  "epoch": 2.68,
628
- "learning_rate": 2.385422763195976e-06,
629
- "loss": 1.6961,
630
  "step": 9500
631
  },
632
  {
633
  "epoch": 2.71,
634
- "learning_rate": 1.7883981361259272e-06,
635
- "loss": 1.7067,
636
  "step": 9600
637
  },
638
  {
639
  "epoch": 2.73,
640
- "learning_rate": 1.2789249259948616e-06,
641
- "loss": 1.6975,
642
  "step": 9700
643
  },
644
  {
645
  "epoch": 2.76,
646
- "learning_rate": 8.589206749264998e-07,
647
- "loss": 1.7217,
648
  "step": 9800
649
  },
650
  {
651
  "epoch": 2.79,
652
- "learning_rate": 5.29966184084788e-07,
653
- "loss": 1.7213,
654
  "step": 9900
655
  },
656
  {
657
  "epoch": 2.82,
658
- "learning_rate": 2.932995638952263e-07,
659
- "loss": 1.7072,
660
  "step": 10000
661
  },
662
  {
663
  "epoch": 2.82,
664
  "eval_gen_len": 20.0,
665
- "eval_loss": 1.933449387550354,
666
- "eval_rouge1": 22.3137,
667
- "eval_rouge2": 5.8376,
668
- "eval_rougeL": 22.1264,
669
- "eval_rougeLsum": 22.1274,
670
- "eval_runtime": 162.4524,
671
- "eval_samples_per_second": 43.139,
672
- "eval_steps_per_second": 2.696,
673
  "step": 10000
674
  },
675
  {
676
  "epoch": 2.85,
677
- "learning_rate": 1.5078315448915942e-07,
678
- "loss": 1.6934,
679
  "step": 10100
680
  },
681
  {
682
  "epoch": 2.87,
683
- "learning_rate": 1.0007514846990862e-07,
684
- "loss": 1.6952,
685
  "step": 10200
686
  },
687
  {
688
  "epoch": 2.9,
689
- "learning_rate": 4.998917939712543e-05,
690
- "loss": 1.757,
691
  "step": 10300
692
  },
693
  {
694
  "epoch": 2.93,
695
- "learning_rate": 4.995490569865264e-05,
696
- "loss": 1.7591,
697
  "step": 10400
698
  },
699
  {
700
  "epoch": 2.96,
701
- "learning_rate": 4.989719237466243e-05,
702
- "loss": 1.7812,
703
  "step": 10500
704
  },
705
  {
706
  "epoch": 2.99,
707
- "learning_rate": 4.9816093742915244e-05,
708
- "loss": 1.7639,
709
  "step": 10600
710
  },
711
  {
712
  "epoch": 3.02,
713
- "learning_rate": 4.9711686130604656e-05,
714
- "loss": 1.7264,
715
  "step": 10700
716
  },
717
  {
718
  "epoch": 3.04,
719
- "learning_rate": 4.9584067802520866e-05,
720
- "loss": 1.6739,
721
  "step": 10800
722
  },
723
  {
724
  "epoch": 3.07,
725
- "learning_rate": 4.943497980862275e-05,
726
- "loss": 1.6629,
727
  "step": 10900
728
  },
729
  {
730
  "epoch": 3.1,
731
- "learning_rate": 4.926155082525451e-05,
732
- "loss": 1.6756,
733
  "step": 11000
734
  },
735
  {
736
  "epoch": 3.13,
737
- "learning_rate": 4.906533477770676e-05,
738
- "loss": 1.6823,
739
  "step": 11100
740
  },
741
  {
742
  "epoch": 3.16,
743
- "learning_rate": 4.884651633765546e-05,
744
- "loss": 1.6432,
745
  "step": 11200
746
  },
747
  {
748
  "epoch": 3.18,
749
- "learning_rate": 4.860530144935725e-05,
750
- "loss": 1.6803,
751
  "step": 11300
752
  },
753
  {
754
  "epoch": 3.21,
755
- "learning_rate": 4.834191713582197e-05,
756
- "loss": 1.6985,
757
  "step": 11400
758
  },
759
  {
760
  "epoch": 3.24,
761
- "learning_rate": 4.8056611285146466e-05,
762
- "loss": 1.7087,
763
  "step": 11500
764
  },
765
  {
766
  "epoch": 3.27,
767
- "learning_rate": 4.7749652417211076e-05,
768
- "loss": 1.7142,
769
  "step": 11600
770
  },
771
  {
772
  "epoch": 3.3,
773
- "learning_rate": 4.742132943095807e-05,
774
- "loss": 1.7135,
775
  "step": 11700
776
  },
777
  {
778
  "epoch": 3.33,
779
- "learning_rate": 4.70719513324901e-05,
780
- "loss": 1.7136,
781
  "step": 11800
782
  },
783
  {
784
  "epoch": 3.35,
785
- "learning_rate": 4.670184694424442e-05,
786
- "loss": 1.7098,
787
  "step": 11900
788
  },
789
  {
790
  "epoch": 3.38,
791
- "learning_rate": 4.63113645955168e-05,
792
- "loss": 1.7184,
793
  "step": 12000
794
  },
795
  {
796
  "epoch": 3.38,
797
  "eval_gen_len": 20.0,
798
- "eval_loss": 2.0014004707336426,
799
- "eval_rouge1": 21.452,
800
- "eval_rouge2": 5.494,
801
- "eval_rougeL": 21.3143,
802
- "eval_rougeLsum": 21.3158,
803
- "eval_runtime": 162.8289,
804
- "eval_samples_per_second": 43.039,
805
- "eval_steps_per_second": 2.69,
806
  "step": 12000
807
  },
808
  {
809
  "epoch": 3.41,
810
- "learning_rate": 4.590087179462613e-05,
811
- "loss": 1.7121,
812
  "step": 12100
813
  },
814
  {
815
  "epoch": 3.44,
816
- "learning_rate": 4.547075488302845e-05,
817
- "loss": 1.7064,
818
  "step": 12200
819
  },
820
  {
821
  "epoch": 3.47,
822
- "learning_rate": 4.502141867170594e-05,
823
- "loss": 1.7182,
824
  "step": 12300
825
  },
826
  {
827
  "epoch": 3.49,
828
- "learning_rate": 4.455328606017294e-05,
829
- "loss": 1.7126,
830
  "step": 12400
831
  },
832
  {
833
  "epoch": 3.52,
834
- "learning_rate": 4.4066797638457747e-05,
835
- "loss": 1.7135,
836
  "step": 12500
837
  },
838
  {
839
  "epoch": 3.55,
840
- "learning_rate": 4.3562411272434736e-05,
841
- "loss": 1.7224,
842
  "step": 12600
843
  },
844
  {
845
  "epoch": 3.58,
846
- "learning_rate": 4.304060167289698e-05,
847
- "loss": 1.7393,
848
  "step": 12700
849
  },
850
  {
851
  "epoch": 3.61,
852
- "learning_rate": 4.250185994877508e-05,
853
- "loss": 1.7153,
854
  "step": 12800
855
  },
856
  {
857
  "epoch": 3.64,
858
- "learning_rate": 4.194669314492266e-05,
859
- "loss": 1.6978,
860
  "step": 12900
861
  },
862
  {
863
  "epoch": 3.66,
864
- "learning_rate": 4.137562376490343e-05,
865
- "loss": 1.7238,
866
  "step": 13000
867
  },
868
  {
869
  "epoch": 3.69,
870
- "learning_rate": 4.07891892792292e-05,
871
- "loss": 1.73,
872
  "step": 13100
873
  },
874
  {
875
  "epoch": 3.72,
876
- "learning_rate": 4.0187941619511394e-05,
877
- "loss": 1.7077,
878
  "step": 13200
879
  },
880
  {
881
  "epoch": 3.75,
882
- "learning_rate": 3.957244665900235e-05,
883
- "loss": 1.696,
884
  "step": 13300
885
  },
886
  {
887
  "epoch": 3.78,
888
- "learning_rate": 3.8943283680015245e-05,
889
- "loss": 1.7197,
890
  "step": 13400
891
  },
892
  {
893
  "epoch": 3.8,
894
- "learning_rate": 3.830104482872382e-05,
895
- "loss": 1.755,
896
  "step": 13500
897
  },
898
  {
899
  "epoch": 3.83,
900
- "learning_rate": 3.764633455785513e-05,
901
- "loss": 1.7267,
902
  "step": 13600
903
  },
904
  {
905
  "epoch": 3.86,
906
- "learning_rate": 3.697976905779984e-05,
907
- "loss": 1.7325,
908
  "step": 13700
909
  },
910
  {
911
  "epoch": 3.89,
912
- "learning_rate": 3.630197567667534e-05,
913
- "loss": 1.7396,
914
  "step": 13800
915
  },
916
  {
917
  "epoch": 3.92,
918
- "learning_rate": 3.5613592329887714e-05,
919
- "loss": 1.7154,
920
  "step": 13900
921
  },
922
  {
923
  "epoch": 3.95,
924
- "learning_rate": 3.491526689974802e-05,
925
- "loss": 1.7372,
926
  "step": 14000
927
  },
928
  {
929
  "epoch": 3.95,
930
  "eval_gen_len": 20.0,
931
- "eval_loss": 1.9770143032073975,
932
- "eval_rouge1": 22.273,
933
- "eval_rouge2": 5.7637,
934
- "eval_rougeL": 22.1,
935
- "eval_rougeLsum": 22.0813,
936
- "eval_runtime": 163.6202,
937
- "eval_samples_per_second": 42.831,
938
- "eval_steps_per_second": 2.677,
939
  "step": 14000
940
  },
941
  {
942
  "epoch": 3.97,
943
- "learning_rate": 3.42076566257082e-05,
944
- "loss": 1.756,
945
  "step": 14100
946
  },
947
  {
948
  "epoch": 4.0,
949
- "learning_rate": 3.349142748579026e-05,
950
- "loss": 1.7312,
951
  "step": 14200
952
  },
953
  {
954
  "epoch": 4.03,
955
- "learning_rate": 3.276725356979111e-05,
956
- "loss": 1.4431,
957
  "step": 14300
958
  },
959
  {
960
  "epoch": 4.06,
961
- "learning_rate": 3.2035816444852827e-05,
962
- "loss": 1.4696,
963
  "step": 14400
964
  },
965
  {
966
  "epoch": 4.09,
967
- "learning_rate": 3.1297804513995484e-05,
968
- "loss": 1.4713,
969
  "step": 14500
970
  },
971
  {
972
  "epoch": 4.11,
973
- "learning_rate": 3.055391236821639e-05,
974
- "loss": 1.4552,
975
  "step": 14600
976
  },
977
  {
978
  "epoch": 4.14,
979
- "learning_rate": 2.9804840132765305e-05,
980
- "loss": 1.4596,
981
  "step": 14700
982
  },
983
  {
984
  "epoch": 4.17,
985
- "learning_rate": 2.905129280821106e-05,
986
- "loss": 1.4558,
987
  "step": 14800
988
  },
989
  {
990
  "epoch": 4.2,
991
- "learning_rate": 2.8293979606919723e-05,
992
- "loss": 1.4785,
993
  "step": 14900
994
  },
995
  {
996
  "epoch": 4.23,
997
- "learning_rate": 2.7533613285568734e-05,
998
- "loss": 1.4652,
999
  "step": 15000
1000
  },
1001
  {
1002
  "epoch": 4.26,
1003
- "learning_rate": 2.6778545727506706e-05,
1004
- "loss": 1.4952,
1005
  "step": 15100
1006
  },
1007
  {
1008
  "epoch": 4.28,
1009
- "learning_rate": 2.6014234894273854e-05,
1010
- "loss": 1.4794,
1011
  "step": 15200
1012
  },
1013
  {
1014
  "epoch": 4.31,
1015
- "learning_rate": 2.5249016556908847e-05,
1016
- "loss": 1.4764,
1017
  "step": 15300
1018
  },
1019
  {
1020
  "epoch": 4.34,
1021
- "learning_rate": 2.448361091212649e-05,
1022
- "loss": 1.4685,
1023
  "step": 15400
1024
  },
1025
  {
1026
  "epoch": 4.37,
1027
- "learning_rate": 2.3718738332928784e-05,
1028
- "loss": 1.4858,
1029
  "step": 15500
1030
  },
1031
  {
1032
  "epoch": 4.4,
1033
- "learning_rate": 2.295511869061505e-05,
1034
- "loss": 1.4953,
1035
  "step": 15600
1036
  },
1037
  {
1038
  "epoch": 4.43,
1039
- "learning_rate": 2.219347067726429e-05,
1040
- "loss": 1.4836,
1041
  "step": 15700
1042
  },
1043
  {
1044
  "epoch": 4.45,
1045
- "learning_rate": 2.143451112932731e-05,
1046
- "loss": 1.4926,
1047
  "step": 15800
1048
  },
1049
  {
1050
  "epoch": 4.48,
1051
- "learning_rate": 2.0678954352965396e-05,
1052
- "loss": 1.4652,
1053
  "step": 15900
1054
  },
1055
  {
1056
  "epoch": 4.51,
1057
- "learning_rate": 1.992751145177032e-05,
1058
- "loss": 1.4724,
1059
  "step": 16000
1060
  },
1061
  {
1062
  "epoch": 4.51,
1063
  "eval_gen_len": 20.0,
1064
- "eval_loss": 2.019371509552002,
1065
- "eval_rouge1": 22.0979,
1066
- "eval_rouge2": 5.7144,
1067
- "eval_rougeL": 21.9045,
1068
- "eval_rougeLsum": 21.8765,
1069
- "eval_runtime": 162.8312,
1070
- "eval_samples_per_second": 43.038,
1071
- "eval_steps_per_second": 2.69,
1072
  "step": 16000
1073
  },
1074
  {
1075
  "epoch": 4.54,
1076
- "learning_rate": 1.9180889657498532e-05,
1077
- "loss": 1.4723,
1078
  "step": 16100
1079
  },
1080
  {
1081
  "epoch": 4.57,
1082
- "learning_rate": 1.843979166444942e-05,
1083
- "loss": 1.475,
1084
  "step": 16200
1085
  },
1086
  {
1087
  "epoch": 4.59,
1088
- "learning_rate": 1.770491496811398e-05,
1089
- "loss": 1.4657,
1090
  "step": 16300
1091
  },
1092
  {
1093
  "epoch": 4.62,
1094
- "learning_rate": 1.6976951208716527e-05,
1095
- "loss": 1.466,
1096
  "step": 16400
1097
  },
1098
  {
1099
  "epoch": 4.65,
1100
- "learning_rate": 1.625658552026706e-05,
1101
- "loss": 1.4649,
1102
  "step": 16500
1103
  },
1104
  {
1105
  "epoch": 4.68,
1106
- "learning_rate": 1.554449588573719e-05,
1107
- "loss": 1.4792,
1108
  "step": 16600
1109
  },
1110
  {
1111
  "epoch": 4.71,
1112
- "learning_rate": 1.4841352498966237e-05,
1113
- "loss": 1.4477,
1114
  "step": 16700
1115
  },
1116
  {
1117
  "epoch": 4.74,
1118
- "learning_rate": 1.4147817133898276e-05,
1119
- "loss": 1.4744,
1120
  "step": 16800
1121
  },
1122
  {
1123
  "epoch": 4.76,
1124
- "learning_rate": 1.346454252174365e-05,
1125
- "loss": 1.4915,
1126
  "step": 16900
1127
  },
1128
  {
1129
  "epoch": 4.79,
1130
- "learning_rate": 1.2792171736651217e-05,
1131
- "loss": 1.4767,
1132
  "step": 17000
1133
  },
1134
  {
1135
  "epoch": 4.82,
1136
- "learning_rate": 1.213133759046946e-05,
1137
- "loss": 1.483,
1138
  "step": 17100
1139
  },
1140
  {
1141
  "epoch": 4.85,
1142
- "learning_rate": 1.148908659327833e-05,
1143
- "loss": 1.4579,
1144
  "step": 17200
1145
  },
1146
  {
1147
  "epoch": 4.88,
1148
- "learning_rate": 1.0853049469504679e-05,
1149
- "loss": 1.4791,
1150
  "step": 17300
1151
  },
1152
  {
1153
  "epoch": 4.9,
1154
- "learning_rate": 1.0230374018646024e-05,
1155
- "loss": 1.4593,
1156
  "step": 17400
1157
  },
1158
  {
1159
  "epoch": 4.93,
1160
- "learning_rate": 9.621646281041709e-06,
1161
- "loss": 1.4629,
1162
  "step": 17500
1163
  },
1164
  {
1165
  "epoch": 4.96,
1166
- "learning_rate": 9.027439169931561e-06,
1167
- "loss": 1.4677,
1168
  "step": 17600
1169
  },
1170
  {
1171
  "epoch": 4.99,
1172
- "learning_rate": 8.448311932250029e-06,
1173
- "loss": 1.4382,
1174
  "step": 17700
1175
  },
1176
  {
1177
  "epoch": 5.02,
1178
- "learning_rate": 7.884809622282473e-06,
1179
- "loss": 1.369,
1180
  "step": 17800
1181
  },
1182
  {
1183
  "epoch": 5.05,
1184
- "learning_rate": 7.337462588679232e-06,
1185
- "loss": 1.3049,
1186
  "step": 17900
1187
  },
1188
  {
1189
  "epoch": 5.07,
1190
- "learning_rate": 6.806785975309991e-06,
1191
- "loss": 1.3297,
1192
  "step": 18000
1193
  },
1194
  {
1195
  "epoch": 5.07,
1196
  "eval_gen_len": 20.0,
1197
- "eval_loss": 2.037019729614258,
1198
- "eval_rouge1": 21.9,
1199
- "eval_rouge2": 5.533,
1200
- "eval_rougeL": 21.7042,
1201
- "eval_rougeLsum": 21.7021,
1202
- "eval_runtime": 162.394,
1203
- "eval_samples_per_second": 43.154,
1204
- "eval_steps_per_second": 2.697,
1205
  "step": 18000
1206
  },
1207
  {
1208
  "epoch": 5.1,
1209
- "learning_rate": 6.293279236428415e-06,
1210
- "loss": 1.3197,
1211
  "step": 18100
1212
  },
1213
  {
1214
  "epoch": 5.13,
1215
- "learning_rate": 5.7974256666032834e-06,
1216
- "loss": 1.2993,
1217
  "step": 18200
1218
  },
1219
  {
1220
  "epoch": 5.16,
1221
- "learning_rate": 5.31969194585855e-06,
1222
- "loss": 1.3202,
1223
  "step": 18300
1224
  },
1225
  {
1226
  "epoch": 5.19,
1227
- "learning_rate": 4.8605277004504475e-06,
1228
- "loss": 1.2897,
1229
  "step": 18400
1230
  },
1231
  {
1232
  "epoch": 5.21,
1233
- "learning_rate": 4.42036507969501e-06,
1234
- "loss": 1.3157,
1235
  "step": 18500
1236
  },
1237
  {
1238
  "epoch": 5.24,
1239
- "learning_rate": 3.999618349244242e-06,
1240
- "loss": 1.3208,
1241
  "step": 18600
1242
  },
1243
  {
1244
  "epoch": 5.27,
1245
- "learning_rate": 3.5986835011937814e-06,
1246
- "loss": 1.3238,
1247
  "step": 18700
1248
  },
1249
  {
1250
  "epoch": 5.3,
1251
- "learning_rate": 3.2179378813890276e-06,
1252
- "loss": 1.3285,
1253
  "step": 18800
1254
  },
1255
  {
1256
  "epoch": 5.33,
1257
- "learning_rate": 2.857739834280396e-06,
1258
- "loss": 1.3075,
1259
  "step": 18900
1260
  },
1261
  {
1262
  "epoch": 5.36,
1263
- "learning_rate": 2.5184283656621174e-06,
1264
- "loss": 1.3187,
1265
  "step": 19000
1266
  },
1267
  {
1268
  "epoch": 5.38,
1269
- "learning_rate": 2.2003228236118087e-06,
1270
- "loss": 1.3019,
1271
  "step": 19100
1272
  },
1273
  {
1274
  "epoch": 5.41,
1275
- "learning_rate": 1.9037225979312562e-06,
1276
- "loss": 1.3156,
1277
  "step": 19200
1278
  },
1279
  {
1280
  "epoch": 5.44,
1281
- "learning_rate": 1.631546296469755e-06,
1282
- "loss": 1.2991,
1283
  "step": 19300
1284
  },
1285
  {
1286
  "epoch": 5.47,
1287
- "learning_rate": 1.3785520064985022e-06,
1288
- "loss": 1.2969,
1289
  "step": 19400
1290
  },
1291
  {
1292
  "epoch": 5.5,
1293
- "learning_rate": 1.1478364548263003e-06,
1294
- "loss": 1.3149,
1295
  "step": 19500
1296
  },
1297
  {
1298
  "epoch": 5.52,
1299
- "learning_rate": 9.396167828548325e-07,
1300
- "loss": 1.2895,
1301
  "step": 19600
1302
  },
1303
  {
1304
  "epoch": 5.55,
1305
- "learning_rate": 7.540889596512168e-07,
1306
- "loss": 1.3152,
1307
  "step": 19700
1308
  },
1309
  {
1310
  "epoch": 5.58,
1311
- "learning_rate": 5.914275975087762e-07,
1312
- "loss": 1.3272,
1313
  "step": 19800
1314
  },
1315
  {
1316
  "epoch": 5.61,
1317
- "learning_rate": 4.517857876080481e-07,
1318
- "loss": 1.3142,
1319
  "step": 19900
1320
  },
1321
  {
1322
  "epoch": 5.64,
1323
- "learning_rate": 3.352949559327476e-07,
1324
- "loss": 1.3186,
1325
  "step": 20000
1326
  },
1327
  {
1328
  "epoch": 5.64,
1329
  "eval_gen_len": 20.0,
1330
- "eval_loss": 2.039311170578003,
1331
- "eval_rouge1": 21.585,
1332
- "eval_rouge2": 5.4626,
1333
- "eval_rougeL": 21.4273,
1334
- "eval_rougeLsum": 21.4329,
1335
- "eval_runtime": 162.2232,
1336
- "eval_samples_per_second": 43.2,
1337
- "eval_steps_per_second": 2.7,
1338
  "step": 20000
1339
  },
1340
  {
1341
  "epoch": 5.67,
1342
- "learning_rate": 2.4206473957621197e-07,
1343
- "loss": 1.3001,
1344
  "step": 20100
1345
  },
1346
  {
1347
  "epoch": 5.69,
1348
- "learning_rate": 1.721828835548135e-07,
1349
- "loss": 1.3033,
1350
  "step": 20200
1351
  },
1352
  {
1353
  "epoch": 5.72,
1354
- "learning_rate": 1.2571515822542734e-07,
1355
- "loss": 1.3053,
1356
  "step": 20300
1357
  },
1358
  {
1359
  "epoch": 5.75,
1360
- "learning_rate": 1.0270529738465452e-07,
1361
- "loss": 1.3123,
1362
  "step": 20400
1363
  },
1364
  {
1365
  "epoch": 5.78,
1366
- "learning_rate": 4.9999206248096876e-05,
1367
- "loss": 1.3399,
1368
  "step": 20500
1369
  },
1370
  {
1371
  "epoch": 5.81,
1372
- "learning_rate": 4.999321815444586e-05,
1373
- "loss": 1.3654,
1374
  "step": 20600
1375
  },
1376
  {
1377
  "epoch": 5.83,
1378
- "learning_rate": 4.9981360794562834e-05,
1379
- "loss": 1.3834,
1380
  "step": 20700
1381
  },
1382
  {
1383
  "epoch": 5.86,
1384
- "learning_rate": 4.996363695854496e-05,
1385
- "loss": 1.4284,
1386
  "step": 20800
1387
  },
1388
  {
1389
  "epoch": 5.89,
1390
- "learning_rate": 4.994005081690109e-05,
1391
- "loss": 1.4216,
1392
  "step": 20900
1393
  },
1394
  {
1395
  "epoch": 5.92,
1396
- "learning_rate": 4.991060791957044e-05,
1397
- "loss": 1.4262,
1398
  "step": 21000
1399
  },
1400
  {
1401
  "epoch": 5.95,
1402
- "learning_rate": 4.987531519461667e-05,
1403
- "loss": 1.4331,
1404
  "step": 21100
1405
  },
1406
  {
1407
  "epoch": 5.98,
1408
- "learning_rate": 4.983418094659765e-05,
1409
- "loss": 1.4425,
1410
  "step": 21200
1411
  },
1412
  {
1413
  "epoch": 6.0,
1414
- "learning_rate": 4.978721485461138e-05,
1415
- "loss": 1.4284,
1416
  "step": 21300
1417
  },
1418
  {
1419
  "epoch": 6.03,
1420
- "learning_rate": 4.973554067339494e-05,
1421
- "loss": 1.3686,
1422
  "step": 21400
1423
  },
1424
  {
1425
  "epoch": 6.06,
1426
- "learning_rate": 4.967706145407849e-05,
1427
- "loss": 1.3545,
1428
  "step": 21500
1429
  },
1430
  {
1431
  "epoch": 6.09,
1432
- "learning_rate": 4.961278736181136e-05,
1433
- "loss": 1.3899,
1434
  "step": 21600
1435
  },
1436
  {
1437
  "epoch": 6.12,
1438
- "learning_rate": 4.954273352061473e-05,
1439
- "loss": 1.3801,
1440
  "step": 21700
1441
  },
1442
  {
1443
  "epoch": 6.14,
1444
- "learning_rate": 4.946691641451411e-05,
1445
- "loss": 1.3902,
1446
  "step": 21800
1447
  },
1448
  {
1449
  "epoch": 6.17,
1450
- "learning_rate": 4.938535388366059e-05,
1451
- "loss": 1.3836,
1452
  "step": 21900
1453
  },
1454
  {
1455
  "epoch": 6.2,
1456
- "learning_rate": 4.9298065120132974e-05,
1457
- "loss": 1.3839,
1458
  "step": 22000
1459
  },
1460
  {
1461
  "epoch": 6.2,
1462
  "eval_gen_len": 20.0,
1463
- "eval_loss": 2.1140034198760986,
1464
- "eval_rouge1": 20.6468,
1465
- "eval_rouge2": 5.2318,
1466
- "eval_rougeL": 20.4445,
1467
- "eval_rougeLsum": 20.4398,
1468
- "eval_runtime": 162.2862,
1469
- "eval_samples_per_second": 43.183,
1470
- "eval_steps_per_second": 2.699,
1471
  "step": 22000
1472
  },
1473
  {
1474
  "epoch": 6.23,
1475
- "learning_rate": 4.920507066342175e-05,
1476
- "loss": 1.4005,
1477
  "step": 22100
1478
  },
1479
  {
1480
  "epoch": 6.26,
1481
- "learning_rate": 4.910639239559606e-05,
1482
- "loss": 1.3887,
1483
  "step": 22200
1484
  },
1485
  {
1486
  "epoch": 6.29,
1487
- "learning_rate": 4.900205353615477e-05,
1488
- "loss": 1.4081,
1489
  "step": 22300
1490
  },
1491
  {
1492
  "epoch": 6.31,
1493
- "learning_rate": 4.889207863656274e-05,
1494
- "loss": 1.4239,
1495
  "step": 22400
1496
  },
1497
  {
1498
  "epoch": 6.34,
1499
- "learning_rate": 4.87764935744738e-05,
1500
- "loss": 1.4332,
1501
  "step": 22500
1502
  },
1503
  {
1504
  "epoch": 6.37,
1505
- "learning_rate": 4.865532554764157e-05,
1506
- "loss": 1.4264,
1507
  "step": 22600
1508
  },
1509
  {
1510
  "epoch": 6.4,
1511
- "learning_rate": 4.852860306751969e-05,
1512
- "loss": 1.4465,
1513
  "step": 22700
1514
  },
1515
  {
1516
  "epoch": 6.43,
1517
- "learning_rate": 4.839635595255296e-05,
1518
- "loss": 1.4339,
1519
  "step": 22800
1520
  },
1521
  {
1522
  "epoch": 6.45,
1523
- "learning_rate": 4.825861532116087e-05,
1524
- "loss": 1.4463,
1525
  "step": 22900
1526
  },
1527
  {
1528
  "epoch": 6.48,
1529
- "learning_rate": 4.811541358441531e-05,
1530
- "loss": 1.447,
1531
  "step": 23000
1532
  },
1533
  {
1534
  "epoch": 6.51,
1535
- "learning_rate": 4.796678443841405e-05,
1536
- "loss": 1.4343,
1537
  "step": 23100
1538
  },
1539
  {
1540
  "epoch": 6.54,
1541
- "learning_rate": 4.78127628563519e-05,
1542
- "loss": 1.4502,
1543
  "step": 23200
1544
  },
1545
  {
1546
  "epoch": 6.57,
1547
- "learning_rate": 4.765338508029135e-05,
1548
- "loss": 1.4459,
1549
  "step": 23300
1550
  },
1551
  {
1552
  "epoch": 6.6,
1553
- "learning_rate": 4.748868861263457e-05,
1554
- "loss": 1.4448,
1555
  "step": 23400
1556
  },
1557
  {
1558
  "epoch": 6.62,
1559
- "learning_rate": 4.731871220729898e-05,
1560
- "loss": 1.4377,
1561
  "step": 23500
1562
  },
1563
  {
1564
  "epoch": 6.65,
1565
- "learning_rate": 4.714349586059819e-05,
1566
- "loss": 1.4549,
1567
  "step": 23600
1568
  },
1569
  {
1570
  "epoch": 6.68,
1571
- "learning_rate": 4.696308080183071e-05,
1572
- "loss": 1.4393,
1573
  "step": 23700
1574
  },
1575
  {
1576
  "epoch": 6.71,
1577
- "learning_rate": 4.677750948357847e-05,
1578
- "loss": 1.4678,
1579
  "step": 23800
1580
  },
1581
  {
1582
  "epoch": 6.74,
1583
- "learning_rate": 4.65868255717175e-05,
1584
- "loss": 1.4629,
1585
  "step": 23900
1586
  },
1587
  {
1588
  "epoch": 6.76,
1589
- "learning_rate": 4.639107393514314e-05,
1590
- "loss": 1.4605,
1591
  "step": 24000
1592
  },
1593
  {
1594
  "epoch": 6.76,
1595
  "eval_gen_len": 20.0,
1596
- "eval_loss": 2.088627815246582,
1597
- "eval_rouge1": 21.2304,
1598
- "eval_rouge2": 5.2346,
1599
- "eval_rougeL": 21.0611,
1600
- "eval_rougeLsum": 21.0649,
1601
- "eval_runtime": 163.3508,
1602
- "eval_samples_per_second": 42.902,
1603
- "eval_steps_per_second": 2.681,
1604
  "step": 24000
1605
  },
1606
  {
1607
  "epoch": 6.79,
1608
- "learning_rate": 4.619030063521217e-05,
1609
- "loss": 1.4447,
1610
  "step": 24100
1611
  },
1612
  {
1613
  "epoch": 6.82,
1614
- "learning_rate": 4.5984552914904304e-05,
1615
- "loss": 1.4733,
1616
  "step": 24200
1617
  },
1618
  {
1619
  "epoch": 6.85,
1620
- "learning_rate": 4.5773879187705715e-05,
1621
- "loss": 1.4607,
1622
  "step": 24300
1623
  },
1624
  {
1625
  "epoch": 6.88,
1626
- "learning_rate": 4.555832902621708e-05,
1627
- "loss": 1.4659,
1628
  "step": 24400
1629
  },
1630
  {
1631
  "epoch": 6.91,
1632
- "learning_rate": 4.533795315048888e-05,
1633
- "loss": 1.482,
1634
  "step": 24500
1635
  },
1636
  {
1637
  "epoch": 6.93,
1638
- "learning_rate": 4.511280341608673e-05,
1639
- "loss": 1.4645,
1640
  "step": 24600
1641
  },
1642
  {
1643
  "epoch": 6.96,
1644
- "learning_rate": 4.488293280188952e-05,
1645
- "loss": 1.4645,
1646
  "step": 24700
1647
  },
1648
  {
1649
  "epoch": 6.99,
1650
- "learning_rate": 4.464839539762315e-05,
1651
- "loss": 1.4573,
1652
  "step": 24800
1653
  },
1654
  {
1655
  "epoch": 7.02,
1656
- "learning_rate": 4.4409246391133016e-05,
1657
- "loss": 1.3075,
1658
  "step": 24900
1659
  },
1660
  {
1661
  "epoch": 7.05,
1662
- "learning_rate": 4.416554205539801e-05,
1663
- "loss": 1.2172,
1664
  "step": 25000
1665
  },
1666
  {
1667
  "epoch": 7.07,
1668
- "learning_rate": 4.391733973528915e-05,
1669
- "loss": 1.1984,
1670
  "step": 25100
1671
  },
1672
  {
1673
  "epoch": 7.1,
1674
- "learning_rate": 4.3664697834076095e-05,
1675
- "loss": 1.2395,
1676
  "step": 25200
1677
  },
1678
  {
1679
  "epoch": 7.13,
1680
- "learning_rate": 4.340767579968456e-05,
1681
- "loss": 1.2372,
1682
  "step": 25300
1683
  },
1684
  {
1685
  "epoch": 7.16,
1686
- "learning_rate": 4.314896870880448e-05,
1687
- "loss": 1.2366,
1688
  "step": 25400
1689
  },
1690
  {
1691
  "epoch": 7.19,
1692
- "learning_rate": 4.288341113416578e-05,
1693
- "loss": 1.2374,
1694
  "step": 25500
1695
  },
1696
  {
1697
  "epoch": 7.22,
1698
- "learning_rate": 4.261365726708706e-05,
1699
- "loss": 1.2235,
1700
  "step": 25600
1701
  },
1702
  {
1703
  "epoch": 7.24,
1704
- "learning_rate": 4.2339770582026725e-05,
1705
- "loss": 1.2633,
1706
  "step": 25700
1707
  },
1708
  {
1709
  "epoch": 7.27,
1710
- "learning_rate": 4.206181552591627e-05,
1711
- "loss": 1.2347,
1712
  "step": 25800
1713
  },
1714
  {
1715
  "epoch": 7.3,
1716
- "learning_rate": 4.1779857502995634e-05,
1717
- "loss": 1.2533,
1718
  "step": 25900
1719
  },
1720
  {
1721
  "epoch": 7.33,
1722
- "learning_rate": 4.1493962859423225e-05,
1723
- "loss": 1.2594,
1724
  "step": 26000
1725
  },
1726
  {
1727
  "epoch": 7.33,
1728
  "eval_gen_len": 20.0,
1729
- "eval_loss": 2.1528804302215576,
1730
- "eval_rouge1": 21.6885,
1731
- "eval_rouge2": 5.3904,
1732
- "eval_rougeL": 21.4125,
1733
- "eval_rougeLsum": 21.4342,
1734
- "eval_runtime": 162.2811,
1735
- "eval_samples_per_second": 43.184,
1736
- "eval_steps_per_second": 2.699,
1737
  "step": 26000
1738
  },
1739
  {
1740
  "epoch": 7.36,
1741
- "learning_rate": 4.120419886766432e-05,
1742
- "loss": 1.2742,
1743
  "step": 26100
1744
  },
1745
  {
1746
  "epoch": 7.38,
1747
- "learning_rate": 4.091063371066154e-05,
1748
- "loss": 1.246,
1749
  "step": 26200
1750
  },
1751
  {
1752
  "epoch": 7.41,
1753
- "learning_rate": 4.061632768309558e-05,
1754
- "loss": 1.2496,
1755
  "step": 26300
1756
  },
1757
  {
1758
  "epoch": 7.44,
1759
- "learning_rate": 4.03154045781127e-05,
1760
- "loss": 1.2608,
1761
  "step": 26400
1762
  },
1763
  {
1764
  "epoch": 7.47,
1765
- "learning_rate": 4.001088944570764e-05,
1766
- "loss": 1.2696,
1767
  "step": 26500
1768
  },
1769
  {
1770
  "epoch": 7.5,
1771
- "learning_rate": 3.9702853939841514e-05,
1772
- "loss": 1.2624,
1773
  "step": 26600
1774
  },
1775
  {
1776
  "epoch": 7.53,
1777
- "learning_rate": 3.939137054283725e-05,
1778
- "loss": 1.2809,
1779
  "step": 26700
1780
  },
1781
  {
1782
  "epoch": 7.55,
1783
- "learning_rate": 3.9076512548324085e-05,
1784
- "loss": 1.2533,
1785
  "step": 26800
1786
  },
1787
  {
1788
  "epoch": 7.58,
1789
- "learning_rate": 3.875835404399126e-05,
1790
- "loss": 1.2811,
1791
  "step": 26900
1792
  },
1793
  {
1794
  "epoch": 7.61,
1795
- "learning_rate": 3.843696989415477e-05,
1796
- "loss": 1.2851,
1797
  "step": 27000
1798
  },
1799
  {
1800
  "epoch": 7.64,
1801
- "learning_rate": 3.811243572214143e-05,
1802
- "loss": 1.2661,
1803
  "step": 27100
1804
  },
1805
  {
1806
  "epoch": 7.67,
1807
- "learning_rate": 3.7784827892494295e-05,
1808
- "loss": 1.2661,
1809
  "step": 27200
1810
  },
1811
  {
1812
  "epoch": 7.69,
1813
- "learning_rate": 3.745422349300373e-05,
1814
- "loss": 1.2525,
1815
  "step": 27300
1816
  },
1817
  {
1818
  "epoch": 7.72,
1819
- "learning_rate": 3.712070031656822e-05,
1820
- "loss": 1.2864,
1821
  "step": 27400
1822
  },
1823
  {
1824
  "epoch": 7.75,
1825
- "learning_rate": 3.6784336842889355e-05,
1826
- "loss": 1.2801,
1827
  "step": 27500
1828
  },
1829
  {
1830
  "epoch": 7.78,
1831
- "learning_rate": 3.64452122200051e-05,
1832
- "loss": 1.2888,
1833
  "step": 27600
1834
  },
1835
  {
1836
  "epoch": 7.81,
1837
- "learning_rate": 3.6103406245665894e-05,
1838
- "loss": 1.2825,
1839
  "step": 27700
1840
  },
1841
  {
1842
  "epoch": 7.84,
1843
- "learning_rate": 3.575899934855785e-05,
1844
- "loss": 1.2791,
1845
  "step": 27800
1846
  },
1847
  {
1848
  "epoch": 7.86,
1849
- "learning_rate": 3.5412072569377407e-05,
1850
- "loss": 1.2704,
1851
  "step": 27900
1852
  },
1853
  {
1854
  "epoch": 7.89,
1855
- "learning_rate": 3.506270754176212e-05,
1856
- "loss": 1.2809,
1857
  "step": 28000
1858
  },
1859
  {
1860
  "epoch": 7.89,
1861
  "eval_gen_len": 20.0,
1862
- "eval_loss": 2.155640125274658,
1863
- "eval_rouge1": 20.7036,
1864
- "eval_rouge2": 5.3801,
1865
- "eval_rougeL": 20.5265,
1866
- "eval_rougeLsum": 20.5209,
1867
- "eval_runtime": 163.3415,
1868
- "eval_samples_per_second": 42.904,
1869
- "eval_steps_per_second": 2.681,
1870
  "step": 28000
1871
  },
1872
  {
1873
  "epoch": 7.92,
1874
- "learning_rate": 3.471098647308181e-05,
1875
- "loss": 1.2758,
1876
  "step": 28100
1877
  },
1878
  {
1879
  "epoch": 7.95,
1880
- "learning_rate": 3.4356992125094747e-05,
1881
- "loss": 1.2923,
1882
  "step": 28200
1883
  },
1884
  {
1885
  "epoch": 7.98,
1886
- "learning_rate": 3.40008077944734e-05,
1887
- "loss": 1.2823,
1888
  "step": 28300
1889
  },
1890
  {
1891
  "epoch": 8.0,
1892
- "learning_rate": 3.3642517293204305e-05,
1893
- "loss": 1.265,
1894
  "step": 28400
1895
  },
1896
  {
1897
  "epoch": 8.03,
1898
- "learning_rate": 3.328220492886667e-05,
1899
- "loss": 1.0506,
1900
  "step": 28500
1901
  },
1902
  {
1903
  "epoch": 8.06,
1904
- "learning_rate": 3.291995548479439e-05,
1905
- "loss": 1.0599,
1906
  "step": 28600
1907
  },
1908
  {
1909
  "epoch": 8.09,
1910
- "learning_rate": 3.255585420012606e-05,
1911
- "loss": 1.07,
1912
  "step": 28700
1913
  },
1914
  {
1915
  "epoch": 8.12,
1916
- "learning_rate": 3.2189986749747835e-05,
1917
- "loss": 1.0685,
1918
  "step": 28800
1919
  },
1920
  {
1921
  "epoch": 8.15,
1922
- "learning_rate": 3.182243922413361e-05,
1923
- "loss": 1.088,
1924
  "step": 28900
1925
  },
1926
  {
1927
  "epoch": 8.17,
1928
- "learning_rate": 3.145329810908756e-05,
1929
- "loss": 1.0686,
1930
  "step": 29000
1931
  },
1932
  {
1933
  "epoch": 8.2,
1934
- "learning_rate": 3.1082650265393565e-05,
1935
- "loss": 1.0841,
1936
  "step": 29100
1937
  },
1938
  {
1939
  "epoch": 8.23,
1940
- "learning_rate": 3.071058290837644e-05,
1941
- "loss": 1.0808,
1942
  "step": 29200
1943
  },
1944
  {
1945
  "epoch": 8.26,
1946
- "learning_rate": 3.0337183587379695e-05,
1947
- "loss": 1.0797,
1948
  "step": 29300
1949
  },
1950
  {
1951
  "epoch": 8.29,
1952
- "learning_rate": 2.9962540165164753e-05,
1953
- "loss": 1.0824,
1954
  "step": 29400
1955
  },
1956
  {
1957
  "epoch": 8.31,
1958
- "learning_rate": 2.958674079723637e-05,
1959
- "loss": 1.0767,
1960
  "step": 29500
1961
  },
1962
  {
1963
  "epoch": 8.34,
1964
- "learning_rate": 2.9209873911099242e-05,
1965
- "loss": 1.0872,
1966
  "step": 29600
1967
  },
1968
  {
1969
  "epoch": 8.37,
1970
- "learning_rate": 2.8832028185450523e-05,
1971
- "loss": 1.0726,
1972
  "step": 29700
1973
  },
1974
  {
1975
  "epoch": 8.4,
1976
- "learning_rate": 2.845329252931329e-05,
1977
- "loss": 1.0763,
1978
  "step": 29800
1979
  },
1980
  {
1981
  "epoch": 8.43,
1982
- "learning_rate": 2.8073756061115794e-05,
1983
- "loss": 1.1097,
1984
  "step": 29900
1985
  },
1986
  {
1987
  "epoch": 8.46,
1988
- "learning_rate": 2.7693508087721433e-05,
1989
- "loss": 1.0819,
1990
  "step": 30000
1991
  },
1992
  {
1993
  "epoch": 8.46,
1994
  "eval_gen_len": 20.0,
1995
- "eval_loss": 2.23087477684021,
1996
- "eval_rouge1": 20.9622,
1997
- "eval_rouge2": 5.3019,
1998
- "eval_rougeL": 20.7446,
1999
- "eval_rougeLsum": 20.7429,
2000
- "eval_runtime": 162.5566,
2001
- "eval_samples_per_second": 43.111,
2002
- "eval_steps_per_second": 2.694,
2003
  "step": 30000
2004
  },
2005
  {
2006
  "epoch": 8.48,
2007
- "learning_rate": 2.731263808341441e-05,
2008
- "loss": 1.1006,
2009
  "step": 30100
2010
  },
2011
  {
2012
  "epoch": 8.51,
2013
- "learning_rate": 2.6931235668845973e-05,
2014
- "loss": 1.0997,
2015
  "step": 30200
2016
  },
2017
  {
2018
  "epoch": 8.54,
2019
- "learning_rate": 2.65493905899462e-05,
2020
- "loss": 1.1084,
2021
  "step": 30300
2022
  },
2023
  {
2024
  "epoch": 8.57,
2025
- "learning_rate": 2.6167192696806356e-05,
2026
- "loss": 1.0907,
2027
  "step": 30400
2028
  },
2029
  {
2030
  "epoch": 8.6,
2031
- "learning_rate": 2.5784731922536687e-05,
2032
- "loss": 1.0974,
2033
  "step": 30500
2034
  },
2035
  {
2036
  "epoch": 8.62,
2037
- "learning_rate": 2.540592515888669e-05,
2038
- "loss": 1.1108,
2039
  "step": 30600
2040
  },
2041
  {
2042
  "epoch": 8.65,
2043
- "learning_rate": 2.5023209030687986e-05,
2044
- "loss": 1.1082,
2045
  "step": 30700
2046
  },
2047
  {
2048
  "epoch": 8.68,
2049
- "learning_rate": 2.4640499206540774e-05,
2050
- "loss": 1.0964,
2051
  "step": 30800
2052
  },
2053
  {
2054
  "epoch": 8.71,
2055
- "learning_rate": 2.425788574001527e-05,
2056
- "loss": 1.1008,
2057
  "step": 30900
2058
  },
2059
  {
2060
  "epoch": 8.74,
2061
- "learning_rate": 2.3875458662008246e-05,
2062
- "loss": 1.0956,
2063
  "step": 31000
2064
  },
2065
  {
2066
  "epoch": 8.77,
2067
- "learning_rate": 2.3493307959558313e-05,
2068
- "loss": 1.1051,
2069
  "step": 31100
2070
  },
2071
  {
2072
  "epoch": 8.79,
2073
- "learning_rate": 2.3111523554671516e-05,
2074
- "loss": 1.1033,
2075
  "step": 31200
2076
  },
2077
  {
2078
  "epoch": 8.82,
2079
- "learning_rate": 2.2730195283162194e-05,
2080
- "loss": 1.1003,
2081
  "step": 31300
2082
  },
2083
  {
2084
  "epoch": 8.85,
2085
- "learning_rate": 2.2349412873514212e-05,
2086
- "loss": 1.1031,
2087
  "step": 31400
2088
  },
2089
  {
2090
  "epoch": 8.88,
2091
- "learning_rate": 2.1969265925767334e-05,
2092
- "loss": 1.1195,
2093
  "step": 31500
2094
  },
2095
  {
2096
  "epoch": 8.91,
2097
- "learning_rate": 2.158984389043391e-05,
2098
- "loss": 1.0978,
2099
  "step": 31600
2100
  },
2101
  {
2102
  "epoch": 8.93,
2103
- "learning_rate": 2.1211236047450704e-05,
2104
- "loss": 1.1105,
2105
  "step": 31700
2106
  },
2107
  {
2108
  "epoch": 8.96,
2109
- "learning_rate": 2.08335314851709e-05,
2110
- "loss": 1.1088,
2111
  "step": 31800
2112
  },
2113
  {
2114
  "epoch": 8.99,
2115
- "learning_rate": 2.0456819079401094e-05,
2116
- "loss": 1.1241,
2117
  "step": 31900
2118
  },
2119
  {
2120
  "epoch": 9.02,
2121
- "learning_rate": 2.008118747248843e-05,
2122
- "loss": 0.9814,
2123
  "step": 32000
2124
  },
2125
  {
2126
  "epoch": 9.02,
2127
  "eval_gen_len": 20.0,
2128
- "eval_loss": 2.2810895442962646,
2129
- "eval_rouge1": 20.6296,
2130
- "eval_rouge2": 5.1936,
2131
- "eval_rougeL": 20.4403,
2132
- "eval_rougeLsum": 20.4304,
2133
- "eval_runtime": 162.301,
2134
- "eval_samples_per_second": 43.179,
2135
- "eval_steps_per_second": 2.699,
2136
  "step": 32000
2137
  },
2138
  {
2139
  "epoch": 9.05,
2140
- "learning_rate": 1.9706725052462575e-05,
2141
- "loss": 0.9353,
2142
  "step": 32100
2143
  },
2144
  {
2145
  "epoch": 9.08,
2146
- "learning_rate": 1.933351993223755e-05,
2147
- "loss": 0.9372,
2148
  "step": 32200
2149
  },
2150
  {
2151
  "epoch": 9.1,
2152
- "learning_rate": 1.8961659928878366e-05,
2153
- "loss": 0.9601,
2154
  "step": 32300
2155
  },
2156
  {
2157
  "epoch": 9.13,
2158
- "learning_rate": 1.859123254293715e-05,
2159
- "loss": 0.9461,
2160
  "step": 32400
2161
  },
2162
  {
2163
  "epoch": 9.16,
2164
- "learning_rate": 1.8222324937863884e-05,
2165
- "loss": 0.9575,
2166
  "step": 32500
2167
  },
2168
  {
2169
  "epoch": 9.19,
2170
- "learning_rate": 1.7858688692966636e-05,
2171
- "loss": 0.951,
2172
  "step": 32600
2173
  },
2174
  {
2175
  "epoch": 9.22,
2176
- "learning_rate": 1.74930633324332e-05,
2177
- "loss": 0.9606,
2178
  "step": 32700
2179
  },
2180
  {
2181
  "epoch": 9.24,
2182
- "learning_rate": 1.712921615757434e-05,
2183
- "loss": 0.9316,
2184
  "step": 32800
2185
  },
2186
  {
2187
  "epoch": 9.27,
2188
- "learning_rate": 1.676723278348288e-05,
2189
- "loss": 0.9447,
2190
  "step": 32900
2191
  },
2192
  {
2193
  "epoch": 9.3,
2194
- "learning_rate": 1.640719838668979e-05,
2195
- "loss": 0.9591,
2196
  "step": 33000
2197
  },
2198
  {
2199
  "epoch": 9.33,
2200
- "learning_rate": 1.604919768512179e-05,
2201
- "loss": 0.9629,
2202
  "step": 33100
2203
  },
2204
  {
2205
  "epoch": 9.36,
2206
- "learning_rate": 1.569331491816673e-05,
2207
- "loss": 0.9541,
2208
  "step": 33200
2209
  },
2210
  {
2211
  "epoch": 9.39,
2212
- "learning_rate": 1.53396338268517e-05,
2213
- "loss": 0.9409,
2214
  "step": 33300
2215
  },
2216
  {
2217
  "epoch": 9.41,
2218
- "learning_rate": 1.4988237634138258e-05,
2219
- "loss": 0.9517,
2220
  "step": 33400
2221
  },
2222
  {
2223
  "epoch": 9.44,
2224
- "learning_rate": 1.4639209025339731e-05,
2225
- "loss": 0.9569,
2226
  "step": 33500
2227
  },
2228
  {
2229
  "epoch": 9.47,
2230
- "learning_rate": 1.4292630128664853e-05,
2231
- "loss": 0.9421,
2232
  "step": 33600
2233
  },
2234
  {
2235
  "epoch": 9.5,
2236
- "learning_rate": 1.3948582495892665e-05,
2237
- "loss": 0.9455,
2238
  "step": 33700
2239
  },
2240
  {
2241
  "epoch": 9.53,
2242
- "learning_rate": 1.3610548242500389e-05,
2243
- "loss": 0.9553,
2244
  "step": 33800
2245
  },
2246
  {
2247
  "epoch": 9.55,
2248
- "learning_rate": 1.3271778070102909e-05,
2249
- "loss": 0.9668,
2250
  "step": 33900
2251
  },
2252
  {
2253
  "epoch": 9.58,
2254
- "learning_rate": 1.2935779373295224e-05,
2255
- "loss": 0.9403,
2256
  "step": 34000
2257
  },
2258
  {
2259
  "epoch": 9.58,
2260
  "eval_gen_len": 20.0,
2261
- "eval_loss": 2.29062819480896,
2262
- "eval_rouge1": 21.0701,
2263
- "eval_rouge2": 5.1666,
2264
- "eval_rougeL": 20.8587,
2265
- "eval_rougeLsum": 20.8463,
2266
- "eval_runtime": 162.5034,
2267
- "eval_samples_per_second": 43.125,
2268
- "eval_steps_per_second": 2.695,
2269
  "step": 34000
2270
  },
2271
  {
2272
  "epoch": 9.61,
2273
- "learning_rate": 1.2602631214281477e-05,
2274
- "loss": 0.9553,
2275
  "step": 34100
2276
  },
2277
  {
2278
  "epoch": 9.64,
2279
- "learning_rate": 1.2272411984519708e-05,
2280
- "loss": 0.9575,
2281
  "step": 34200
2282
  },
2283
  {
2284
  "epoch": 9.67,
2285
- "learning_rate": 1.1945199386276041e-05,
2286
- "loss": 0.9545,
2287
  "step": 34300
2288
  },
2289
  {
2290
  "epoch": 9.7,
2291
- "learning_rate": 1.1621070414340846e-05,
2292
- "loss": 0.9406,
2293
  "step": 34400
2294
  },
2295
  {
2296
  "epoch": 9.72,
2297
- "learning_rate": 1.1300101337911568e-05,
2298
- "loss": 0.9689,
2299
  "step": 34500
2300
  },
2301
  {
2302
  "epoch": 9.75,
2303
- "learning_rate": 1.0982367682646119e-05,
2304
- "loss": 0.9576,
2305
  "step": 34600
2306
  },
2307
  {
2308
  "epoch": 9.78,
2309
- "learning_rate": 1.0667944212891339e-05,
2310
- "loss": 0.9547,
2311
  "step": 34700
2312
  },
2313
  {
2314
  "epoch": 9.81,
2315
- "learning_rate": 1.0356904914090589e-05,
2316
- "loss": 0.9619,
2317
  "step": 34800
2318
  },
2319
  {
2320
  "epoch": 9.84,
2321
- "learning_rate": 1.0049322975374548e-05,
2322
- "loss": 0.9578,
2323
  "step": 34900
2324
  },
2325
  {
2326
  "epoch": 9.86,
2327
- "learning_rate": 9.745270772339474e-06,
2328
- "loss": 0.9509,
2329
  "step": 35000
2330
  },
2331
  {
2332
  "epoch": 9.89,
2333
- "learning_rate": 9.447806300054016e-06,
2334
- "loss": 0.9593,
2335
  "step": 35100
2336
  },
2337
  {
2338
  "epoch": 9.92,
2339
- "learning_rate": 9.15099028916227e-06,
2340
- "loss": 0.9576,
2341
  "step": 35200
2342
  },
2343
  {
2344
  "epoch": 9.95,
2345
- "learning_rate": 8.857915396206473e-06,
2346
- "loss": 0.9653,
2347
  "step": 35300
2348
  },
2349
  {
2350
  "epoch": 9.98,
2351
- "learning_rate": 8.56865058320117e-06,
2352
- "loss": 0.958,
2353
  "step": 35400
2354
  },
2355
  {
2356
  "epoch": 10.0,
2357
  "step": 35480,
2358
  "total_flos": 2.3762843604025344e+17,
2359
- "train_loss": 1.5038130920392954,
2360
- "train_runtime": 13804.3068,
2361
- "train_samples_per_second": 41.118,
2362
- "train_steps_per_second": 2.57
2363
  }
2364
  ],
2365
  "max_steps": 35480,
 
1
  {
2
+ "best_metric": 1.9633301496505737,
3
+ "best_model_checkpoint": "output/checkpoint-6000",
4
  "epoch": 10.0,
5
  "global_step": 35480,
6
  "is_hyper_param_search": false,
 
9
  "log_history": [
10
  {
11
  "epoch": 0.03,
12
+ "learning_rate": 1.1231131887499657e-07,
13
+ "loss": 2.472,
14
  "step": 100
15
  },
16
  {
17
  "epoch": 0.06,
18
+ "learning_rate": 2.5441897566929925e-05,
19
+ "loss": 2.3979,
20
  "step": 200
21
  },
22
  {
23
  "epoch": 0.08,
24
+ "learning_rate": 1.0307801958256833e-07,
25
+ "loss": 2.3356,
26
  "step": 300
27
  },
28
  {
29
  "epoch": 0.11,
30
+ "learning_rate": 4.2830331047183876e-05,
31
+ "loss": 2.3626,
32
  "step": 400
33
  },
34
  {
35
  "epoch": 0.14,
36
+ "learning_rate": 2.5245954827173344e-05,
37
+ "loss": 2.3347,
38
  "step": 500
39
  },
40
  {
41
  "epoch": 0.17,
42
+ "learning_rate": 7.54679092701715e-06,
43
+ "loss": 2.3057,
44
  "step": 600
45
  },
46
  {
47
  "epoch": 0.2,
48
+ "learning_rate": 1.007695167624979e-07,
49
+ "loss": 2.264,
50
  "step": 700
51
  },
52
  {
53
  "epoch": 0.23,
54
+ "learning_rate": 4.813811122230701e-05,
55
+ "loss": 2.2884,
56
  "step": 800
57
  },
58
  {
59
  "epoch": 0.25,
60
+ "learning_rate": 4.276145918519949e-05,
61
+ "loss": 2.3137,
62
  "step": 900
63
  },
64
  {
65
  "epoch": 0.28,
66
+ "learning_rate": 3.468839804192268e-05,
67
+ "loss": 2.3233,
68
  "step": 1000
69
  },
70
  {
71
  "epoch": 0.31,
72
+ "learning_rate": 2.514797816905896e-05,
73
+ "loss": 2.2754,
74
  "step": 1100
75
  },
76
  {
77
  "epoch": 0.34,
78
+ "learning_rate": 1.559264200813033e-05,
79
+ "loss": 2.2508,
80
  "step": 1200
81
  },
82
  {
83
  "epoch": 0.37,
84
+ "learning_rate": 7.54679092701715e-06,
85
+ "loss": 2.2591,
86
  "step": 1300
87
  },
88
  {
89
  "epoch": 0.39,
90
+ "learning_rate": 2.0749052704813964e-06,
91
+ "loss": 2.231,
92
  "step": 1400
93
  },
94
  {
95
  "epoch": 0.42,
96
+ "learning_rate": 1.007695167624979e-07,
97
+ "loss": 2.2262,
98
  "step": 1500
99
  },
100
  {
101
  "epoch": 0.45,
102
+ "learning_rate": 4.953951865520701e-05,
103
+ "loss": 2.2988,
104
  "step": 1600
105
  },
106
  {
107
  "epoch": 0.48,
108
+ "learning_rate": 4.813811122230701e-05,
109
+ "loss": 2.2338,
110
  "step": 1700
111
  },
112
  {
113
  "epoch": 0.51,
114
+ "learning_rate": 4.5849440623095696e-05,
115
+ "loss": 2.2921,
116
  "step": 1800
117
  },
118
  {
119
  "epoch": 0.54,
120
+ "learning_rate": 4.276145918519949e-05,
121
+ "loss": 2.2703,
122
  "step": 1900
123
  },
124
  {
125
  "epoch": 0.56,
126
+ "learning_rate": 3.8992836303516824e-05,
127
+ "loss": 2.2635,
128
  "step": 2000
129
  },
130
  {
131
  "epoch": 0.56,
132
  "eval_gen_len": 20.0,
133
+ "eval_loss": 2.061249256134033,
134
+ "eval_rouge1": 21.7272,
135
+ "eval_rouge2": 5.8776,
136
+ "eval_rougeL": 21.533,
137
+ "eval_rougeLsum": 21.5147,
138
+ "eval_runtime": 165.5553,
139
+ "eval_samples_per_second": 42.33,
140
+ "eval_steps_per_second": 2.646,
141
  "step": 2000
142
  },
143
  {
144
  "epoch": 0.59,
145
+ "learning_rate": 3.468839804192268e-05,
146
+ "loss": 2.2318,
147
  "step": 2100
148
  },
149
  {
150
  "epoch": 0.62,
151
+ "learning_rate": 3.001356154885334e-05,
152
+ "loss": 2.1998,
153
  "step": 2200
154
  },
155
  {
156
  "epoch": 0.65,
157
+ "learning_rate": 2.514797816905896e-05,
158
+ "loss": 2.2198,
159
  "step": 2300
160
  },
161
  {
162
  "epoch": 0.68,
163
+ "learning_rate": 2.027862954317443e-05,
164
+ "loss": 2.1908,
165
  "step": 2400
166
  },
167
  {
168
  "epoch": 0.7,
169
+ "learning_rate": 1.559264200813033e-05,
170
+ "loss": 2.2039,
171
  "step": 2500
172
  },
173
  {
174
  "epoch": 0.73,
175
+ "learning_rate": 1.1270095436966314e-05,
176
+ "loss": 2.1967,
177
  "step": 2600
178
  },
179
  {
180
  "epoch": 0.76,
181
+ "learning_rate": 7.477102870300167e-06,
182
+ "loss": 2.1736,
183
  "step": 2700
184
  },
185
  {
186
  "epoch": 0.79,
187
+ "learning_rate": 4.359426885334149e-06,
188
+ "loss": 2.1954,
189
  "step": 2800
190
  },
191
  {
192
  "epoch": 0.82,
193
+ "learning_rate": 2.0368780217576533e-06,
194
+ "loss": 2.1828,
195
  "step": 2900
196
  },
197
  {
198
  "epoch": 0.85,
199
+ "learning_rate": 5.987105298975171e-07,
200
+ "loss": 2.1688,
201
  "step": 3000
202
  },
203
  {
204
  "epoch": 0.87,
205
+ "learning_rate": 1.0019237993230962e-07,
206
+ "loss": 2.1516,
207
  "step": 3100
208
  },
209
  {
210
  "epoch": 0.9,
211
+ "learning_rate": 4.9886953900836016e-05,
212
+ "loss": 2.2263,
213
  "step": 3200
214
  },
215
  {
216
  "epoch": 0.93,
217
+ "learning_rate": 4.953482257211965e-05,
218
+ "loss": 2.1939,
219
  "step": 3300
220
  },
221
  {
222
  "epoch": 0.96,
223
+ "learning_rate": 4.8946889017268244e-05,
224
+ "loss": 2.2039,
225
  "step": 3400
226
  },
227
  {
228
  "epoch": 0.99,
229
+ "learning_rate": 4.8128815357812196e-05,
230
+ "loss": 2.1897,
231
  "step": 3500
232
  },
233
  {
234
  "epoch": 1.01,
235
+ "learning_rate": 4.708848009029661e-05,
236
+ "loss": 2.1036,
237
  "step": 3600
238
  },
239
  {
240
  "epoch": 1.04,
241
+ "learning_rate": 4.583590221205278e-05,
242
+ "loss": 2.0092,
243
  "step": 3700
244
  },
245
  {
246
  "epoch": 1.07,
247
+ "learning_rate": 4.43831447327769e-05,
248
+ "loss": 2.0428,
249
  "step": 3800
250
  },
251
  {
252
  "epoch": 1.1,
253
+ "learning_rate": 4.2744198501152435e-05,
254
+ "loss": 2.0471,
255
  "step": 3900
256
  },
257
  {
258
  "epoch": 1.13,
259
+ "learning_rate": 4.093484746532906e-05,
260
+ "loss": 2.0365,
261
  "step": 4000
262
  },
263
  {
264
  "epoch": 1.13,
265
  "eval_gen_len": 20.0,
266
+ "eval_loss": 2.034134864807129,
267
+ "eval_rouge1": 21.7168,
268
+ "eval_rouge2": 5.7378,
269
+ "eval_rougeL": 21.5563,
270
+ "eval_rougeLsum": 21.552,
271
+ "eval_runtime": 164.9689,
272
+ "eval_samples_per_second": 42.481,
273
+ "eval_steps_per_second": 2.655,
274
  "step": 4000
275
  },
276
  {
277
  "epoch": 1.16,
278
+ "learning_rate": 3.897251666487364e-05,
279
+ "loss": 2.0227,
280
  "step": 4100
281
  },
282
  {
283
  "epoch": 1.18,
284
+ "learning_rate": 3.68761044181137e-05,
285
+ "loss": 2.0152,
286
  "step": 4200
287
  },
288
  {
289
  "epoch": 1.21,
290
+ "learning_rate": 3.4665800321001055e-05,
291
+ "loss": 2.0071,
292
  "step": 4300
293
  },
294
  {
295
  "epoch": 1.24,
296
+ "learning_rate": 3.2362890810266e-05,
297
+ "loss": 2.0391,
298
  "step": 4400
299
  },
300
  {
301
  "epoch": 1.27,
302
+ "learning_rate": 2.9989554163395335e-05,
303
+ "loss": 2.0197,
304
  "step": 4500
305
  },
306
  {
307
  "epoch": 1.3,
308
+ "learning_rate": 2.7568646909696192e-05,
309
+ "loss": 1.9765,
310
  "step": 4600
311
  },
312
  {
313
  "epoch": 1.32,
314
+ "learning_rate": 2.512348370942422e-05,
315
+ "loss": 1.9761,
316
  "step": 4700
317
  },
318
  {
319
  "epoch": 1.35,
320
+ "learning_rate": 2.2677612820860213e-05,
321
+ "loss": 1.992,
322
  "step": 4800
323
  },
324
  {
325
  "epoch": 1.38,
326
+ "learning_rate": 2.0254589317710082e-05,
327
+ "loss": 2.0143,
328
  "step": 4900
329
  },
330
  {
331
  "epoch": 1.41,
332
+ "learning_rate": 1.7877748240868528e-05,
333
+ "loss": 1.9675,
334
  "step": 5000
335
  },
336
  {
337
  "epoch": 1.44,
338
+ "learning_rate": 1.556997986921904e-05,
339
+ "loss": 2.0144,
340
  "step": 5100
341
  },
342
  {
343
  "epoch": 1.47,
344
+ "learning_rate": 1.3353509273735138e-05,
345
+ "loss": 2.0131,
346
  "step": 5200
347
  },
348
  {
349
  "epoch": 1.49,
350
+ "learning_rate": 1.1249682277897386e-05,
351
+ "loss": 1.9536,
352
  "step": 5300
353
  },
354
  {
355
  "epoch": 1.52,
356
+ "learning_rate": 9.278759885744533e-06,
357
+ "loss": 1.989,
358
  "step": 5400
359
  },
360
  {
361
  "epoch": 1.55,
362
+ "learning_rate": 7.45972315732879e-06,
363
+ "loss": 1.9796,
364
  "step": 5500
365
  },
366
  {
367
  "epoch": 1.58,
368
+ "learning_rate": 5.810090410731417e-06,
369
+ "loss": 1.9478,
370
  "step": 5600
371
  },
372
  {
373
  "epoch": 1.61,
374
+ "learning_rate": 4.345748511083142e-06,
375
+ "loss": 1.9681,
376
  "step": 5700
377
  },
378
  {
379
  "epoch": 1.63,
380
+ "learning_rate": 3.08079987136829e-06,
381
+ "loss": 1.9642,
382
  "step": 5800
383
  },
384
  {
385
  "epoch": 1.66,
386
+ "learning_rate": 2.027426638478571e-06,
387
+ "loss": 1.983,
388
  "step": 5900
389
  },
390
  {
391
  "epoch": 1.69,
392
+ "learning_rate": 1.1957733724791018e-06,
393
+ "loss": 1.9847,
394
  "step": 6000
395
  },
396
  {
397
  "epoch": 1.69,
398
  "eval_gen_len": 20.0,
399
+ "eval_loss": 1.9633301496505737,
400
+ "eval_rouge1": 22.6551,
401
+ "eval_rouge2": 6.1328,
402
+ "eval_rougeL": 22.457,
403
+ "eval_rougeLsum": 22.4619,
404
+ "eval_runtime": 165.0565,
405
+ "eval_samples_per_second": 42.458,
406
+ "eval_steps_per_second": 2.654,
407
  "step": 6000
408
  },
409
  {
410
  "epoch": 1.72,
411
+ "learning_rate": 5.938493489493758e-07,
412
+ "loss": 1.9596,
413
  "step": 6100
414
  },
415
  {
416
  "epoch": 1.75,
417
+ "learning_rate": 2.27451425281131e-07,
418
+ "loss": 1.9612,
419
  "step": 6200
420
  },
421
  {
422
  "epoch": 1.78,
423
+ "learning_rate": 1.0010821377276459e-07,
424
+ "loss": 1.9514,
425
  "step": 6300
426
  },
427
  {
428
  "epoch": 1.8,
429
+ "learning_rate": 4.99717224507643e-05,
430
+ "loss": 1.986,
431
  "step": 6400
432
  },
433
  {
434
  "epoch": 1.83,
435
+ "learning_rate": 4.988343334271749e-05,
436
+ "loss": 2.0322,
437
  "step": 6500
438
  },
439
  {
440
  "epoch": 1.86,
441
+ "learning_rate": 4.9735318318596204e-05,
442
+ "loss": 2.0483,
443
  "step": 6600
444
  },
445
  {
446
  "epoch": 1.89,
447
+ "learning_rate": 4.9527734200466905e-05,
448
+ "loss": 2.0378,
449
  "step": 6700
450
  },
451
  {
452
  "epoch": 1.92,
453
+ "learning_rate": 4.926118107665238e-05,
454
+ "loss": 2.012,
455
  "step": 6800
456
  },
457
  {
458
  "epoch": 1.94,
459
+ "learning_rate": 4.893983616044946e-05,
460
+ "loss": 2.0259,
461
  "step": 6900
462
  },
463
  {
464
  "epoch": 1.97,
465
+ "learning_rate": 4.8557983100045764e-05,
466
+ "loss": 2.0511,
467
  "step": 7000
468
  },
469
  {
470
  "epoch": 2.0,
471
+ "learning_rate": 4.811949724929791e-05,
472
+ "loss": 2.0332,
473
  "step": 7100
474
  },
475
  {
476
  "epoch": 2.03,
477
+ "learning_rate": 4.762543495902719e-05,
478
+ "loss": 1.8404,
479
  "step": 7200
480
  },
481
  {
482
  "epoch": 2.06,
483
+ "learning_rate": 4.707698646856561e-05,
484
+ "loss": 1.8565,
485
  "step": 7300
486
  },
487
  {
488
  "epoch": 2.09,
489
+ "learning_rate": 4.64754730383651e-05,
490
+ "loss": 1.8571,
491
  "step": 7400
492
  },
493
  {
494
  "epoch": 2.11,
495
+ "learning_rate": 4.582234376696538e-05,
496
+ "loss": 1.8639,
497
  "step": 7500
498
  },
499
  {
500
  "epoch": 2.14,
501
+ "learning_rate": 4.511917209998861e-05,
502
+ "loss": 1.9017,
503
  "step": 7600
504
  },
505
  {
506
  "epoch": 2.17,
507
+ "learning_rate": 4.43676520395711e-05,
508
+ "loss": 1.8625,
509
  "step": 7700
510
  },
511
  {
512
  "epoch": 2.2,
513
+ "learning_rate": 4.3569594063363784e-05,
514
+ "loss": 1.8604,
515
  "step": 7800
516
  },
517
  {
518
  "epoch": 2.23,
519
+ "learning_rate": 4.2726920762932964e-05,
520
+ "loss": 1.8748,
521
  "step": 7900
522
  },
523
  {
524
  "epoch": 2.25,
525
+ "learning_rate": 4.1841662212068846e-05,
526
+ "loss": 1.861,
527
  "step": 8000
528
  },
529
  {
530
  "epoch": 2.25,
531
  "eval_gen_len": 20.0,
532
+ "eval_loss": 2.010927438735962,
533
+ "eval_rouge1": 21.5977,
534
+ "eval_rouge2": 5.9659,
535
+ "eval_rougeL": 21.4021,
536
+ "eval_rougeLsum": 21.3731,
537
+ "eval_runtime": 165.7344,
538
+ "eval_samples_per_second": 42.285,
539
+ "eval_steps_per_second": 2.643,
540
  "step": 8000
541
  },
542
  {
543
  "epoch": 2.28,
544
+ "learning_rate": 4.091595107615995e-05,
545
+ "loss": 1.8498,
546
  "step": 8100
547
  },
548
  {
549
  "epoch": 2.31,
550
+ "learning_rate": 3.99520174744154e-05,
551
+ "loss": 1.8755,
552
  "step": 8200
553
  },
554
  {
555
  "epoch": 2.34,
556
+ "learning_rate": 3.895218360731241e-05,
557
+ "loss": 1.8772,
558
  "step": 8300
559
  },
560
  {
561
  "epoch": 2.37,
562
+ "learning_rate": 3.791885816221194e-05,
563
+ "loss": 1.8761,
564
  "step": 8400
565
  },
566
  {
567
  "epoch": 2.4,
568
+ "learning_rate": 3.6854530510619846e-05,
569
+ "loss": 1.879,
570
  "step": 8500
571
  },
572
  {
573
  "epoch": 2.42,
574
+ "learning_rate": 3.576176471107291e-05,
575
+ "loss": 1.8655,
576
  "step": 8600
577
  },
578
  {
579
  "epoch": 2.45,
580
+ "learning_rate": 3.4643193332097226e-05,
581
+ "loss": 1.8636,
582
  "step": 8700
583
  },
584
  {
585
  "epoch": 2.48,
586
+ "learning_rate": 3.350151111012003e-05,
587
+ "loss": 1.896,
588
  "step": 8800
589
  },
590
  {
591
  "epoch": 2.51,
592
+ "learning_rate": 3.23394684576136e-05,
593
+ "loss": 1.8561,
594
  "step": 8900
595
  },
596
  {
597
  "epoch": 2.54,
598
+ "learning_rate": 3.115986483711075e-05,
599
+ "loss": 1.8555,
600
  "step": 9000
601
  },
602
  {
603
  "epoch": 2.56,
604
+ "learning_rate": 2.9965542017054346e-05,
605
+ "loss": 1.8629,
606
  "step": 9100
607
  },
608
  {
609
  "epoch": 2.59,
610
+ "learning_rate": 2.8759377225728234e-05,
611
+ "loss": 1.8816,
612
  "step": 9200
613
  },
614
  {
615
  "epoch": 2.62,
616
+ "learning_rate": 2.7544276219762262e-05,
617
+ "loss": 1.8372,
618
  "step": 9300
619
  },
620
  {
621
  "epoch": 2.65,
622
+ "learning_rate": 2.632316628391009e-05,
623
+ "loss": 1.8493,
624
  "step": 9400
625
  },
626
  {
627
  "epoch": 2.68,
628
+ "learning_rate": 2.5098989178963792e-05,
629
+ "loss": 1.8573,
630
  "step": 9500
631
  },
632
  {
633
  "epoch": 2.71,
634
+ "learning_rate": 2.3874694054794416e-05,
635
+ "loss": 1.8587,
636
  "step": 9600
637
  },
638
  {
639
  "epoch": 2.73,
640
+ "learning_rate": 2.2653230345591686e-05,
641
+ "loss": 1.8501,
642
  "step": 9700
643
  },
644
  {
645
  "epoch": 2.76,
646
+ "learning_rate": 2.143754066441865e-05,
647
+ "loss": 1.8655,
648
  "step": 9800
649
  },
650
  {
651
  "epoch": 2.79,
652
+ "learning_rate": 2.02305537141991e-05,
653
+ "loss": 1.8599,
654
  "step": 9900
655
  },
656
  {
657
  "epoch": 2.82,
658
+ "learning_rate": 1.9035177232215682e-05,
659
+ "loss": 1.8634,
660
  "step": 10000
661
  },
662
  {
663
  "epoch": 2.82,
664
  "eval_gen_len": 20.0,
665
+ "eval_loss": 1.9686568975448608,
666
+ "eval_rouge1": 22.5506,
667
+ "eval_rouge2": 6.0881,
668
+ "eval_rougeL": 22.4021,
669
+ "eval_rougeLsum": 22.3998,
670
+ "eval_runtime": 164.864,
671
+ "eval_samples_per_second": 42.508,
672
+ "eval_steps_per_second": 2.657,
673
  "step": 10000
674
  },
675
  {
676
  "epoch": 2.85,
677
+ "learning_rate": 1.7854290985116118e-05,
678
+ "loss": 1.8382,
679
  "step": 10100
680
  },
681
  {
682
  "epoch": 2.87,
683
+ "learning_rate": 1.669073983130307e-05,
684
+ "loss": 1.8459,
685
  "step": 10200
686
  },
687
  {
688
  "epoch": 2.9,
689
+ "learning_rate": 1.5547326867421132e-05,
690
+ "loss": 1.8562,
691
  "step": 10300
692
  },
693
  {
694
  "epoch": 2.93,
695
+ "learning_rate": 1.4426806675451438e-05,
696
+ "loss": 1.8404,
697
  "step": 10400
698
  },
699
  {
700
  "epoch": 2.96,
701
+ "learning_rate": 1.3331878686682424e-05,
702
+ "loss": 1.8492,
703
  "step": 10500
704
  },
705
  {
706
  "epoch": 2.99,
707
+ "learning_rate": 1.2265180678543392e-05,
708
+ "loss": 1.8172,
709
  "step": 10600
710
  },
711
  {
712
  "epoch": 3.02,
713
+ "learning_rate": 1.1229282419967673e-05,
714
+ "loss": 1.7255,
715
  "step": 10700
716
  },
717
  {
718
  "epoch": 3.04,
719
+ "learning_rate": 1.022667948059414e-05,
720
+ "loss": 1.6157,
721
  "step": 10800
722
  },
723
  {
724
  "epoch": 3.07,
725
+ "learning_rate": 9.25978721872139e-06,
726
+ "loss": 1.6024,
727
  "step": 10900
728
  },
729
  {
730
  "epoch": 3.1,
731
+ "learning_rate": 8.330934962498027e-06,
732
+ "loss": 1.5984,
733
  "step": 11000
734
  },
735
  {
736
  "epoch": 3.13,
737
+ "learning_rate": 7.442360398367151e-06,
738
+ "loss": 1.6075,
739
  "step": 11100
740
  },
741
  {
742
  "epoch": 3.16,
743
+ "learning_rate": 6.596204180283686e-06,
744
+ "loss": 1.5859,
745
  "step": 11200
746
  },
747
  {
748
  "epoch": 3.18,
749
+ "learning_rate": 5.79450477269154e-06,
750
+ "loss": 1.5973,
751
  "step": 11300
752
  },
753
  {
754
  "epoch": 3.21,
755
+ "learning_rate": 5.039193539684164e-06,
756
+ "loss": 1.6132,
757
  "step": 11400
758
  },
759
  {
760
  "epoch": 3.24,
761
+ "learning_rate": 4.332090092179324e-06,
762
+ "loss": 1.6221,
763
  "step": 11500
764
  },
765
  {
766
  "epoch": 3.27,
767
+ "learning_rate": 3.6748979043170137e-06,
768
+ "loss": 1.6258,
769
  "step": 11600
770
  },
771
  {
772
  "epoch": 3.3,
773
+ "learning_rate": 3.0692002096410223e-06,
774
+ "loss": 1.6149,
775
  "step": 11700
776
  },
777
  {
778
  "epoch": 3.33,
779
+ "learning_rate": 2.516456186950684e-06,
780
+ "loss": 1.624,
781
  "step": 11800
782
  },
783
  {
784
  "epoch": 3.35,
785
+ "learning_rate": 2.0179974450113125e-06,
786
+ "loss": 1.6138,
787
  "step": 11900
788
  },
789
  {
790
  "epoch": 3.38,
791
+ "learning_rate": 1.575024814592075e-06,
792
+ "loss": 1.6036,
793
  "step": 12000
794
  },
795
  {
796
  "epoch": 3.38,
797
  "eval_gen_len": 20.0,
798
+ "eval_loss": 1.983000636100769,
799
+ "eval_rouge1": 22.1876,
800
+ "eval_rouge2": 5.6559,
801
+ "eval_rougeL": 21.9989,
802
+ "eval_rougeLsum": 21.9753,
803
+ "eval_runtime": 166.2195,
804
+ "eval_samples_per_second": 42.161,
805
+ "eval_steps_per_second": 2.635,
806
  "step": 12000
807
  },
808
  {
809
  "epoch": 3.41,
810
+ "learning_rate": 1.188605455559466e-06,
811
+ "loss": 1.5965,
812
  "step": 12100
813
  },
814
  {
815
  "epoch": 3.44,
816
+ "learning_rate": 8.596702859957324e-07,
817
+ "loss": 1.5971,
818
  "step": 12200
819
  },
820
  {
821
  "epoch": 3.47,
822
+ "learning_rate": 5.890117395356814e-07,
823
+ "loss": 1.6098,
824
  "step": 12300
825
  },
826
  {
827
  "epoch": 3.49,
828
+ "learning_rate": 3.772818563246123e-07,
829
+ "loss": 1.6057,
830
  "step": 12400
831
  },
832
  {
833
  "epoch": 3.52,
834
+ "learning_rate": 2.2499071219653263e-07,
835
+ "loss": 1.611,
836
  "step": 12500
837
  },
838
  {
839
  "epoch": 3.55,
840
+ "learning_rate": 1.3250518985677592e-07,
841
+ "loss": 1.6154,
842
  "step": 12600
843
  },
844
  {
845
  "epoch": 3.58,
846
+ "learning_rate": 1.0004809502943347e-07,
847
+ "loss": 1.6232,
848
  "step": 12700
849
  },
850
  {
851
  "epoch": 3.61,
852
+ "learning_rate": 4.999307463393912e-05,
853
+ "loss": 1.6383,
854
  "step": 12800
855
  },
856
  {
857
  "epoch": 3.64,
858
+ "learning_rate": 4.997113651616064e-05,
859
+ "loss": 1.6506,
860
  "step": 12900
861
  },
862
  {
863
  "epoch": 3.66,
864
+ "learning_rate": 4.993418683760613e-05,
865
+ "loss": 1.6974,
866
  "step": 13000
867
  },
868
  {
869
  "epoch": 3.69,
870
+ "learning_rate": 4.988224785538034e-05,
871
+ "loss": 1.7072,
872
  "step": 13100
873
  },
874
  {
875
  "epoch": 3.72,
876
+ "learning_rate": 4.981535085558401e-05,
877
+ "loss": 1.6934,
878
  "step": 13200
879
  },
880
  {
881
  "epoch": 3.75,
882
+ "learning_rate": 4.97335361344684e-05,
883
+ "loss": 1.6793,
884
  "step": 13300
885
  },
886
  {
887
  "epoch": 3.78,
888
+ "learning_rate": 4.963685297416225e-05,
889
+ "loss": 1.7101,
890
  "step": 13400
891
  },
892
  {
893
  "epoch": 3.8,
894
+ "learning_rate": 4.952535961298611e-05,
895
+ "loss": 1.75,
896
  "step": 13500
897
  },
898
  {
899
  "epoch": 3.83,
900
+ "learning_rate": 4.939912321037175e-05,
901
+ "loss": 1.7315,
902
  "step": 13600
903
  },
904
  {
905
  "epoch": 3.86,
906
+ "learning_rate": 4.9258219806407825e-05,
907
+ "loss": 1.7341,
908
  "step": 13700
909
  },
910
  {
911
  "epoch": 3.89,
912
+ "learning_rate": 4.910273427603616e-05,
913
+ "loss": 1.7312,
914
  "step": 13800
915
  },
916
  {
917
  "epoch": 3.92,
918
+ "learning_rate": 4.893276027792627e-05,
919
+ "loss": 1.7162,
920
  "step": 13900
921
  },
922
  {
923
  "epoch": 3.95,
924
+ "learning_rate": 4.874840019805889e-05,
925
+ "loss": 1.7558,
926
  "step": 14000
927
  },
928
  {
929
  "epoch": 3.95,
930
  "eval_gen_len": 20.0,
931
+ "eval_loss": 2.0177605152130127,
932
+ "eval_rouge1": 21.6845,
933
+ "eval_rouge2": 5.7382,
934
+ "eval_rougeL": 21.4848,
935
+ "eval_rougeLsum": 21.4995,
936
+ "eval_runtime": 165.4038,
937
+ "eval_samples_per_second": 42.369,
938
+ "eval_steps_per_second": 2.648,
939
  "step": 14000
940
  },
941
  {
942
  "epoch": 3.97,
943
+ "learning_rate": 4.854976508805247e-05,
944
+ "loss": 1.767,
945
  "step": 14100
946
  },
947
  {
948
  "epoch": 4.0,
949
+ "learning_rate": 4.833697459826981e-05,
950
+ "loss": 1.7569,
951
  "step": 14200
952
  },
953
  {
954
  "epoch": 4.03,
955
+ "learning_rate": 4.811015690574522e-05,
956
+ "loss": 1.5871,
957
  "step": 14300
958
  },
959
  {
960
  "epoch": 4.06,
961
+ "learning_rate": 4.7869448636975414e-05,
962
+ "loss": 1.6034,
963
  "step": 14400
964
  },
965
  {
966
  "epoch": 4.09,
967
+ "learning_rate": 4.761499478562084e-05,
968
+ "loss": 1.6308,
969
  "step": 14500
970
  },
971
  {
972
  "epoch": 4.11,
973
+ "learning_rate": 4.73469486251669e-05,
974
+ "loss": 1.5885,
975
  "step": 14600
976
  },
977
  {
978
  "epoch": 4.14,
979
+ "learning_rate": 4.7065471616597775e-05,
980
+ "loss": 1.612,
981
  "step": 14700
982
  },
983
  {
984
  "epoch": 4.17,
985
+ "learning_rate": 4.677073331113833e-05,
986
+ "loss": 1.6088,
987
  "step": 14800
988
  },
989
  {
990
  "epoch": 4.2,
991
+ "learning_rate": 4.646291124812277e-05,
992
+ "loss": 1.6277,
993
  "step": 14900
994
  },
995
  {
996
  "epoch": 4.23,
997
+ "learning_rate": 4.614219084805167e-05,
998
+ "loss": 1.6087,
999
  "step": 15000
1000
  },
1001
  {
1002
  "epoch": 4.26,
1003
+ "learning_rate": 4.580876530090152e-05,
1004
+ "loss": 1.6462,
1005
  "step": 15100
1006
  },
1007
  {
1008
  "epoch": 4.28,
1009
+ "learning_rate": 4.546283544975434e-05,
1010
+ "loss": 1.6423,
1011
  "step": 15200
1012
  },
1013
  {
1014
  "epoch": 4.31,
1015
+ "learning_rate": 4.51046096698173e-05,
1016
+ "loss": 1.642,
1017
  "step": 15300
1018
  },
1019
  {
1020
  "epoch": 4.34,
1021
+ "learning_rate": 4.473430374290533e-05,
1022
+ "loss": 1.6263,
1023
  "step": 15400
1024
  },
1025
  {
1026
  "epoch": 4.37,
1027
+ "learning_rate": 4.435214072746215e-05,
1028
+ "loss": 1.6582,
1029
  "step": 15500
1030
  },
1031
  {
1032
  "epoch": 4.4,
1033
+ "learning_rate": 4.3958350824198286e-05,
1034
+ "loss": 1.6671,
1035
  "step": 15600
1036
  },
1037
  {
1038
  "epoch": 4.43,
1039
+ "learning_rate": 4.355317123742669e-05,
1040
+ "loss": 1.6473,
1041
  "step": 15700
1042
  },
1043
  {
1044
  "epoch": 4.45,
1045
+ "learning_rate": 4.314106363710374e-05,
1046
+ "loss": 1.6576,
1047
  "step": 15800
1048
  },
1049
  {
1050
  "epoch": 4.48,
1051
+ "learning_rate": 4.271395127748479e-05,
1052
+ "loss": 1.633,
1053
  "step": 15900
1054
  },
1055
  {
1056
  "epoch": 4.51,
1057
+ "learning_rate": 4.227619881411398e-05,
1058
+ "loss": 1.64,
1059
  "step": 16000
1060
  },
1061
  {
1062
  "epoch": 4.51,
1063
  "eval_gen_len": 20.0,
1064
+ "eval_loss": 2.032703161239624,
1065
+ "eval_rouge1": 22.2681,
1066
+ "eval_rouge2": 5.8023,
1067
+ "eval_rougeL": 22.0116,
1068
+ "eval_rougeLsum": 21.9997,
1069
+ "eval_runtime": 165.398,
1070
+ "eval_samples_per_second": 42.371,
1071
+ "eval_steps_per_second": 2.648,
1072
  "step": 16000
1073
  },
1074
  {
1075
  "epoch": 4.54,
1076
+ "learning_rate": 4.18280699327066e-05,
1077
+ "loss": 1.651,
1078
  "step": 16100
1079
  },
1080
  {
1081
  "epoch": 4.57,
1082
+ "learning_rate": 4.1369834569344205e-05,
1083
+ "loss": 1.6491,
1084
  "step": 16200
1085
  },
1086
  {
1087
  "epoch": 4.59,
1088
+ "learning_rate": 4.090176874787515e-05,
1089
+ "loss": 1.6302,
1090
  "step": 16300
1091
  },
1092
  {
1093
  "epoch": 4.62,
1094
+ "learning_rate": 4.042415441364819e-05,
1095
+ "loss": 1.6376,
1096
  "step": 16400
1097
  },
1098
  {
1099
  "epoch": 4.65,
1100
+ "learning_rate": 3.993727926367911e-05,
1101
+ "loss": 1.6322,
1102
  "step": 16500
1103
  },
1104
  {
1105
  "epoch": 4.68,
1106
+ "learning_rate": 3.944143657335282e-05,
1107
+ "loss": 1.6651,
1108
  "step": 16600
1109
  },
1110
  {
1111
  "epoch": 4.71,
1112
+ "learning_rate": 3.8936925019765214e-05,
1113
+ "loss": 1.6308,
1114
  "step": 16700
1115
  },
1116
  {
1117
  "epoch": 4.74,
1118
+ "learning_rate": 3.842404850181127e-05,
1119
+ "loss": 1.6703,
1120
  "step": 16800
1121
  },
1122
  {
1123
  "epoch": 4.76,
1124
+ "learning_rate": 3.790311595712772e-05,
1125
+ "loss": 1.6816,
1126
  "step": 16900
1127
  },
1128
  {
1129
  "epoch": 4.79,
1130
+ "learning_rate": 3.737444117600056e-05,
1131
+ "loss": 1.6568,
1132
  "step": 17000
1133
  },
1134
  {
1135
  "epoch": 4.82,
1136
+ "learning_rate": 3.6838342612349524e-05,
1137
+ "loss": 1.6697,
1138
  "step": 17100
1139
  },
1140
  {
1141
  "epoch": 4.85,
1142
+ "learning_rate": 3.629514319190331e-05,
1143
+ "loss": 1.6352,
1144
  "step": 17200
1145
  },
1146
  {
1147
  "epoch": 4.88,
1148
+ "learning_rate": 3.57451701176813e-05,
1149
+ "loss": 1.6724,
1150
  "step": 17300
1151
  },
1152
  {
1153
  "epoch": 4.9,
1154
+ "learning_rate": 3.5188754672898564e-05,
1155
+ "loss": 1.6444,
1156
  "step": 17400
1157
  },
1158
  {
1159
  "epoch": 4.93,
1160
+ "learning_rate": 3.462623202141332e-05,
1161
+ "loss": 1.6568,
1162
  "step": 17500
1163
  },
1164
  {
1165
  "epoch": 4.96,
1166
+ "learning_rate": 3.4057941005836765e-05,
1167
+ "loss": 1.6626,
1168
  "step": 17600
1169
  },
1170
  {
1171
  "epoch": 4.99,
1172
+ "learning_rate": 3.34842239434269e-05,
1173
+ "loss": 1.6246,
1174
  "step": 17700
1175
  },
1176
  {
1177
  "epoch": 5.02,
1178
+ "learning_rate": 3.290542641988946e-05,
1179
+ "loss": 1.4957,
1180
  "step": 17800
1181
  },
1182
  {
1183
  "epoch": 5.05,
1184
+ "learning_rate": 3.232189708121e-05,
1185
+ "loss": 1.3941,
1186
  "step": 17900
1187
  },
1188
  {
1189
  "epoch": 5.07,
1190
+ "learning_rate": 3.173398742364255e-05,
1191
+ "loss": 1.4248,
1192
  "step": 18000
1193
  },
1194
  {
1195
  "epoch": 5.07,
1196
  "eval_gen_len": 20.0,
1197
+ "eval_loss": 2.0742380619049072,
1198
+ "eval_rouge1": 21.8569,
1199
+ "eval_rouge2": 5.5533,
1200
+ "eval_rougeL": 21.6308,
1201
+ "eval_rougeLsum": 21.638,
1202
+ "eval_runtime": 166.3338,
1203
+ "eval_samples_per_second": 42.132,
1204
+ "eval_steps_per_second": 2.633,
1205
  "step": 18000
1206
  },
1207
  {
1208
  "epoch": 5.1,
1209
+ "learning_rate": 3.114205158198149e-05,
1210
+ "loss": 1.4175,
1211
  "step": 18100
1212
  },
1213
  {
1214
  "epoch": 5.13,
1215
+ "learning_rate": 3.054644611624394e-05,
1216
+ "loss": 1.3888,
1217
  "step": 18200
1218
  },
1219
  {
1220
  "epoch": 5.16,
1221
+ "learning_rate": 2.9947529796891307e-05,
1222
+ "loss": 1.4086,
1223
  "step": 18300
1224
  },
1225
  {
1226
  "epoch": 5.19,
1227
+ "learning_rate": 2.9345663388719467e-05,
1228
+ "loss": 1.3948,
1229
  "step": 18400
1230
  },
1231
  {
1232
  "epoch": 5.21,
1233
+ "learning_rate": 2.8741209433547458e-05,
1234
+ "loss": 1.4149,
1235
  "step": 18500
1236
  },
1237
  {
1238
  "epoch": 5.24,
1239
+ "learning_rate": 2.8134532031835893e-05,
1240
+ "loss": 1.4184,
1241
  "step": 18600
1242
  },
1243
  {
1244
  "epoch": 5.27,
1245
+ "learning_rate": 2.7525996623366436e-05,
1246
+ "loss": 1.43,
1247
  "step": 18700
1248
  },
1249
  {
1250
  "epoch": 5.3,
1251
+ "learning_rate": 2.691596976711453e-05,
1252
+ "loss": 1.4435,
1253
  "step": 18800
1254
  },
1255
  {
1256
  "epoch": 5.33,
1257
+ "learning_rate": 2.630481892044803e-05,
1258
+ "loss": 1.4092,
1259
  "step": 18900
1260
  },
1261
  {
1262
  "epoch": 5.36,
1263
+ "learning_rate": 2.5692912217784543e-05,
1264
+ "loss": 1.4241,
1265
  "step": 19000
1266
  },
1267
  {
1268
  "epoch": 5.38,
1269
+ "learning_rate": 2.5080618248841106e-05,
1270
+ "loss": 1.414,
1271
  "step": 19100
1272
  },
1273
  {
1274
  "epoch": 5.41,
1275
+ "learning_rate": 2.4468305836609452e-05,
1276
+ "loss": 1.4265,
1277
  "step": 19200
1278
  },
1279
  {
1280
  "epoch": 5.44,
1281
+ "learning_rate": 2.3856343815190883e-05,
1282
+ "loss": 1.4061,
1283
  "step": 19300
1284
  },
1285
  {
1286
  "epoch": 5.47,
1287
+ "learning_rate": 2.324510080762437e-05,
1288
+ "loss": 1.3992,
1289
  "step": 19400
1290
  },
1291
  {
1292
  "epoch": 5.5,
1293
+ "learning_rate": 2.2634945003841808e-05,
1294
+ "loss": 1.4221,
1295
  "step": 19500
1296
  },
1297
  {
1298
  "epoch": 5.52,
1299
+ "learning_rate": 2.202624393888421e-05,
1300
+ "loss": 1.3943,
1301
  "step": 19600
1302
  },
1303
  {
1304
  "epoch": 5.55,
1305
+ "learning_rate": 2.1419364271512303e-05,
1306
+ "loss": 1.4266,
1307
  "step": 19700
1308
  },
1309
  {
1310
  "epoch": 5.58,
1311
+ "learning_rate": 2.0814671563345028e-05,
1312
+ "loss": 1.4466,
1313
  "step": 19800
1314
  },
1315
  {
1316
  "epoch": 5.61,
1317
+ "learning_rate": 2.021853765291574e-05,
1318
+ "loss": 1.4312,
1319
  "step": 19900
1320
  },
1321
  {
1322
  "epoch": 5.64,
1323
+ "learning_rate": 1.9619279130307914e-05,
1324
+ "loss": 1.4263,
1325
  "step": 20000
1326
  },
1327
  {
1328
  "epoch": 5.64,
1329
  "eval_gen_len": 20.0,
1330
+ "eval_loss": 2.0796058177948,
1331
+ "eval_rouge1": 22.037,
1332
+ "eval_rouge2": 5.454,
1333
+ "eval_rougeL": 21.7861,
1334
+ "eval_rougeLsum": 21.7976,
1335
+ "eval_runtime": 165.631,
1336
+ "eval_samples_per_second": 42.311,
1337
+ "eval_steps_per_second": 2.644,
1338
  "step": 20000
1339
  },
1340
  {
1341
  "epoch": 5.67,
1342
+ "learning_rate": 1.902329187088426e-05,
1343
+ "loss": 1.4114,
1344
  "step": 20100
1345
  },
1346
  {
1347
  "epoch": 5.69,
1348
+ "learning_rate": 1.8430934875084445e-05,
1349
+ "loss": 1.4068,
1350
  "step": 20200
1351
  },
1352
  {
1353
  "epoch": 5.72,
1354
+ "learning_rate": 1.7842564956613097e-05,
1355
+ "loss": 1.4212,
1356
  "step": 20300
1357
  },
1358
  {
1359
  "epoch": 5.75,
1360
+ "learning_rate": 1.7258536527508527e-05,
1361
+ "loss": 1.4179,
1362
  "step": 20400
1363
  },
1364
  {
1365
  "epoch": 5.78,
1366
+ "learning_rate": 1.6679201384658204e-05,
1367
+ "loss": 1.4263,
1368
  "step": 20500
1369
  },
1370
  {
1371
  "epoch": 5.81,
1372
+ "learning_rate": 1.6104908497889437e-05,
1373
+ "loss": 1.4062,
1374
  "step": 20600
1375
  },
1376
  {
1377
  "epoch": 5.83,
1378
+ "learning_rate": 1.5536003799763033e-05,
1379
+ "loss": 1.4168,
1380
  "step": 20700
1381
  },
1382
  {
1383
  "epoch": 5.86,
1384
+ "learning_rate": 1.497282997719642e-05,
1385
+ "loss": 1.4502,
1386
  "step": 20800
1387
  },
1388
  {
1389
  "epoch": 5.89,
1390
+ "learning_rate": 1.4415726265041886e-05,
1391
+ "loss": 1.42,
1392
  "step": 20900
1393
  },
1394
  {
1395
  "epoch": 5.92,
1396
+ "learning_rate": 1.3865028241744182e-05,
1397
+ "loss": 1.421,
1398
  "step": 21000
1399
  },
1400
  {
1401
  "epoch": 5.95,
1402
+ "learning_rate": 1.332106762720061e-05,
1403
+ "loss": 1.4272,
1404
  "step": 21100
1405
  },
1406
  {
1407
  "epoch": 5.98,
1408
+ "learning_rate": 1.2784172082945395e-05,
1409
+ "loss": 1.4115,
1410
  "step": 21200
1411
  },
1412
  {
1413
  "epoch": 6.0,
1414
+ "learning_rate": 1.2254665014778574e-05,
1415
+ "loss": 1.3844,
1416
  "step": 21300
1417
  },
1418
  {
1419
  "epoch": 6.03,
1420
+ "learning_rate": 1.1732865377958439e-05,
1421
+ "loss": 1.2502,
1422
  "step": 21400
1423
  },
1424
  {
1425
  "epoch": 6.06,
1426
+ "learning_rate": 1.1219087485074895e-05,
1427
+ "loss": 1.232,
1428
  "step": 21500
1429
  },
1430
  {
1431
  "epoch": 6.09,
1432
+ "learning_rate": 1.0713640816719211e-05,
1433
+ "loss": 1.2675,
1434
  "step": 21600
1435
  },
1436
  {
1437
  "epoch": 6.12,
1438
+ "learning_rate": 1.021682983506454e-05,
1439
+ "loss": 1.2537,
1440
  "step": 21700
1441
  },
1442
  {
1443
  "epoch": 6.14,
1444
+ "learning_rate": 9.728953800469257e-06,
1445
+ "loss": 1.2703,
1446
  "step": 21800
1447
  },
1448
  {
1449
  "epoch": 6.17,
1450
+ "learning_rate": 9.250306591213766e-06,
1451
+ "loss": 1.2496,
1452
  "step": 21900
1453
  },
1454
  {
1455
  "epoch": 6.2,
1456
+ "learning_rate": 8.781176526479135e-06,
1457
+ "loss": 1.2478,
1458
  "step": 22000
1459
  },
1460
  {
1461
  "epoch": 6.2,
1462
  "eval_gen_len": 20.0,
1463
+ "eval_loss": 2.1144540309906006,
1464
+ "eval_rouge1": 21.5358,
1465
+ "eval_rouge2": 5.3838,
1466
+ "eval_rougeL": 21.3299,
1467
+ "eval_rougeLsum": 21.3191,
1468
+ "eval_runtime": 166.7361,
1469
+ "eval_samples_per_second": 42.03,
1470
+ "eval_steps_per_second": 2.627,
1471
  "step": 22000
1472
  },
1473
  {
1474
  "epoch": 6.23,
1475
+ "learning_rate": 8.321846192674462e-06,
1476
+ "loss": 1.2486,
1477
  "step": 22100
1478
  },
1479
  {
1480
  "epoch": 6.26,
1481
+ "learning_rate": 7.877034040486934e-06,
1482
+ "loss": 1.2412,
1483
  "step": 22200
1484
  },
1485
  {
1486
  "epoch": 6.29,
1487
+ "learning_rate": 7.4380223596522265e-06,
1488
+ "loss": 1.2552,
1489
  "step": 22300
1490
  },
1491
  {
1492
  "epoch": 6.31,
1493
+ "learning_rate": 7.009619475593796e-06,
1494
+ "loss": 1.2642,
1495
  "step": 22400
1496
  },
1497
  {
1498
  "epoch": 6.34,
1499
+ "learning_rate": 6.592083442189996e-06,
1500
+ "loss": 1.265,
1501
  "step": 22500
1502
  },
1503
  {
1504
  "epoch": 6.37,
1505
+ "learning_rate": 6.1856657675346675e-06,
1506
+ "loss": 1.2631,
1507
  "step": 22600
1508
  },
1509
  {
1510
  "epoch": 6.4,
1511
+ "learning_rate": 5.790611262438083e-06,
1512
+ "loss": 1.2736,
1513
  "step": 22700
1514
  },
1515
  {
1516
  "epoch": 6.43,
1517
+ "learning_rate": 5.407157892962079e-06,
1518
+ "loss": 1.2595,
1519
  "step": 22800
1520
  },
1521
  {
1522
  "epoch": 6.45,
1523
+ "learning_rate": 5.035536637078171e-06,
1524
+ "loss": 1.2661,
1525
  "step": 22900
1526
  },
1527
  {
1528
  "epoch": 6.48,
1529
+ "learning_rate": 4.675971345535108e-06,
1530
+ "loss": 1.2651,
1531
  "step": 23000
1532
  },
1533
  {
1534
  "epoch": 6.51,
1535
+ "learning_rate": 4.328678607019489e-06,
1536
+ "loss": 1.2499,
1537
  "step": 23100
1538
  },
1539
  {
1540
  "epoch": 6.54,
1541
+ "learning_rate": 3.993867617690892e-06,
1542
+ "loss": 1.263,
1543
  "step": 23200
1544
  },
1545
  {
1546
  "epoch": 6.57,
1547
+ "learning_rate": 3.6717400551698886e-06,
1548
+ "loss": 1.2546,
1549
  "step": 23300
1550
  },
1551
  {
1552
  "epoch": 6.6,
1553
+ "learning_rate": 3.3624899570550363e-06,
1554
+ "loss": 1.261,
1555
  "step": 23400
1556
  },
1557
  {
1558
  "epoch": 6.62,
1559
+ "learning_rate": 3.066303604041807e-06,
1560
+ "loss": 1.2418,
1561
  "step": 23500
1562
  },
1563
  {
1564
  "epoch": 6.65,
1565
+ "learning_rate": 2.7833594077141535e-06,
1566
+ "loss": 1.2601,
1567
  "step": 23600
1568
  },
1569
  {
1570
  "epoch": 6.68,
1571
+ "learning_rate": 2.5138278030759714e-06,
1572
+ "loss": 1.2397,
1573
  "step": 23700
1574
  },
1575
  {
1576
  "epoch": 6.71,
1577
+ "learning_rate": 2.2578711458874663e-06,
1578
+ "loss": 1.264,
1579
  "step": 23800
1580
  },
1581
  {
1582
  "epoch": 6.74,
1583
+ "learning_rate": 2.0156436148680858e-06,
1584
+ "loss": 1.2497,
1585
  "step": 23900
1586
  },
1587
  {
1588
  "epoch": 6.76,
1589
+ "learning_rate": 1.787291118825041e-06,
1590
+ "loss": 1.2469,
1591
  "step": 24000
1592
  },
1593
  {
1594
  "epoch": 6.76,
1595
  "eval_gen_len": 20.0,
1596
+ "eval_loss": 2.113508462905884,
1597
+ "eval_rouge1": 21.3962,
1598
+ "eval_rouge2": 5.3698,
1599
+ "eval_rougeL": 21.1842,
1600
+ "eval_rougeLsum": 21.159,
1601
+ "eval_runtime": 166.5668,
1602
+ "eval_samples_per_second": 42.073,
1603
+ "eval_steps_per_second": 2.63,
1604
  "step": 24000
1605
  },
1606
  {
1607
  "epoch": 6.79,
1608
+ "learning_rate": 1.5729512087633123e-06,
1609
+ "loss": 1.2383,
1610
  "step": 24100
1611
  },
1612
  {
1613
  "epoch": 6.82,
1614
+ "learning_rate": 1.372752995030021e-06,
1615
+ "loss": 1.2619,
1616
  "step": 24200
1617
  },
1618
  {
1619
  "epoch": 6.85,
1620
+ "learning_rate": 1.1868170695432342e-06,
1621
+ "loss": 1.2536,
1622
  "step": 24300
1623
  },
1624
  {
1625
  "epoch": 6.88,
1626
+ "learning_rate": 1.0152554331518699e-06,
1627
+ "loss": 1.2514,
1628
  "step": 24400
1629
  },
1630
  {
1631
  "epoch": 6.91,
1632
+ "learning_rate": 8.596702859957324e-07,
1633
+ "loss": 1.2585,
1634
  "step": 24500
1635
  },
1636
  {
1637
  "epoch": 6.93,
1638
+ "learning_rate": 7.170123718266064e-07,
1639
+ "loss": 1.2439,
1640
  "step": 24600
1641
  },
1642
  {
1643
  "epoch": 6.96,
1644
+ "learning_rate": 5.890117395356814e-07,
1645
+ "loss": 1.245,
1646
  "step": 24700
1647
  },
1648
  {
1649
  "epoch": 6.99,
1650
+ "learning_rate": 4.7574549191759186e-07,
1651
+ "loss": 1.2446,
1652
  "step": 24800
1653
  },
1654
  {
1655
  "epoch": 7.02,
1656
+ "learning_rate": 3.772818563246123e-07,
1657
+ "loss": 1.2194,
1658
  "step": 24900
1659
  },
1660
  {
1661
  "epoch": 7.05,
1662
+ "learning_rate": 2.936801435690251e-07,
1663
+ "loss": 1.2058,
1664
  "step": 25000
1665
  },
1666
  {
1667
  "epoch": 7.07,
1668
+ "learning_rate": 2.2499071219653263e-07,
1669
+ "loss": 1.1916,
1670
  "step": 25100
1671
  },
1672
  {
1673
  "epoch": 7.1,
1674
+ "learning_rate": 1.7171816152852273e-07,
1675
+ "loss": 1.2186,
1676
  "step": 25200
1677
  },
1678
  {
1679
  "epoch": 7.13,
1680
+ "learning_rate": 1.328184223787534e-07,
1681
+ "loss": 1.2151,
1682
  "step": 25300
1683
  },
1684
  {
1685
  "epoch": 7.16,
1686
+ "learning_rate": 1.0892786169791435e-07,
1687
+ "loss": 1.2098,
1688
  "step": 25400
1689
  },
1690
  {
1691
  "epoch": 7.19,
1692
+ "learning_rate": 1.0006087026643128e-07,
1693
+ "loss": 1.2011,
1694
  "step": 25500
1695
  },
1696
  {
1697
  "epoch": 7.22,
1698
+ "learning_rate": 4.9998444254188996e-05,
1699
+ "loss": 1.2333,
1700
  "step": 25600
1701
  },
1702
  {
1703
  "epoch": 7.24,
1704
+ "learning_rate": 4.9993146582012154e-05,
1705
+ "loss": 1.3057,
1706
  "step": 25700
1707
  },
1708
  {
1709
  "epoch": 7.27,
1710
+ "learning_rate": 4.998409256370816e-05,
1711
+ "loss": 1.2949,
1712
  "step": 25800
1713
  },
1714
  {
1715
  "epoch": 7.3,
1716
+ "learning_rate": 4.997128356277887e-05,
1717
+ "loss": 1.3107,
1718
  "step": 25900
1719
  },
1720
  {
1721
  "epoch": 7.33,
1722
+ "learning_rate": 4.995472150821271e-05,
1723
+ "loss": 1.3323,
1724
  "step": 26000
1725
  },
1726
  {
1727
  "epoch": 7.33,
1728
  "eval_gen_len": 20.0,
1729
+ "eval_loss": 2.143944501876831,
1730
+ "eval_rouge1": 21.5245,
1731
+ "eval_rouge2": 5.444,
1732
+ "eval_rougeL": 21.312,
1733
+ "eval_rougeLsum": 21.3015,
1734
+ "eval_runtime": 166.5992,
1735
+ "eval_samples_per_second": 42.065,
1736
+ "eval_steps_per_second": 2.629,
1737
  "step": 26000
1738
  },
1739
  {
1740
  "epoch": 7.36,
1741
+ "learning_rate": 4.9934408894194186e-05,
1742
+ "loss": 1.3458,
1743
  "step": 26100
1744
  },
1745
  {
1746
  "epoch": 7.38,
1747
+ "learning_rate": 4.991034877972824e-05,
1748
+ "loss": 1.3346,
1749
  "step": 26200
1750
  },
1751
  {
1752
  "epoch": 7.41,
1753
+ "learning_rate": 4.988254478817961e-05,
1754
+ "loss": 1.3482,
1755
  "step": 26300
1756
  },
1757
  {
1758
  "epoch": 7.44,
1759
+ "learning_rate": 4.9851001106727165e-05,
1760
+ "loss": 1.3519,
1761
  "step": 26400
1762
  },
1763
  {
1764
  "epoch": 7.47,
1765
+ "learning_rate": 4.9815722485733305e-05,
1766
+ "loss": 1.3699,
1767
  "step": 26500
1768
  },
1769
  {
1770
  "epoch": 7.5,
1771
+ "learning_rate": 4.9776714238028576e-05,
1772
+ "loss": 1.3496,
1773
  "step": 26600
1774
  },
1775
  {
1776
  "epoch": 7.53,
1777
+ "learning_rate": 4.97339822381116e-05,
1778
+ "loss": 1.3835,
1779
  "step": 26700
1780
  },
1781
  {
1782
  "epoch": 7.55,
1783
+ "learning_rate": 4.968753292126438e-05,
1784
+ "loss": 1.3604,
1785
  "step": 26800
1786
  },
1787
  {
1788
  "epoch": 7.58,
1789
+ "learning_rate": 4.96373732825832e-05,
1790
+ "loss": 1.3776,
1791
  "step": 26900
1792
  },
1793
  {
1794
  "epoch": 7.61,
1795
+ "learning_rate": 4.9583510875925124e-05,
1796
+ "loss": 1.3988,
1797
  "step": 27000
1798
  },
1799
  {
1800
  "epoch": 7.64,
1801
+ "learning_rate": 4.952595381277048e-05,
1802
+ "loss": 1.3795,
1803
  "step": 27100
1804
  },
1805
  {
1806
  "epoch": 7.67,
1807
+ "learning_rate": 4.946471076100126e-05,
1808
+ "loss": 1.3823,
1809
  "step": 27200
1810
  },
1811
  {
1812
  "epoch": 7.69,
1813
+ "learning_rate": 4.939979094359581e-05,
1814
+ "loss": 1.3671,
1815
  "step": 27300
1816
  },
1817
  {
1818
  "epoch": 7.72,
1819
+ "learning_rate": 4.933120413723981e-05,
1820
+ "loss": 1.4092,
1821
  "step": 27400
1822
  },
1823
  {
1824
  "epoch": 7.75,
1825
+ "learning_rate": 4.925896067085404e-05,
1826
+ "loss": 1.4064,
1827
  "step": 27500
1828
  },
1829
  {
1830
  "epoch": 7.78,
1831
+ "learning_rate": 4.918307142403879e-05,
1832
+ "loss": 1.4131,
1833
  "step": 27600
1834
  },
1835
  {
1836
  "epoch": 7.81,
1837
+ "learning_rate": 4.91035478254355e-05,
1838
+ "loss": 1.4069,
1839
  "step": 27700
1840
  },
1841
  {
1842
  "epoch": 7.84,
1843
+ "learning_rate": 4.902040185100559e-05,
1844
+ "loss": 1.4013,
1845
  "step": 27800
1846
  },
1847
  {
1848
  "epoch": 7.86,
1849
+ "learning_rate": 4.893364602222699e-05,
1850
+ "loss": 1.4102,
1851
  "step": 27900
1852
  },
1853
  {
1854
  "epoch": 7.89,
1855
+ "learning_rate": 4.88432934042084e-05,
1856
+ "loss": 1.4175,
1857
  "step": 28000
1858
  },
1859
  {
1860
  "epoch": 7.89,
1861
  "eval_gen_len": 20.0,
1862
+ "eval_loss": 2.1455013751983643,
1863
+ "eval_rouge1": 21.4843,
1864
+ "eval_rouge2": 5.5418,
1865
+ "eval_rougeL": 21.2359,
1866
+ "eval_rougeLsum": 21.2323,
1867
+ "eval_runtime": 165.871,
1868
+ "eval_samples_per_second": 42.25,
1869
+ "eval_steps_per_second": 2.641,
1870
  "step": 28000
1871
  },
1872
  {
1873
  "epoch": 7.92,
1874
+ "learning_rate": 4.874935760372175e-05,
1875
+ "loss": 1.407,
1876
  "step": 28100
1877
  },
1878
  {
1879
  "epoch": 7.95,
1880
+ "learning_rate": 4.865284543447741e-05,
1881
+ "loss": 1.4325,
1882
  "step": 28200
1883
  },
1884
  {
1885
  "epoch": 7.98,
1886
+ "learning_rate": 4.855182171477396e-05,
1887
+ "loss": 1.4169,
1888
  "step": 28300
1889
  },
1890
  {
1891
  "epoch": 8.0,
1892
+ "learning_rate": 4.844725870716676e-05,
1893
+ "loss": 1.4046,
1894
  "step": 28400
1895
  },
1896
  {
1897
  "epoch": 8.03,
1898
+ "learning_rate": 4.833917215846013e-05,
1899
+ "loss": 1.1944,
1900
  "step": 28500
1901
  },
1902
  {
1903
  "epoch": 8.06,
1904
+ "learning_rate": 4.822757834609075e-05,
1905
+ "loss": 1.2087,
1906
  "step": 28600
1907
  },
1908
  {
1909
  "epoch": 8.09,
1910
+ "learning_rate": 4.811249407567628e-05,
1911
+ "loss": 1.2257,
1912
  "step": 28700
1913
  },
1914
  {
1915
  "epoch": 8.12,
1916
+ "learning_rate": 4.799393667848454e-05,
1917
+ "loss": 1.2269,
1918
  "step": 28800
1919
  },
1920
  {
1921
  "epoch": 8.15,
1922
+ "learning_rate": 4.787192400882345e-05,
1923
+ "loss": 1.244,
1924
  "step": 28900
1925
  },
1926
  {
1927
  "epoch": 8.17,
1928
+ "learning_rate": 4.774647444135227e-05,
1929
+ "loss": 1.2302,
1930
  "step": 29000
1931
  },
1932
  {
1933
  "epoch": 8.2,
1934
+ "learning_rate": 4.7617606868314405e-05,
1935
+ "loss": 1.2461,
1936
  "step": 29100
1937
  },
1938
  {
1939
  "epoch": 8.23,
1940
+ "learning_rate": 4.748534069669236e-05,
1941
+ "loss": 1.2514,
1942
  "step": 29200
1943
  },
1944
  {
1945
  "epoch": 8.26,
1946
+ "learning_rate": 4.7349695845285055e-05,
1947
+ "loss": 1.2402,
1948
  "step": 29300
1949
  },
1950
  {
1951
  "epoch": 8.29,
1952
+ "learning_rate": 4.721069274170819e-05,
1953
+ "loss": 1.2567,
1954
  "step": 29400
1955
  },
1956
  {
1957
  "epoch": 8.31,
1958
+ "learning_rate": 4.706835231931785e-05,
1959
+ "loss": 1.2423,
1960
  "step": 29500
1961
  },
1962
  {
1963
  "epoch": 8.34,
1964
+ "learning_rate": 4.6922696014058083e-05,
1965
+ "loss": 1.2573,
1966
  "step": 29600
1967
  },
1968
  {
1969
  "epoch": 8.37,
1970
+ "learning_rate": 4.677374576123271e-05,
1971
+ "loss": 1.253,
1972
  "step": 29700
1973
  },
1974
  {
1975
  "epoch": 8.4,
1976
+ "learning_rate": 4.6621523992201896e-05,
1977
+ "loss": 1.2469,
1978
  "step": 29800
1979
  },
1980
  {
1981
  "epoch": 8.43,
1982
+ "learning_rate": 4.646605363100417e-05,
1983
+ "loss": 1.2911,
1984
  "step": 29900
1985
  },
1986
  {
1987
  "epoch": 8.46,
1988
+ "learning_rate": 4.6307358090904045e-05,
1989
+ "loss": 1.2541,
1990
  "step": 30000
1991
  },
1992
  {
1993
  "epoch": 8.46,
1994
  "eval_gen_len": 20.0,
1995
+ "eval_loss": 2.217175006866455,
1996
+ "eval_rouge1": 20.9025,
1997
+ "eval_rouge2": 5.1166,
1998
+ "eval_rougeL": 20.7062,
1999
+ "eval_rougeLsum": 20.6916,
2000
+ "eval_runtime": 165.6707,
2001
+ "eval_samples_per_second": 42.301,
2002
+ "eval_steps_per_second": 2.644,
2003
  "step": 30000
2004
  },
2005
  {
2006
  "epoch": 8.48,
2007
+ "learning_rate": 4.6147096005744875e-05,
2008
+ "loss": 1.2806,
2009
  "step": 30100
2010
  },
2011
  {
2012
  "epoch": 8.51,
2013
+ "learning_rate": 4.598205393356323e-05,
2014
+ "loss": 1.2891,
2015
  "step": 30200
2016
  },
2017
  {
2018
  "epoch": 8.54,
2019
+ "learning_rate": 4.581385957105333e-05,
2020
+ "loss": 1.2964,
2021
  "step": 30300
2022
  },
2023
  {
2024
  "epoch": 8.57,
2025
+ "learning_rate": 4.564253824766751e-05,
2026
+ "loss": 1.2694,
2027
  "step": 30400
2028
  },
2029
  {
2030
  "epoch": 8.6,
2031
+ "learning_rate": 4.54681157637669e-05,
2032
+ "loss": 1.2883,
2033
  "step": 30500
2034
  },
2035
  {
2036
  "epoch": 8.62,
2037
+ "learning_rate": 4.5290618386736e-05,
2038
+ "loss": 1.3061,
2039
  "step": 30600
2040
  },
2041
  {
2042
  "epoch": 8.65,
2043
+ "learning_rate": 4.51100728470269e-05,
2044
+ "loss": 1.3022,
2045
  "step": 30700
2046
  },
2047
  {
2048
  "epoch": 8.68,
2049
+ "learning_rate": 4.492650633413379e-05,
2050
+ "loss": 1.2958,
2051
  "step": 30800
2052
  },
2053
  {
2054
  "epoch": 8.71,
2055
+ "learning_rate": 4.473994649249829e-05,
2056
+ "loss": 1.2966,
2057
  "step": 30900
2058
  },
2059
  {
2060
  "epoch": 8.74,
2061
+ "learning_rate": 4.455042141734634e-05,
2062
+ "loss": 1.2956,
2063
  "step": 31000
2064
  },
2065
  {
2066
  "epoch": 8.77,
2067
+ "learning_rate": 4.4357959650457124e-05,
2068
+ "loss": 1.3057,
2069
  "step": 31100
2070
  },
2071
  {
2072
  "epoch": 8.79,
2073
+ "learning_rate": 4.41625901758648e-05,
2074
+ "loss": 1.3006,
2075
  "step": 31200
2076
  },
2077
  {
2078
  "epoch": 8.82,
2079
+ "learning_rate": 4.396434241549363e-05,
2080
+ "loss": 1.311,
2081
  "step": 31300
2082
  },
2083
  {
2084
  "epoch": 8.85,
2085
+ "learning_rate": 4.37632462247271e-05,
2086
+ "loss": 1.309,
2087
  "step": 31400
2088
  },
2089
  {
2090
  "epoch": 8.88,
2091
+ "learning_rate": 4.355933188791186e-05,
2092
+ "loss": 1.3324,
2093
  "step": 31500
2094
  },
2095
  {
2096
  "epoch": 8.91,
2097
+ "learning_rate": 4.335263011379698e-05,
2098
+ "loss": 1.3186,
2099
  "step": 31600
2100
  },
2101
  {
2102
  "epoch": 8.93,
2103
+ "learning_rate": 4.314317203090931e-05,
2104
+ "loss": 1.3298,
2105
  "step": 31700
2106
  },
2107
  {
2108
  "epoch": 8.96,
2109
+ "learning_rate": 4.2930989182865715e-05,
2110
+ "loss": 1.3148,
2111
  "step": 31800
2112
  },
2113
  {
2114
  "epoch": 8.99,
2115
+ "learning_rate": 4.271611352362262e-05,
2116
+ "loss": 1.3318,
2117
  "step": 31900
2118
  },
2119
  {
2120
  "epoch": 9.02,
2121
+ "learning_rate": 4.2498577412663946e-05,
2122
+ "loss": 1.1331,
2123
  "step": 32000
2124
  },
2125
  {
2126
  "epoch": 9.02,
2127
  "eval_gen_len": 20.0,
2128
+ "eval_loss": 2.278179168701172,
2129
+ "eval_rouge1": 20.9415,
2130
+ "eval_rouge2": 5.2115,
2131
+ "eval_rougeL": 20.7647,
2132
+ "eval_rougeLsum": 20.7806,
2133
+ "eval_runtime": 166.4103,
2134
+ "eval_samples_per_second": 42.113,
2135
+ "eval_steps_per_second": 2.632,
2136
  "step": 32000
2137
  },
2138
  {
2139
  "epoch": 9.05,
2140
+ "learning_rate": 4.2278413610127834e-05,
2141
+ "loss": 1.0677,
2142
  "step": 32100
2143
  },
2144
  {
2145
  "epoch": 9.08,
2146
+ "learning_rate": 4.205565527187312e-05,
2147
+ "loss": 1.0712,
2148
  "step": 32200
2149
  },
2150
  {
2151
  "epoch": 9.1,
2152
+ "learning_rate": 4.183033594448618e-05,
2153
+ "loss": 1.0928,
2154
  "step": 32300
2155
  },
2156
  {
2157
  "epoch": 9.13,
2158
+ "learning_rate": 4.160248956022893e-05,
2159
+ "loss": 1.0821,
2160
  "step": 32400
2161
  },
2162
  {
2163
  "epoch": 9.16,
2164
+ "learning_rate": 4.137215043192875e-05,
2165
+ "loss": 1.0988,
2166
  "step": 32500
2167
  },
2168
  {
2169
  "epoch": 9.19,
2170
+ "learning_rate": 4.11393532478111e-05,
2171
+ "loss": 1.103,
2172
  "step": 32600
2173
  },
2174
  {
2175
  "epoch": 9.22,
2176
+ "learning_rate": 4.0904133066275636e-05,
2177
+ "loss": 1.1109,
2178
  "step": 32700
2179
  },
2180
  {
2181
  "epoch": 9.24,
2182
+ "learning_rate": 4.066652531061647e-05,
2183
+ "loss": 1.0833,
2184
  "step": 32800
2185
  },
2186
  {
2187
  "epoch": 9.27,
2188
+ "learning_rate": 4.0426565763687624e-05,
2189
+ "loss": 1.0964,
2190
  "step": 32900
2191
  },
2192
  {
2193
  "epoch": 9.3,
2194
+ "learning_rate": 4.0184290562514214e-05,
2195
+ "loss": 1.1168,
2196
  "step": 33000
2197
  },
2198
  {
2199
  "epoch": 9.33,
2200
+ "learning_rate": 3.9939736192850386e-05,
2201
+ "loss": 1.1122,
2202
  "step": 33100
2203
  },
2204
  {
2205
  "epoch": 9.36,
2206
+ "learning_rate": 3.969293948368467e-05,
2207
+ "loss": 1.1165,
2208
  "step": 33200
2209
  },
2210
  {
2211
  "epoch": 9.39,
2212
+ "learning_rate": 3.944393760169368e-05,
2213
+ "loss": 1.1106,
2214
  "step": 33300
2215
  },
2216
  {
2217
  "epoch": 9.41,
2218
+ "learning_rate": 3.9192768045644926e-05,
2219
+ "loss": 1.1119,
2220
  "step": 33400
2221
  },
2222
  {
2223
  "epoch": 9.44,
2224
+ "learning_rate": 3.8939468640749696e-05,
2225
+ "loss": 1.1258,
2226
  "step": 33500
2227
  },
2228
  {
2229
  "epoch": 9.47,
2230
+ "learning_rate": 3.868407753296665e-05,
2231
+ "loss": 1.099,
2232
  "step": 33600
2233
  },
2234
  {
2235
  "epoch": 9.5,
2236
+ "learning_rate": 3.842921766325347e-05,
2237
+ "loss": 1.1246,
2238
  "step": 33700
2239
  },
2240
  {
2241
  "epoch": 9.53,
2242
+ "learning_rate": 3.816977879359396e-05,
2243
+ "loss": 1.1175,
2244
  "step": 33800
2245
  },
2246
  {
2247
  "epoch": 9.55,
2248
+ "learning_rate": 3.790836413350648e-05,
2249
+ "loss": 1.1313,
2250
  "step": 33900
2251
  },
2252
  {
2253
  "epoch": 9.58,
2254
+ "learning_rate": 3.764501305107728e-05,
2255
+ "loss": 1.1067,
2256
  "step": 34000
2257
  },
2258
  {
2259
  "epoch": 9.58,
2260
  "eval_gen_len": 20.0,
2261
+ "eval_loss": 2.27223801612854,
2262
+ "eval_rouge1": 21.5648,
2263
+ "eval_rouge2": 5.4445,
2264
+ "eval_rougeL": 21.3624,
2265
+ "eval_rougeLsum": 21.3838,
2266
+ "eval_runtime": 166.4194,
2267
+ "eval_samples_per_second": 42.11,
2268
+ "eval_steps_per_second": 2.632,
2269
  "step": 34000
2270
  },
2271
  {
2272
  "epoch": 9.61,
2273
+ "learning_rate": 3.737976520601068e-05,
2274
+ "loss": 1.1316,
2275
  "step": 34100
2276
  },
2277
  {
2278
  "epoch": 9.64,
2279
+ "learning_rate": 3.7112660543656495e-05,
2280
+ "loss": 1.137,
2281
  "step": 34200
2282
  },
2283
  {
2284
  "epoch": 9.67,
2285
+ "learning_rate": 3.6843739288994395e-05,
2286
+ "loss": 1.1319,
2287
  "step": 34300
2288
  },
2289
  {
2290
  "epoch": 9.7,
2291
+ "learning_rate": 3.657304194057615e-05,
2292
+ "loss": 1.1217,
2293
  "step": 34400
2294
  },
2295
  {
2296
  "epoch": 9.72,
2297
+ "learning_rate": 3.630060926442673e-05,
2298
+ "loss": 1.1465,
2299
  "step": 34500
2300
  },
2301
  {
2302
  "epoch": 9.75,
2303
+ "learning_rate": 3.602648228790508e-05,
2304
+ "loss": 1.1396,
2305
  "step": 34600
2306
  },
2307
  {
2308
  "epoch": 9.78,
2309
+ "learning_rate": 3.5750702293525555e-05,
2310
+ "loss": 1.132,
2311
  "step": 34700
2312
  },
2313
  {
2314
  "epoch": 9.81,
2315
+ "learning_rate": 3.5473310812740905e-05,
2316
+ "loss": 1.159,
2317
  "step": 34800
2318
  },
2319
  {
2320
  "epoch": 9.84,
2321
+ "learning_rate": 3.519434961968779e-05,
2322
+ "loss": 1.1484,
2323
  "step": 34900
2324
  },
2325
  {
2326
  "epoch": 9.86,
2327
+ "learning_rate": 3.491386072489581e-05,
2328
+ "loss": 1.1375,
2329
  "step": 35000
2330
  },
2331
  {
2332
  "epoch": 9.89,
2333
+ "learning_rate": 3.4631886368960794e-05,
2334
+ "loss": 1.1493,
2335
  "step": 35100
2336
  },
2337
  {
2338
  "epoch": 9.92,
2339
+ "learning_rate": 3.434846901618357e-05,
2340
+ "loss": 1.1374,
2341
  "step": 35200
2342
  },
2343
  {
2344
  "epoch": 9.95,
2345
+ "learning_rate": 3.406365134817494e-05,
2346
+ "loss": 1.1569,
2347
  "step": 35300
2348
  },
2349
  {
2350
  "epoch": 9.98,
2351
+ "learning_rate": 3.3777476257428106e-05,
2352
+ "loss": 1.1441,
2353
  "step": 35400
2354
  },
2355
  {
2356
  "epoch": 10.0,
2357
  "step": 35480,
2358
  "total_flos": 2.3762843604025344e+17,
2359
+ "train_loss": 1.581618417182931,
2360
+ "train_runtime": 14068.2834,
2361
+ "train_samples_per_second": 40.346,
2362
+ "train_steps_per_second": 2.522
2363
  }
2364
  ],
2365
  "max_steps": 35480,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cdba313b31bdac3438861022cd20b0533cea266378d23d59149777ab8970935
3
- size 3567
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb6c58ad85c6302cbcc0ea2d49c40e211911e0bcc1dfc125087695e4f753b0c
3
+ size 3503