truocpham committed on
Commit
d942770
·
1 Parent(s): 30ee914

upload flan dialogue summary checkpoint

Browse files
config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/flan-t5-base",
3
+ "architectures": [
4
+ "T5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 2048,
7
+ "d_kv": 64,
8
+ "d_model": 768,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "t5",
19
+ "n_positions": 512,
20
+ "num_decoder_layers": 12,
21
+ "num_heads": 12,
22
+ "num_layers": 12,
23
+ "output_past": true,
24
+ "pad_token_id": 0,
25
+ "relative_attention_max_distance": 128,
26
+ "relative_attention_num_buckets": 32,
27
+ "task_specific_params": {
28
+ "summarization": {
29
+ "early_stopping": true,
30
+ "length_penalty": 2.0,
31
+ "max_length": 200,
32
+ "min_length": 30,
33
+ "no_repeat_ngram_size": 3,
34
+ "num_beams": 4,
35
+ "prefix": "summarize: "
36
+ },
37
+ "translation_en_to_de": {
38
+ "early_stopping": true,
39
+ "max_length": 300,
40
+ "num_beams": 4,
41
+ "prefix": "translate English to German: "
42
+ },
43
+ "translation_en_to_fr": {
44
+ "early_stopping": true,
45
+ "max_length": 300,
46
+ "num_beams": 4,
47
+ "prefix": "translate English to French: "
48
+ },
49
+ "translation_en_to_ro": {
50
+ "early_stopping": true,
51
+ "max_length": 300,
52
+ "num_beams": 4,
53
+ "prefix": "translate English to Romanian: "
54
+ }
55
+ },
56
+ "tie_word_embeddings": false,
57
+ "torch_dtype": "float32",
58
+ "transformers_version": "4.27.2",
59
+ "use_cache": true,
60
+ "vocab_size": 32128
61
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.27.2"
7
+ }
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edd71738541229c729e50d1bda7c29049a2ca5920f173c34bd2cdc27e9a5fa38
3
+ size 1980790149
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a842404c48dfe3227b10ecaa927d2e9f39b1c598ea7b2b221c35ddbd6c493799
3
+ size 990408885
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f7d8204e7b1526587690f1e38fde9610f66ca0de1767ba6af2331793c0cbf21
3
+ size 14575
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d33c264c21acad66b94bb8201538053bc92b569e972bb89f038ad48526d9f119
3
+ size 627
trainer_state.json ADDED
@@ -0,0 +1,468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "global_step": 31150,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.16,
12
+ "learning_rate": 9.94649545211343e-06,
13
+ "loss": 4.4121,
14
+ "step": 500
15
+ },
16
+ {
17
+ "epoch": 0.32,
18
+ "learning_rate": 9.892990904226861e-06,
19
+ "loss": 0.1731,
20
+ "step": 1000
21
+ },
22
+ {
23
+ "epoch": 0.48,
24
+ "learning_rate": 9.83948635634029e-06,
25
+ "loss": 0.1161,
26
+ "step": 1500
27
+ },
28
+ {
29
+ "epoch": 0.64,
30
+ "learning_rate": 9.78598180845372e-06,
31
+ "loss": 0.1059,
32
+ "step": 2000
33
+ },
34
+ {
35
+ "epoch": 0.8,
36
+ "learning_rate": 9.73247726056715e-06,
37
+ "loss": 0.1011,
38
+ "step": 2500
39
+ },
40
+ {
41
+ "epoch": 0.96,
42
+ "learning_rate": 9.67897271268058e-06,
43
+ "loss": 0.1013,
44
+ "step": 3000
45
+ },
46
+ {
47
+ "epoch": 1.0,
48
+ "eval_loss": 0.08340005576610565,
49
+ "eval_runtime": 17.1185,
50
+ "eval_samples_per_second": 29.208,
51
+ "eval_steps_per_second": 7.302,
52
+ "step": 3115
53
+ },
54
+ {
55
+ "epoch": 1.12,
56
+ "learning_rate": 9.62546816479401e-06,
57
+ "loss": 0.0987,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 1.28,
62
+ "learning_rate": 9.571963616907438e-06,
63
+ "loss": 0.0946,
64
+ "step": 4000
65
+ },
66
+ {
67
+ "epoch": 1.44,
68
+ "learning_rate": 9.518459069020868e-06,
69
+ "loss": 0.0956,
70
+ "step": 4500
71
+ },
72
+ {
73
+ "epoch": 1.61,
74
+ "learning_rate": 9.464954521134298e-06,
75
+ "loss": 0.0984,
76
+ "step": 5000
77
+ },
78
+ {
79
+ "epoch": 1.77,
80
+ "learning_rate": 9.411449973247728e-06,
81
+ "loss": 0.0949,
82
+ "step": 5500
83
+ },
84
+ {
85
+ "epoch": 1.93,
86
+ "learning_rate": 9.357945425361158e-06,
87
+ "loss": 0.0929,
88
+ "step": 6000
89
+ },
90
+ {
91
+ "epoch": 2.0,
92
+ "eval_loss": 0.08016223460435867,
93
+ "eval_runtime": 17.1151,
94
+ "eval_samples_per_second": 29.214,
95
+ "eval_steps_per_second": 7.304,
96
+ "step": 6230
97
+ },
98
+ {
99
+ "epoch": 2.09,
100
+ "learning_rate": 9.304440877474586e-06,
101
+ "loss": 0.0909,
102
+ "step": 6500
103
+ },
104
+ {
105
+ "epoch": 2.25,
106
+ "learning_rate": 9.250936329588016e-06,
107
+ "loss": 0.0917,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 2.41,
112
+ "learning_rate": 9.197431781701446e-06,
113
+ "loss": 0.0913,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 2.57,
118
+ "learning_rate": 9.143927233814876e-06,
119
+ "loss": 0.0912,
120
+ "step": 8000
121
+ },
122
+ {
123
+ "epoch": 2.73,
124
+ "learning_rate": 9.090422685928304e-06,
125
+ "loss": 0.0904,
126
+ "step": 8500
127
+ },
128
+ {
129
+ "epoch": 2.89,
130
+ "learning_rate": 9.036918138041734e-06,
131
+ "loss": 0.0908,
132
+ "step": 9000
133
+ },
134
+ {
135
+ "epoch": 3.0,
136
+ "eval_loss": 0.0781576856970787,
137
+ "eval_runtime": 17.0736,
138
+ "eval_samples_per_second": 29.285,
139
+ "eval_steps_per_second": 7.321,
140
+ "step": 9345
141
+ },
142
+ {
143
+ "epoch": 3.05,
144
+ "learning_rate": 8.983413590155164e-06,
145
+ "loss": 0.089,
146
+ "step": 9500
147
+ },
148
+ {
149
+ "epoch": 3.21,
150
+ "learning_rate": 8.929909042268593e-06,
151
+ "loss": 0.0874,
152
+ "step": 10000
153
+ },
154
+ {
155
+ "epoch": 3.37,
156
+ "learning_rate": 8.876404494382023e-06,
157
+ "loss": 0.0868,
158
+ "step": 10500
159
+ },
160
+ {
161
+ "epoch": 3.53,
162
+ "learning_rate": 8.822899946495453e-06,
163
+ "loss": 0.0868,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 3.69,
168
+ "learning_rate": 8.769395398608883e-06,
169
+ "loss": 0.0877,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 3.85,
174
+ "learning_rate": 8.715890850722311e-06,
175
+ "loss": 0.0876,
176
+ "step": 12000
177
+ },
178
+ {
179
+ "epoch": 4.0,
180
+ "eval_loss": 0.07677410542964935,
181
+ "eval_runtime": 17.0783,
182
+ "eval_samples_per_second": 29.277,
183
+ "eval_steps_per_second": 7.319,
184
+ "step": 12460
185
+ },
186
+ {
187
+ "epoch": 4.01,
188
+ "learning_rate": 8.662386302835741e-06,
189
+ "loss": 0.0887,
190
+ "step": 12500
191
+ },
192
+ {
193
+ "epoch": 4.17,
194
+ "learning_rate": 8.608881754949171e-06,
195
+ "loss": 0.0848,
196
+ "step": 13000
197
+ },
198
+ {
199
+ "epoch": 4.33,
200
+ "learning_rate": 8.555377207062601e-06,
201
+ "loss": 0.0825,
202
+ "step": 13500
203
+ },
204
+ {
205
+ "epoch": 4.49,
206
+ "learning_rate": 8.501872659176031e-06,
207
+ "loss": 0.085,
208
+ "step": 14000
209
+ },
210
+ {
211
+ "epoch": 4.65,
212
+ "learning_rate": 8.44836811128946e-06,
213
+ "loss": 0.0858,
214
+ "step": 14500
215
+ },
216
+ {
217
+ "epoch": 4.82,
218
+ "learning_rate": 8.39486356340289e-06,
219
+ "loss": 0.0855,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 4.98,
224
+ "learning_rate": 8.34135901551632e-06,
225
+ "loss": 0.0857,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 5.0,
230
+ "eval_loss": 0.07618943601846695,
231
+ "eval_runtime": 17.0787,
232
+ "eval_samples_per_second": 29.276,
233
+ "eval_steps_per_second": 7.319,
234
+ "step": 15575
235
+ },
236
+ {
237
+ "epoch": 5.14,
238
+ "learning_rate": 8.28785446762975e-06,
239
+ "loss": 0.0833,
240
+ "step": 16000
241
+ },
242
+ {
243
+ "epoch": 5.3,
244
+ "learning_rate": 8.234349919743178e-06,
245
+ "loss": 0.0819,
246
+ "step": 16500
247
+ },
248
+ {
249
+ "epoch": 5.46,
250
+ "learning_rate": 8.180845371856608e-06,
251
+ "loss": 0.0842,
252
+ "step": 17000
253
+ },
254
+ {
255
+ "epoch": 5.62,
256
+ "learning_rate": 8.127340823970038e-06,
257
+ "loss": 0.0814,
258
+ "step": 17500
259
+ },
260
+ {
261
+ "epoch": 5.78,
262
+ "learning_rate": 8.073836276083468e-06,
263
+ "loss": 0.0828,
264
+ "step": 18000
265
+ },
266
+ {
267
+ "epoch": 5.94,
268
+ "learning_rate": 8.020331728196898e-06,
269
+ "loss": 0.0847,
270
+ "step": 18500
271
+ },
272
+ {
273
+ "epoch": 6.0,
274
+ "eval_loss": 0.07566038519144058,
275
+ "eval_runtime": 17.0768,
276
+ "eval_samples_per_second": 29.28,
277
+ "eval_steps_per_second": 7.32,
278
+ "step": 18690
279
+ },
280
+ {
281
+ "epoch": 6.1,
282
+ "learning_rate": 7.966827180310326e-06,
283
+ "loss": 0.0826,
284
+ "step": 19000
285
+ },
286
+ {
287
+ "epoch": 6.26,
288
+ "learning_rate": 7.913322632423756e-06,
289
+ "loss": 0.0791,
290
+ "step": 19500
291
+ },
292
+ {
293
+ "epoch": 6.42,
294
+ "learning_rate": 7.859818084537186e-06,
295
+ "loss": 0.0809,
296
+ "step": 20000
297
+ },
298
+ {
299
+ "epoch": 6.58,
300
+ "learning_rate": 7.806313536650616e-06,
301
+ "loss": 0.083,
302
+ "step": 20500
303
+ },
304
+ {
305
+ "epoch": 6.74,
306
+ "learning_rate": 7.752808988764046e-06,
307
+ "loss": 0.0817,
308
+ "step": 21000
309
+ },
310
+ {
311
+ "epoch": 6.9,
312
+ "learning_rate": 7.699304440877475e-06,
313
+ "loss": 0.0815,
314
+ "step": 21500
315
+ },
316
+ {
317
+ "epoch": 7.0,
318
+ "eval_loss": 0.07512963563203812,
319
+ "eval_runtime": 17.0959,
320
+ "eval_samples_per_second": 29.247,
321
+ "eval_steps_per_second": 7.312,
322
+ "step": 21805
323
+ },
324
+ {
325
+ "epoch": 7.06,
326
+ "learning_rate": 7.645799892990905e-06,
327
+ "loss": 0.0812,
328
+ "step": 22000
329
+ },
330
+ {
331
+ "epoch": 7.22,
332
+ "learning_rate": 7.592295345104335e-06,
333
+ "loss": 0.0805,
334
+ "step": 22500
335
+ },
336
+ {
337
+ "epoch": 7.38,
338
+ "learning_rate": 7.538790797217765e-06,
339
+ "loss": 0.0808,
340
+ "step": 23000
341
+ },
342
+ {
343
+ "epoch": 7.54,
344
+ "learning_rate": 7.485286249331194e-06,
345
+ "loss": 0.0792,
346
+ "step": 23500
347
+ },
348
+ {
349
+ "epoch": 7.7,
350
+ "learning_rate": 7.431781701444624e-06,
351
+ "loss": 0.0792,
352
+ "step": 24000
353
+ },
354
+ {
355
+ "epoch": 7.87,
356
+ "learning_rate": 7.378277153558053e-06,
357
+ "loss": 0.0807,
358
+ "step": 24500
359
+ },
360
+ {
361
+ "epoch": 8.0,
362
+ "eval_loss": 0.07496295124292374,
363
+ "eval_runtime": 17.08,
364
+ "eval_samples_per_second": 29.274,
365
+ "eval_steps_per_second": 7.319,
366
+ "step": 24920
367
+ },
368
+ {
369
+ "epoch": 8.03,
370
+ "learning_rate": 7.324772605671483e-06,
371
+ "loss": 0.0784,
372
+ "step": 25000
373
+ },
374
+ {
375
+ "epoch": 8.19,
376
+ "learning_rate": 7.271268057784913e-06,
377
+ "loss": 0.0776,
378
+ "step": 25500
379
+ },
380
+ {
381
+ "epoch": 8.35,
382
+ "learning_rate": 7.217763509898342e-06,
383
+ "loss": 0.0764,
384
+ "step": 26000
385
+ },
386
+ {
387
+ "epoch": 8.51,
388
+ "learning_rate": 7.164258962011772e-06,
389
+ "loss": 0.0792,
390
+ "step": 26500
391
+ },
392
+ {
393
+ "epoch": 8.67,
394
+ "learning_rate": 7.110754414125201e-06,
395
+ "loss": 0.0802,
396
+ "step": 27000
397
+ },
398
+ {
399
+ "epoch": 8.83,
400
+ "learning_rate": 7.057249866238631e-06,
401
+ "loss": 0.0803,
402
+ "step": 27500
403
+ },
404
+ {
405
+ "epoch": 8.99,
406
+ "learning_rate": 7.003745318352061e-06,
407
+ "loss": 0.0779,
408
+ "step": 28000
409
+ },
410
+ {
411
+ "epoch": 9.0,
412
+ "eval_loss": 0.0747738629579544,
413
+ "eval_runtime": 17.0877,
414
+ "eval_samples_per_second": 29.261,
415
+ "eval_steps_per_second": 7.315,
416
+ "step": 28035
417
+ },
418
+ {
419
+ "epoch": 9.15,
420
+ "learning_rate": 6.950240770465491e-06,
421
+ "loss": 0.0765,
422
+ "step": 28500
423
+ },
424
+ {
425
+ "epoch": 9.31,
426
+ "learning_rate": 6.896736222578921e-06,
427
+ "loss": 0.0762,
428
+ "step": 29000
429
+ },
430
+ {
431
+ "epoch": 9.47,
432
+ "learning_rate": 6.84323167469235e-06,
433
+ "loss": 0.0777,
434
+ "step": 29500
435
+ },
436
+ {
437
+ "epoch": 9.63,
438
+ "learning_rate": 6.789727126805778e-06,
439
+ "loss": 0.078,
440
+ "step": 30000
441
+ },
442
+ {
443
+ "epoch": 9.79,
444
+ "learning_rate": 6.736222578919208e-06,
445
+ "loss": 0.0782,
446
+ "step": 30500
447
+ },
448
+ {
449
+ "epoch": 9.95,
450
+ "learning_rate": 6.682718031032638e-06,
451
+ "loss": 0.0757,
452
+ "step": 31000
453
+ },
454
+ {
455
+ "epoch": 10.0,
456
+ "eval_loss": 0.07480964064598083,
457
+ "eval_runtime": 17.0749,
458
+ "eval_samples_per_second": 29.283,
459
+ "eval_steps_per_second": 7.321,
460
+ "step": 31150
461
+ }
462
+ ],
463
+ "max_steps": 93450,
464
+ "num_train_epochs": 30,
465
+ "total_flos": 8.53207661150208e+16,
466
+ "trial_name": null,
467
+ "trial_params": null
468
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0fa116c5e3ab65f9b0008617c5a811cff17751cf47eec575f94d2784b55ef9d
3
+ size 3643