shivanandmn commited on
Commit
6ad02c3
·
verified ·
1 Parent(s): 37eb7f1

Training in progress, step 500

Browse files
BluebrainAI___wikitext-103-raw-v1-seq1024-tokenized-grouped/default/0.0.0/e6ef704dbf95ade0b1fde03c931b3112df6ad731.incomplete_info.lock ADDED
File without changes
BluebrainAI___wikitext-103-raw-v1-seq1024-tokenized-grouped/default/0.0.0/e6ef704dbf95ade0b1fde03c931b3112df6ad731/dataset_info.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"description": "", "citation": "", "homepage": "", "license": "", "features": {"input_ids": {"feature": {"dtype": "int32", "_type": "Value"}, "_type": "Sequence"}, "attention_mask": {"feature": {"dtype": "int8", "_type": "Value"}, "_type": "Sequence"}}, "builder_name": "parquet", "dataset_name": "wikitext-103-raw-v1-seq1024-tokenized-grouped", "config_name": "default", "version": {"version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 5997096, "num_examples": 1141, "dataset_name": "wikitext-103-raw-v1-seq1024-tokenized-grouped"}, "train": {"name": "train", "num_bytes": 599152464, "num_examples": 113994, "shard_lengths": [96000, 17994], "dataset_name": "wikitext-103-raw-v1-seq1024-tokenized-grouped"}}, "download_checksums": {"hf://datasets/BluebrainAI/wikitext-103-raw-v1-seq1024-tokenized-grouped@e6ef704dbf95ade0b1fde03c931b3112df6ad731/data/validation-00000-of-00001.parquet": {"num_bytes": 2318981, "checksum": null}, "hf://datasets/BluebrainAI/wikitext-103-raw-v1-seq1024-tokenized-grouped@e6ef704dbf95ade0b1fde03c931b3112df6ad731/data/train-00000-of-00001.parquet": {"num_bytes": 231535449, "checksum": null}}, "download_size": 233854430, "dataset_size": 605149560, "size_in_bytes": 839003990}
BluebrainAI___wikitext-103-raw-v1-seq1024-tokenized-grouped/default/0.0.0/e6ef704dbf95ade0b1fde03c931b3112df6ad731/wikitext-103-raw-v1-seq1024-tokenized-grouped-train-00000-of-00002.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31fa9caad52ee8eb6cc36d08cfe1ee76cdfe3366db187b8f87fc84f8f59f9106
3
+ size 492317736
BluebrainAI___wikitext-103-raw-v1-seq1024-tokenized-grouped/default/0.0.0/e6ef704dbf95ade0b1fde03c931b3112df6ad731/wikitext-103-raw-v1-seq1024-tokenized-grouped-train-00001-of-00002.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a8e3a8d0170e5706b6aae210f6762884c873c66754a7eeab5fd4379cabf4286
3
+ size 92279256
BluebrainAI___wikitext-103-raw-v1-seq1024-tokenized-grouped/default/0.0.0/e6ef704dbf95ade0b1fde03c931b3112df6ad731/wikitext-103-raw-v1-seq1024-tokenized-grouped-validation.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61e41d26a8c63613e66724bf11ada0918b7180bedbe2b5f6a9cc7fe9dc003352
3
+ size 5852200
BluebrainAI___wikitext-103-raw-v1-seq1024-tokenized-grouped/default/0.0.0/e6ef704dbf95ade0b1fde03c931b3112df6ad731_builder.lock ADDED
File without changes
all_results.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.41983802858530744,
4
- "eval_bleu": 0.14543247859716402,
5
- "eval_loss": 3.180941343307495,
6
- "eval_perplexity": 24.06940046017307,
7
- "eval_runtime": 11.0676,
8
  "eval_samples": 1141,
9
- "eval_samples_per_second": 103.094,
10
- "eval_steps_per_second": 1.626,
11
- "perplexity": 24.06940046017307,
12
  "total_flos": 1.0586630697202483e+18,
13
- "train_loss": 1.7646983184129434,
14
- "train_runtime": 7367.504,
15
- "train_samples_per_second": 77.363,
16
- "train_steps_per_second": 1.209
17
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.41932913712054815,
4
+ "eval_bleu": 0.1439933070551526,
5
+ "eval_loss": 3.1860642433166504,
6
+ "eval_perplexity": 24.19302197197161,
7
+ "eval_runtime": 11.0615,
8
  "eval_samples": 1141,
9
+ "eval_samples_per_second": 103.151,
10
+ "eval_steps_per_second": 1.627,
11
+ "perplexity": 24.19302197197161,
12
  "total_flos": 1.0586630697202483e+18,
13
+ "train_loss": 3.7704881070840237,
14
+ "train_runtime": 13409.8958,
15
+ "train_samples_per_second": 42.504,
16
+ "train_steps_per_second": 0.664
17
  }
eval_metrics.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_loss": 9.138617515563965,
3
+ "eval_model_preparation_time": 0.0028,
4
+ "eval_accuracy": 0.003479138448463602,
5
+ "eval_perplexity": 9307.888222218875,
6
+ "eval_bleu": 0.0006759862222758408,
7
+ "eval_runtime": 6.6151,
8
+ "eval_samples_per_second": 172.483,
9
+ "eval_steps_per_second": 2.721
10
+ }
eval_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "epoch": 5.0,
3
- "eval_accuracy": 0.41983802858530744,
4
- "eval_bleu": 0.14543247859716402,
5
- "eval_loss": 3.180941343307495,
6
- "eval_perplexity": 24.06940046017307,
7
- "eval_runtime": 11.0676,
8
  "eval_samples": 1141,
9
- "eval_samples_per_second": 103.094,
10
- "eval_steps_per_second": 1.626,
11
- "perplexity": 24.06940046017307
12
  }
 
1
  {
2
  "epoch": 5.0,
3
+ "eval_accuracy": 0.41932913712054815,
4
+ "eval_bleu": 0.1439933070551526,
5
+ "eval_loss": 3.1860642433166504,
6
+ "eval_perplexity": 24.19302197197161,
7
+ "eval_runtime": 11.0615,
8
  "eval_samples": 1141,
9
+ "eval_samples_per_second": 103.151,
10
+ "eval_steps_per_second": 1.627,
11
+ "perplexity": 24.19302197197161
12
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9062c5cf6e74adbf118174e57a33cbd4d8758e18adb16a98323b4894d9c43298
3
  size 1419322880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8022a0380584f3db97c80c2aa17051382e9ba2c24c5e39b4212b39d88a89bdd2
3
  size 1419322880
output_models_parallel-mean-bottleneck-gpt2-medium-wikitext_BluebrainAI___wikitext-103-raw-v1-seq1024-tokenized-grouped_default_0.0.0_e6ef704dbf95ade0b1fde03c931b3112df6ad731.lock ADDED
File without changes
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 5.0,
3
  "total_flos": 1.0586630697202483e+18,
4
- "train_loss": 1.7646983184129434,
5
- "train_runtime": 7367.504,
6
- "train_samples_per_second": 77.363,
7
- "train_steps_per_second": 1.209
8
  }
 
1
  {
2
  "epoch": 5.0,
3
  "total_flos": 1.0586630697202483e+18,
4
+ "train_loss": 3.7704881070840237,
5
+ "train_runtime": 13409.8958,
6
+ "train_samples_per_second": 42.504,
7
+ "train_steps_per_second": 0.664
8
  }
trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 3.185894727706909,
3
  "best_model_checkpoint": "./output/models/parallel-mean-bottleneck-gpt2-medium-wikitext/checkpoint-8500",
4
  "epoch": 5.0,
5
  "eval_steps": 500,
@@ -10,822 +10,822 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.05611672278338945,
13
- "grad_norm": 1.5436944961547852,
14
  "learning_rate": 1.1223344556677892e-05,
15
  "loss": 8.933,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.1122334455667789,
20
- "grad_norm": 1.0424357652664185,
21
  "learning_rate": 2.2446689113355783e-05,
22
  "loss": 7.3249,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.16835016835016836,
27
- "grad_norm": 1.080315351486206,
28
  "learning_rate": 3.3670033670033675e-05,
29
  "loss": 6.6032,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.2244668911335578,
34
- "grad_norm": 1.2949663400650024,
35
  "learning_rate": 4.4893378226711566e-05,
36
- "loss": 6.2747,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.28058361391694725,
41
- "grad_norm": 1.1373127698898315,
42
  "learning_rate": 5.611672278338945e-05,
43
- "loss": 6.0432,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.28058361391694725,
48
- "eval_accuracy": 0.1909473862768935,
49
- "eval_bleu": 0.03783860296240099,
50
- "eval_loss": 5.917980670928955,
51
- "eval_perplexity": 371.66045085486866,
52
- "eval_runtime": 11.0079,
53
- "eval_samples_per_second": 103.653,
54
- "eval_steps_per_second": 1.635,
55
  "step": 500
56
  },
57
  {
58
  "epoch": 0.3367003367003367,
59
- "grad_norm": 1.0700047016143799,
60
  "learning_rate": 6.734006734006735e-05,
61
- "loss": 5.8439,
62
  "step": 600
63
  },
64
  {
65
  "epoch": 0.39281705948372614,
66
- "grad_norm": 1.4188077449798584,
67
  "learning_rate": 7.856341189674523e-05,
68
- "loss": 5.6426,
69
  "step": 700
70
  },
71
  {
72
  "epoch": 0.4489337822671156,
73
- "grad_norm": 1.1133205890655518,
74
  "learning_rate": 8.978675645342313e-05,
75
- "loss": 5.464,
76
  "step": 800
77
  },
78
  {
79
  "epoch": 0.5050505050505051,
80
- "grad_norm": 1.3167998790740967,
81
  "learning_rate": 9.988776655443322e-05,
82
- "loss": 5.25,
83
  "step": 900
84
  },
85
  {
86
  "epoch": 0.5611672278338945,
87
- "grad_norm": 1.0362534523010254,
88
  "learning_rate": 9.864072827035791e-05,
89
- "loss": 5.0476,
90
  "step": 1000
91
  },
92
  {
93
  "epoch": 0.5611672278338945,
94
- "eval_accuracy": 0.26329735967574874,
95
- "eval_bleu": 0.06118160815453846,
96
- "eval_loss": 4.8985185623168945,
97
- "eval_perplexity": 134.0909850321564,
98
- "eval_runtime": 11.0771,
99
- "eval_samples_per_second": 103.006,
100
- "eval_steps_per_second": 1.625,
101
  "step": 1000
102
  },
103
  {
104
  "epoch": 0.6172839506172839,
105
- "grad_norm": 0.9963632225990295,
106
  "learning_rate": 9.73936899862826e-05,
107
- "loss": 4.8705,
108
  "step": 1100
109
  },
110
  {
111
  "epoch": 0.6734006734006734,
112
- "grad_norm": 0.9135327935218811,
113
  "learning_rate": 9.614665170220725e-05,
114
- "loss": 4.7183,
115
  "step": 1200
116
  },
117
  {
118
  "epoch": 0.7295173961840629,
119
- "grad_norm": 0.9141886830329895,
120
  "learning_rate": 9.489961341813194e-05,
121
- "loss": 4.5877,
122
  "step": 1300
123
  },
124
  {
125
  "epoch": 0.7856341189674523,
126
- "grad_norm": 0.8190032839775085,
127
  "learning_rate": 9.365257513405662e-05,
128
- "loss": 4.4743,
129
  "step": 1400
130
  },
131
  {
132
  "epoch": 0.8417508417508418,
133
- "grad_norm": 0.8178410530090332,
134
  "learning_rate": 9.24055368499813e-05,
135
- "loss": 4.3528,
136
  "step": 1500
137
  },
138
  {
139
  "epoch": 0.8417508417508418,
140
- "eval_accuracy": 0.3181762495041735,
141
- "eval_bleu": 0.08335013046001274,
142
- "eval_loss": 4.239789962768555,
143
- "eval_perplexity": 69.39327513659308,
144
- "eval_runtime": 11.1029,
145
- "eval_samples_per_second": 102.766,
146
- "eval_steps_per_second": 1.621,
147
  "step": 1500
148
  },
149
  {
150
  "epoch": 0.8978675645342312,
151
- "grad_norm": 0.9094557166099548,
152
  "learning_rate": 9.115849856590598e-05,
153
- "loss": 4.2651,
154
  "step": 1600
155
  },
156
  {
157
  "epoch": 0.9539842873176206,
158
- "grad_norm": 0.8006865382194519,
159
  "learning_rate": 8.991146028183066e-05,
160
- "loss": 4.1573,
161
  "step": 1700
162
  },
163
  {
164
  "epoch": 1.0101010101010102,
165
- "grad_norm": 0.8540999293327332,
166
  "learning_rate": 8.866442199775533e-05,
167
- "loss": 4.0896,
168
  "step": 1800
169
  },
170
  {
171
  "epoch": 1.0662177328843996,
172
- "grad_norm": 0.8293948173522949,
173
  "learning_rate": 8.741738371368002e-05,
174
- "loss": 4.0039,
175
  "step": 1900
176
  },
177
  {
178
  "epoch": 1.122334455667789,
179
- "grad_norm": 0.7962952256202698,
180
  "learning_rate": 8.617034542960469e-05,
181
- "loss": 3.9497,
182
  "step": 2000
183
  },
184
  {
185
  "epoch": 1.122334455667789,
186
- "eval_accuracy": 0.35195670481639213,
187
- "eval_bleu": 0.10543076899138493,
188
- "eval_loss": 3.8878896236419678,
189
- "eval_perplexity": 48.807774985506946,
190
- "eval_runtime": 11.1215,
191
- "eval_samples_per_second": 102.594,
192
- "eval_steps_per_second": 1.618,
193
  "step": 2000
194
  },
195
  {
196
  "epoch": 1.1784511784511784,
197
- "grad_norm": 0.7807645797729492,
198
  "learning_rate": 8.492330714552937e-05,
199
- "loss": 3.9058,
200
  "step": 2100
201
  },
202
  {
203
  "epoch": 1.2345679012345678,
204
- "grad_norm": 0.7973741292953491,
205
  "learning_rate": 8.367626886145406e-05,
206
- "loss": 3.859,
207
  "step": 2200
208
  },
209
  {
210
  "epoch": 1.2906846240179575,
211
- "grad_norm": 0.7875047922134399,
212
  "learning_rate": 8.242923057737873e-05,
213
- "loss": 3.84,
214
  "step": 2300
215
  },
216
  {
217
  "epoch": 1.3468013468013469,
218
- "grad_norm": 0.7630696296691895,
219
  "learning_rate": 8.11821922933034e-05,
220
- "loss": 3.7989,
221
  "step": 2400
222
  },
223
  {
224
  "epoch": 1.4029180695847363,
225
- "grad_norm": 0.7279968857765198,
226
  "learning_rate": 7.993515400922809e-05,
227
- "loss": 3.7614,
228
  "step": 2500
229
  },
230
  {
231
  "epoch": 1.4029180695847363,
232
- "eval_accuracy": 0.367412783799089,
233
- "eval_bleu": 0.12071702163720974,
234
- "eval_loss": 3.7127673625946045,
235
- "eval_perplexity": 40.967020402690565,
236
- "eval_runtime": 11.2307,
237
- "eval_samples_per_second": 101.596,
238
- "eval_steps_per_second": 1.603,
239
  "step": 2500
240
  },
241
  {
242
  "epoch": 1.4590347923681257,
243
- "grad_norm": 0.7240808606147766,
244
  "learning_rate": 7.868811572515277e-05,
245
- "loss": 3.7429,
246
  "step": 2600
247
  },
248
  {
249
  "epoch": 1.5151515151515151,
250
- "grad_norm": 0.7098649144172668,
251
  "learning_rate": 7.744107744107744e-05,
252
- "loss": 3.7126,
253
  "step": 2700
254
  },
255
  {
256
  "epoch": 1.5712682379349046,
257
- "grad_norm": 0.7229514122009277,
258
  "learning_rate": 7.619403915700213e-05,
259
  "loss": 3.6816,
260
  "step": 2800
261
  },
262
  {
263
  "epoch": 1.627384960718294,
264
- "grad_norm": 0.7334359288215637,
265
  "learning_rate": 7.49470008729268e-05,
266
- "loss": 3.6683,
267
  "step": 2900
268
  },
269
  {
270
  "epoch": 1.6835016835016834,
271
- "grad_norm": 0.7653447389602661,
272
  "learning_rate": 7.369996258885148e-05,
273
- "loss": 3.6543,
274
  "step": 3000
275
  },
276
  {
277
  "epoch": 1.6835016835016834,
278
- "eval_accuracy": 0.37795900253846026,
279
- "eval_bleu": 0.13095688045902315,
280
- "eval_loss": 3.590174913406372,
281
- "eval_perplexity": 36.24041430644117,
282
- "eval_runtime": 11.0101,
283
- "eval_samples_per_second": 103.632,
284
- "eval_steps_per_second": 1.635,
285
  "step": 3000
286
  },
287
  {
288
  "epoch": 1.7396184062850728,
289
- "grad_norm": 0.7163519263267517,
290
  "learning_rate": 7.245292430477615e-05,
291
- "loss": 3.621,
292
  "step": 3100
293
  },
294
  {
295
  "epoch": 1.7957351290684624,
296
- "grad_norm": 0.7292365431785583,
297
  "learning_rate": 7.120588602070084e-05,
298
- "loss": 3.6054,
299
  "step": 3200
300
  },
301
  {
302
  "epoch": 1.8518518518518519,
303
- "grad_norm": 0.7313345670700073,
304
  "learning_rate": 6.995884773662552e-05,
305
- "loss": 3.578,
306
  "step": 3300
307
  },
308
  {
309
  "epoch": 1.9079685746352413,
310
- "grad_norm": 0.7751216292381287,
311
  "learning_rate": 6.871180945255021e-05,
312
- "loss": 3.5709,
313
  "step": 3400
314
  },
315
  {
316
  "epoch": 1.964085297418631,
317
- "grad_norm": 0.7018395066261292,
318
  "learning_rate": 6.746477116847487e-05,
319
  "loss": 3.5527,
320
  "step": 3500
321
  },
322
  {
323
  "epoch": 1.964085297418631,
324
- "eval_accuracy": 0.38635314154807526,
325
- "eval_bleu": 0.13373604633044894,
326
- "eval_loss": 3.504826545715332,
327
- "eval_perplexity": 33.2756715439979,
328
- "eval_runtime": 11.1003,
329
- "eval_samples_per_second": 102.79,
330
- "eval_steps_per_second": 1.622,
331
  "step": 3500
332
  },
333
  {
334
  "epoch": 2.0202020202020203,
335
- "grad_norm": 0.7037595510482788,
336
  "learning_rate": 6.621773288439955e-05,
337
- "loss": 3.5173,
338
  "step": 3600
339
  },
340
  {
341
  "epoch": 2.0763187429854097,
342
- "grad_norm": 0.745313286781311,
343
  "learning_rate": 6.497069460032424e-05,
344
- "loss": 3.4633,
345
  "step": 3700
346
  },
347
  {
348
  "epoch": 2.132435465768799,
349
- "grad_norm": 0.7386572360992432,
350
  "learning_rate": 6.372365631624892e-05,
351
- "loss": 3.4408,
352
  "step": 3800
353
  },
354
  {
355
  "epoch": 2.1885521885521886,
356
- "grad_norm": 0.709396243095398,
357
  "learning_rate": 6.247661803217359e-05,
358
  "loss": 3.4414,
359
  "step": 3900
360
  },
361
  {
362
  "epoch": 2.244668911335578,
363
- "grad_norm": 0.7127860188484192,
364
  "learning_rate": 6.122957974809826e-05,
365
- "loss": 3.4348,
366
  "step": 4000
367
  },
368
  {
369
  "epoch": 2.244668911335578,
370
- "eval_accuracy": 0.39228335487983224,
371
- "eval_bleu": 0.13612405667589095,
372
- "eval_loss": 3.4400899410247803,
373
- "eval_perplexity": 31.189763281432384,
374
- "eval_runtime": 11.0954,
375
- "eval_samples_per_second": 102.835,
376
- "eval_steps_per_second": 1.622,
377
  "step": 4000
378
  },
379
  {
380
  "epoch": 2.3007856341189674,
381
- "grad_norm": 0.7178986668586731,
382
  "learning_rate": 5.998254146402295e-05,
383
- "loss": 3.4225,
384
  "step": 4100
385
  },
386
  {
387
  "epoch": 2.356902356902357,
388
- "grad_norm": 0.701989471912384,
389
  "learning_rate": 5.8735503179947625e-05,
390
- "loss": 3.4018,
391
  "step": 4200
392
  },
393
  {
394
  "epoch": 2.4130190796857462,
395
- "grad_norm": 0.7325447797775269,
396
  "learning_rate": 5.748846489587231e-05,
397
  "loss": 3.3989,
398
  "step": 4300
399
  },
400
  {
401
  "epoch": 2.4691358024691357,
402
- "grad_norm": 0.7156170010566711,
403
  "learning_rate": 5.624142661179699e-05,
404
- "loss": 3.3827,
405
  "step": 4400
406
  },
407
  {
408
  "epoch": 2.525252525252525,
409
- "grad_norm": 0.7117893099784851,
410
  "learning_rate": 5.4994388327721666e-05,
411
- "loss": 3.3739,
412
  "step": 4500
413
  },
414
  {
415
  "epoch": 2.525252525252525,
416
- "eval_accuracy": 0.3973971143969165,
417
- "eval_bleu": 0.1418527614095323,
418
- "eval_loss": 3.3868210315704346,
419
- "eval_perplexity": 29.57179488564003,
420
- "eval_runtime": 10.9961,
421
- "eval_samples_per_second": 103.765,
422
- "eval_steps_per_second": 1.637,
423
  "step": 4500
424
  },
425
  {
426
  "epoch": 2.581369248035915,
427
- "grad_norm": 0.7047144770622253,
428
  "learning_rate": 5.374735004364634e-05,
429
- "loss": 3.3787,
430
  "step": 4600
431
  },
432
  {
433
  "epoch": 2.637485970819304,
434
- "grad_norm": 0.6755483746528625,
435
  "learning_rate": 5.250031175957102e-05,
436
- "loss": 3.3641,
437
  "step": 4700
438
  },
439
  {
440
  "epoch": 2.6936026936026938,
441
- "grad_norm": 0.7206361889839172,
442
  "learning_rate": 5.12532734754957e-05,
443
  "loss": 3.3589,
444
  "step": 4800
445
  },
446
  {
447
  "epoch": 2.749719416386083,
448
- "grad_norm": 0.6900231838226318,
449
  "learning_rate": 5.000623519142038e-05,
450
- "loss": 3.3492,
451
  "step": 4900
452
  },
453
  {
454
  "epoch": 2.8058361391694726,
455
- "grad_norm": 0.7102543115615845,
456
  "learning_rate": 4.8759196907345056e-05,
457
- "loss": 3.3441,
458
  "step": 5000
459
  },
460
  {
461
  "epoch": 2.8058361391694726,
462
- "eval_accuracy": 0.4019574330280841,
463
- "eval_bleu": 0.13938720153734416,
464
- "eval_loss": 3.3418636322021484,
465
- "eval_perplexity": 28.271765813386498,
466
- "eval_runtime": 11.1267,
467
- "eval_samples_per_second": 102.546,
468
- "eval_steps_per_second": 1.618,
469
  "step": 5000
470
  },
471
  {
472
  "epoch": 2.861952861952862,
473
- "grad_norm": 0.7073910236358643,
474
  "learning_rate": 4.751215862326974e-05,
475
- "loss": 3.3419,
476
  "step": 5100
477
  },
478
  {
479
  "epoch": 2.9180695847362514,
480
- "grad_norm": 0.6984754800796509,
481
  "learning_rate": 4.626512033919442e-05,
482
- "loss": 3.3203,
483
  "step": 5200
484
  },
485
  {
486
  "epoch": 2.974186307519641,
487
- "grad_norm": 0.6974130272865295,
488
  "learning_rate": 4.5018082055119096e-05,
489
- "loss": 3.3243,
490
  "step": 5300
491
  },
492
  {
493
  "epoch": 3.0303030303030303,
494
- "grad_norm": 0.7236476540565491,
495
  "learning_rate": 4.3771043771043774e-05,
496
- "loss": 3.2725,
497
  "step": 5400
498
  },
499
  {
500
  "epoch": 3.0864197530864197,
501
- "grad_norm": 0.7239139080047607,
502
  "learning_rate": 4.252400548696845e-05,
503
- "loss": 3.2252,
504
  "step": 5500
505
  },
506
  {
507
  "epoch": 3.0864197530864197,
508
- "eval_accuracy": 0.4057432771068235,
509
- "eval_bleu": 0.1432212835687173,
510
- "eval_loss": 3.306666374206543,
511
- "eval_perplexity": 27.29398570860935,
512
- "eval_runtime": 11.1335,
513
- "eval_samples_per_second": 102.483,
514
- "eval_steps_per_second": 1.617,
515
  "step": 5500
516
  },
517
  {
518
  "epoch": 3.142536475869809,
519
- "grad_norm": 0.737918496131897,
520
  "learning_rate": 4.127696720289313e-05,
521
- "loss": 3.2316,
522
  "step": 5600
523
  },
524
  {
525
  "epoch": 3.1986531986531985,
526
- "grad_norm": 0.7476251125335693,
527
  "learning_rate": 4.002992891881781e-05,
528
- "loss": 3.2276,
529
  "step": 5700
530
  },
531
  {
532
  "epoch": 3.254769921436588,
533
- "grad_norm": 0.7400563359260559,
534
  "learning_rate": 3.8782890634742486e-05,
535
- "loss": 3.224,
536
  "step": 5800
537
  },
538
  {
539
  "epoch": 3.3108866442199774,
540
- "grad_norm": 0.7361284494400024,
541
  "learning_rate": 3.7535852350667164e-05,
542
- "loss": 3.2252,
543
  "step": 5900
544
  },
545
  {
546
  "epoch": 3.3670033670033668,
547
- "grad_norm": 0.7335928082466125,
548
  "learning_rate": 3.628881406659185e-05,
549
- "loss": 3.2188,
550
  "step": 6000
551
  },
552
  {
553
  "epoch": 3.3670033670033668,
554
- "eval_accuracy": 0.4087632138295111,
555
- "eval_bleu": 0.14205763499672933,
556
- "eval_loss": 3.2775487899780273,
557
- "eval_perplexity": 26.510709673638566,
558
- "eval_runtime": 11.1182,
559
- "eval_samples_per_second": 102.625,
560
- "eval_steps_per_second": 1.619,
561
  "step": 6000
562
  },
563
  {
564
  "epoch": 3.4231200897867566,
565
- "grad_norm": 0.7380982637405396,
566
  "learning_rate": 3.504177578251652e-05,
567
- "loss": 3.1985,
568
  "step": 6100
569
  },
570
  {
571
  "epoch": 3.479236812570146,
572
- "grad_norm": 0.7403559684753418,
573
  "learning_rate": 3.3794737498441205e-05,
574
- "loss": 3.2097,
575
  "step": 6200
576
  },
577
  {
578
  "epoch": 3.5353535353535355,
579
- "grad_norm": 0.7356846928596497,
580
  "learning_rate": 3.254769921436588e-05,
581
- "loss": 3.1995,
582
  "step": 6300
583
  },
584
  {
585
  "epoch": 3.591470258136925,
586
- "grad_norm": 0.731614887714386,
587
  "learning_rate": 3.130066093029056e-05,
588
- "loss": 3.201,
589
  "step": 6400
590
  },
591
  {
592
  "epoch": 3.6475869809203143,
593
- "grad_norm": 0.7287305593490601,
594
  "learning_rate": 3.0053622646215242e-05,
595
- "loss": 3.1971,
596
  "step": 6500
597
  },
598
  {
599
  "epoch": 3.6475869809203143,
600
- "eval_accuracy": 0.4115484093714848,
601
- "eval_bleu": 0.14263540857108656,
602
- "eval_loss": 3.250213146209717,
603
- "eval_perplexity": 25.795837616279766,
604
- "eval_runtime": 11.1156,
605
- "eval_samples_per_second": 102.648,
606
- "eval_steps_per_second": 1.619,
607
  "step": 6500
608
  },
609
  {
610
  "epoch": 3.7037037037037037,
611
- "grad_norm": 0.730910062789917,
612
  "learning_rate": 2.880658436213992e-05,
613
- "loss": 3.1895,
614
  "step": 6600
615
  },
616
  {
617
  "epoch": 3.759820426487093,
618
- "grad_norm": 0.7339698672294617,
619
  "learning_rate": 2.7559546078064598e-05,
620
- "loss": 3.1804,
621
  "step": 6700
622
  },
623
  {
624
  "epoch": 3.8159371492704826,
625
- "grad_norm": 0.7487326860427856,
626
  "learning_rate": 2.6312507793989276e-05,
627
- "loss": 3.171,
628
  "step": 6800
629
  },
630
  {
631
  "epoch": 3.872053872053872,
632
- "grad_norm": 0.7359170913696289,
633
  "learning_rate": 2.5065469509913957e-05,
634
- "loss": 3.1629,
635
  "step": 6900
636
  },
637
  {
638
  "epoch": 3.9281705948372614,
639
- "grad_norm": 0.7311303019523621,
640
  "learning_rate": 2.3818431225838632e-05,
641
- "loss": 3.1722,
642
  "step": 7000
643
  },
644
  {
645
  "epoch": 3.9281705948372614,
646
- "eval_accuracy": 0.41430276300650337,
647
- "eval_bleu": 0.14455082637881378,
648
- "eval_loss": 3.2265915870666504,
649
- "eval_perplexity": 25.193640134914105,
650
- "eval_runtime": 11.1079,
651
- "eval_samples_per_second": 102.719,
652
- "eval_steps_per_second": 1.62,
653
  "step": 7000
654
  },
655
  {
656
  "epoch": 3.984287317620651,
657
- "grad_norm": 0.740738034248352,
658
  "learning_rate": 2.2571392941763313e-05,
659
- "loss": 3.1681,
660
  "step": 7100
661
  },
662
  {
663
  "epoch": 4.040404040404041,
664
- "grad_norm": 0.7563683986663818,
665
  "learning_rate": 2.132435465768799e-05,
666
- "loss": 3.1033,
667
  "step": 7200
668
  },
669
  {
670
  "epoch": 4.09652076318743,
671
- "grad_norm": 0.7448651194572449,
672
  "learning_rate": 2.007731637361267e-05,
673
- "loss": 3.0994,
674
  "step": 7300
675
  },
676
  {
677
  "epoch": 4.1526374859708195,
678
- "grad_norm": 0.7695390582084656,
679
  "learning_rate": 1.883027808953735e-05,
680
- "loss": 3.1011,
681
  "step": 7400
682
  },
683
  {
684
  "epoch": 4.2087542087542085,
685
- "grad_norm": 0.7607565522193909,
686
  "learning_rate": 1.758323980546203e-05,
687
- "loss": 3.1052,
688
  "step": 7500
689
  },
690
  {
691
  "epoch": 4.2087542087542085,
692
- "eval_accuracy": 0.41628521224800663,
693
- "eval_bleu": 0.14328695475901032,
694
- "eval_loss": 3.210294723510742,
695
- "eval_perplexity": 24.78639028015837,
696
- "eval_runtime": 11.0972,
697
- "eval_samples_per_second": 102.819,
698
- "eval_steps_per_second": 1.622,
699
  "step": 7500
700
  },
701
  {
702
  "epoch": 4.264870931537598,
703
- "grad_norm": 0.766854465007782,
704
  "learning_rate": 1.6336201521386706e-05,
705
  "loss": 3.0911,
706
  "step": 7600
707
  },
708
  {
709
  "epoch": 4.320987654320987,
710
- "grad_norm": 0.7667710185050964,
711
  "learning_rate": 1.5089163237311384e-05,
712
- "loss": 3.0779,
713
  "step": 7700
714
  },
715
  {
716
  "epoch": 4.377104377104377,
717
- "grad_norm": 0.7539135813713074,
718
  "learning_rate": 1.3842124953236066e-05,
719
- "loss": 3.0881,
720
  "step": 7800
721
  },
722
  {
723
  "epoch": 4.433221099887767,
724
- "grad_norm": 0.768229603767395,
725
  "learning_rate": 1.2595086669160744e-05,
726
- "loss": 3.0859,
727
  "step": 7900
728
  },
729
  {
730
  "epoch": 4.489337822671156,
731
- "grad_norm": 0.7591537833213806,
732
  "learning_rate": 1.1348048385085423e-05,
733
- "loss": 3.0672,
734
  "step": 8000
735
  },
736
  {
737
  "epoch": 4.489337822671156,
738
- "eval_accuracy": 0.41795752898068356,
739
- "eval_bleu": 0.1437527142843717,
740
- "eval_loss": 3.196687936782837,
741
- "eval_perplexity": 24.45141131580561,
742
- "eval_runtime": 11.1317,
743
- "eval_samples_per_second": 102.5,
744
- "eval_steps_per_second": 1.617,
745
  "step": 8000
746
  },
747
  {
748
  "epoch": 4.545454545454545,
749
- "grad_norm": 0.7729194760322571,
750
  "learning_rate": 1.0101010101010101e-05,
751
- "loss": 3.0826,
752
  "step": 8100
753
  },
754
  {
755
  "epoch": 4.601571268237935,
756
- "grad_norm": 0.7760916948318481,
757
  "learning_rate": 8.853971816934781e-06,
758
- "loss": 3.0755,
759
  "step": 8200
760
  },
761
  {
762
  "epoch": 4.657687991021325,
763
- "grad_norm": 0.7464035153388977,
764
  "learning_rate": 7.606933532859459e-06,
765
- "loss": 3.0829,
766
  "step": 8300
767
  },
768
  {
769
  "epoch": 4.713804713804714,
770
- "grad_norm": 0.7666931748390198,
771
  "learning_rate": 6.359895248784138e-06,
772
- "loss": 3.0744,
773
  "step": 8400
774
  },
775
  {
776
  "epoch": 4.7699214365881035,
777
- "grad_norm": 0.7630265355110168,
778
  "learning_rate": 5.112856964708817e-06,
779
- "loss": 3.0774,
780
  "step": 8500
781
  },
782
  {
783
  "epoch": 4.7699214365881035,
784
- "eval_accuracy": 0.41937625670061845,
785
- "eval_bleu": 0.14605654521068695,
786
- "eval_loss": 3.185894727706909,
787
- "eval_perplexity": 24.18892122468071,
788
- "eval_runtime": 11.0781,
789
- "eval_samples_per_second": 102.996,
790
- "eval_steps_per_second": 1.625,
791
  "step": 8500
792
  },
793
  {
794
  "epoch": 4.8260381593714925,
795
- "grad_norm": 0.7553364038467407,
796
  "learning_rate": 3.865818680633495e-06,
797
- "loss": 3.0679,
798
  "step": 8600
799
  },
800
  {
801
  "epoch": 4.882154882154882,
802
- "grad_norm": 0.7630147933959961,
803
  "learning_rate": 2.6187803965581742e-06,
804
- "loss": 3.0784,
805
  "step": 8700
806
  },
807
  {
808
  "epoch": 4.938271604938271,
809
- "grad_norm": 0.7633751034736633,
810
  "learning_rate": 1.3717421124828533e-06,
811
- "loss": 3.0748,
812
  "step": 8800
813
  },
814
  {
815
  "epoch": 4.994388327721661,
816
- "grad_norm": 0.7511401772499084,
817
  "learning_rate": 1.2470382840753213e-07,
818
- "loss": 3.0648,
819
  "step": 8900
820
  },
821
  {
822
  "epoch": 5.0,
823
  "step": 8910,
824
  "total_flos": 1.0586630697202483e+18,
825
- "train_loss": 1.7646983184129434,
826
- "train_runtime": 7367.504,
827
- "train_samples_per_second": 77.363,
828
- "train_steps_per_second": 1.209
829
  }
830
  ],
831
  "logging_steps": 100,
 
1
  {
2
+ "best_metric": 3.1860642433166504,
3
  "best_model_checkpoint": "./output/models/parallel-mean-bottleneck-gpt2-medium-wikitext/checkpoint-8500",
4
  "epoch": 5.0,
5
  "eval_steps": 500,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.05611672278338945,
13
+ "grad_norm": 1.54179847240448,
14
  "learning_rate": 1.1223344556677892e-05,
15
  "loss": 8.933,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.1122334455667789,
20
+ "grad_norm": 1.036051869392395,
21
  "learning_rate": 2.2446689113355783e-05,
22
  "loss": 7.3249,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.16835016835016836,
27
+ "grad_norm": 1.0635498762130737,
28
  "learning_rate": 3.3670033670033675e-05,
29
  "loss": 6.6032,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.2244668911335578,
34
+ "grad_norm": 1.8017141819000244,
35
  "learning_rate": 4.4893378226711566e-05,
36
+ "loss": 6.276,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.28058361391694725,
41
+ "grad_norm": 1.3540623188018799,
42
  "learning_rate": 5.611672278338945e-05,
43
+ "loss": 6.0438,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.28058361391694725,
48
+ "eval_accuracy": 0.189721420475428,
49
+ "eval_bleu": 0.03593862692187448,
50
+ "eval_loss": 5.919970989227295,
51
+ "eval_perplexity": 372.40091008131225,
52
+ "eval_runtime": 11.0176,
53
+ "eval_samples_per_second": 103.562,
54
+ "eval_steps_per_second": 1.634,
55
  "step": 500
56
  },
57
  {
58
  "epoch": 0.3367003367003367,
59
+ "grad_norm": 1.158894658088684,
60
  "learning_rate": 6.734006734006735e-05,
61
+ "loss": 5.8447,
62
  "step": 600
63
  },
64
  {
65
  "epoch": 0.39281705948372614,
66
+ "grad_norm": 1.1720908880233765,
67
  "learning_rate": 7.856341189674523e-05,
68
+ "loss": 5.6413,
69
  "step": 700
70
  },
71
  {
72
  "epoch": 0.4489337822671156,
73
+ "grad_norm": 1.2613935470581055,
74
  "learning_rate": 8.978675645342313e-05,
75
+ "loss": 5.4607,
76
  "step": 800
77
  },
78
  {
79
  "epoch": 0.5050505050505051,
80
+ "grad_norm": 1.0967986583709717,
81
  "learning_rate": 9.988776655443322e-05,
82
+ "loss": 5.2463,
83
  "step": 900
84
  },
85
  {
86
  "epoch": 0.5611672278338945,
87
+ "grad_norm": 1.0036829710006714,
88
  "learning_rate": 9.864072827035791e-05,
89
+ "loss": 5.0422,
90
  "step": 1000
91
  },
92
  {
93
  "epoch": 0.5611672278338945,
94
+ "eval_accuracy": 0.2635603726045048,
95
+ "eval_bleu": 0.06103039117966378,
96
+ "eval_loss": 4.893420696258545,
97
+ "eval_perplexity": 133.40914658800884,
98
+ "eval_runtime": 11.0697,
99
+ "eval_samples_per_second": 103.074,
100
+ "eval_steps_per_second": 1.626,
101
  "step": 1000
102
  },
103
  {
104
  "epoch": 0.6172839506172839,
105
+ "grad_norm": 0.9532070159912109,
106
  "learning_rate": 9.73936899862826e-05,
107
+ "loss": 4.8675,
108
  "step": 1100
109
  },
110
  {
111
  "epoch": 0.6734006734006734,
112
+ "grad_norm": 0.9311050772666931,
113
  "learning_rate": 9.614665170220725e-05,
114
+ "loss": 4.7157,
115
  "step": 1200
116
  },
117
  {
118
  "epoch": 0.7295173961840629,
119
+ "grad_norm": 0.9085267782211304,
120
  "learning_rate": 9.489961341813194e-05,
121
+ "loss": 4.5856,
122
  "step": 1300
123
  },
124
  {
125
  "epoch": 0.7856341189674523,
126
+ "grad_norm": 0.8684507608413696,
127
  "learning_rate": 9.365257513405662e-05,
128
+ "loss": 4.4711,
129
  "step": 1400
130
  },
131
  {
132
  "epoch": 0.8417508417508418,
133
+ "grad_norm": 0.8369265794754028,
134
  "learning_rate": 9.24055368499813e-05,
135
+ "loss": 4.3494,
136
  "step": 1500
137
  },
138
  {
139
  "epoch": 0.8417508417508418,
140
+ "eval_accuracy": 0.318285052898154,
141
+ "eval_bleu": 0.08333640625132697,
142
+ "eval_loss": 4.238930702209473,
143
+ "eval_perplexity": 69.33367384239048,
144
+ "eval_runtime": 11.0928,
145
+ "eval_samples_per_second": 102.859,
146
+ "eval_steps_per_second": 1.623,
147
  "step": 1500
148
  },
149
  {
150
  "epoch": 0.8978675645342312,
151
+ "grad_norm": 0.882328987121582,
152
  "learning_rate": 9.115849856590598e-05,
153
+ "loss": 4.2632,
154
  "step": 1600
155
  },
156
  {
157
  "epoch": 0.9539842873176206,
158
+ "grad_norm": 0.8303613662719727,
159
  "learning_rate": 8.991146028183066e-05,
160
+ "loss": 4.1548,
161
  "step": 1700
162
  },
163
  {
164
  "epoch": 1.0101010101010102,
165
+ "grad_norm": 0.8696436285972595,
166
  "learning_rate": 8.866442199775533e-05,
167
+ "loss": 4.0877,
168
  "step": 1800
169
  },
170
  {
171
  "epoch": 1.0662177328843996,
172
+ "grad_norm": 0.8291804194450378,
173
  "learning_rate": 8.741738371368002e-05,
174
+ "loss": 4.0032,
175
  "step": 1900
176
  },
177
  {
178
  "epoch": 1.122334455667789,
179
+ "grad_norm": 0.7963048815727234,
180
  "learning_rate": 8.617034542960469e-05,
181
+ "loss": 3.9486,
182
  "step": 2000
183
  },
184
  {
185
  "epoch": 1.122334455667789,
186
+ "eval_accuracy": 0.352146039856311,
187
+ "eval_bleu": 0.10372829354298996,
188
+ "eval_loss": 3.885582208633423,
189
+ "eval_perplexity": 48.695285023394725,
190
+ "eval_runtime": 11.0694,
191
+ "eval_samples_per_second": 103.077,
192
+ "eval_steps_per_second": 1.626,
193
  "step": 2000
194
  },
195
  {
196
  "epoch": 1.1784511784511784,
197
+ "grad_norm": 0.7901706099510193,
198
  "learning_rate": 8.492330714552937e-05,
199
+ "loss": 3.9051,
200
  "step": 2100
201
  },
202
  {
203
  "epoch": 1.2345679012345678,
204
+ "grad_norm": 0.8067757487297058,
205
  "learning_rate": 8.367626886145406e-05,
206
+ "loss": 3.8583,
207
  "step": 2200
208
  },
209
  {
210
  "epoch": 1.2906846240179575,
211
+ "grad_norm": 0.7848672866821289,
212
  "learning_rate": 8.242923057737873e-05,
213
+ "loss": 3.8389,
214
  "step": 2300
215
  },
216
  {
217
  "epoch": 1.3468013468013469,
218
+ "grad_norm": 0.7620055675506592,
219
  "learning_rate": 8.11821922933034e-05,
220
+ "loss": 3.7985,
221
  "step": 2400
222
  },
223
  {
224
  "epoch": 1.4029180695847363,
225
+ "grad_norm": 0.7321527600288391,
226
  "learning_rate": 7.993515400922809e-05,
227
+ "loss": 3.7605,
228
  "step": 2500
229
  },
230
  {
231
  "epoch": 1.4029180695847363,
232
+ "eval_accuracy": 0.36705039139236645,
233
+ "eval_bleu": 0.12056364590808125,
234
+ "eval_loss": 3.7143070697784424,
235
+ "eval_perplexity": 41.03014620345847,
236
+ "eval_runtime": 11.0834,
237
+ "eval_samples_per_second": 102.947,
238
+ "eval_steps_per_second": 1.624,
239
  "step": 2500
240
  },
241
  {
242
  "epoch": 1.4590347923681257,
243
+ "grad_norm": 0.7496013641357422,
244
  "learning_rate": 7.868811572515277e-05,
245
+ "loss": 3.7426,
246
  "step": 2600
247
  },
248
  {
249
  "epoch": 1.5151515151515151,
250
+ "grad_norm": 0.7082226872444153,
251
  "learning_rate": 7.744107744107744e-05,
252
+ "loss": 3.7123,
253
  "step": 2700
254
  },
255
  {
256
  "epoch": 1.5712682379349046,
257
+ "grad_norm": 0.7311062812805176,
258
  "learning_rate": 7.619403915700213e-05,
259
  "loss": 3.6816,
260
  "step": 2800
261
  },
262
  {
263
  "epoch": 1.627384960718294,
264
+ "grad_norm": 0.726064920425415,
265
  "learning_rate": 7.49470008729268e-05,
266
+ "loss": 3.6676,
267
  "step": 2900
268
  },
269
  {
270
  "epoch": 1.6835016835016834,
271
+ "grad_norm": 0.7564496994018555,
272
  "learning_rate": 7.369996258885148e-05,
273
+ "loss": 3.6544,
274
  "step": 3000
275
  },
276
  {
277
  "epoch": 1.6835016835016834,
278
+ "eval_accuracy": 0.37813634350345215,
279
+ "eval_bleu": 0.13317535043978146,
280
+ "eval_loss": 3.589838981628418,
281
+ "eval_perplexity": 36.22824204426872,
282
+ "eval_runtime": 11.0756,
283
+ "eval_samples_per_second": 103.019,
284
+ "eval_steps_per_second": 1.625,
285
  "step": 3000
286
  },
287
  {
288
  "epoch": 1.7396184062850728,
289
+ "grad_norm": 0.7231422066688538,
290
  "learning_rate": 7.245292430477615e-05,
291
+ "loss": 3.6204,
292
  "step": 3100
293
  },
294
  {
295
  "epoch": 1.7957351290684624,
296
+ "grad_norm": 0.7412242293357849,
297
  "learning_rate": 7.120588602070084e-05,
298
+ "loss": 3.605,
299
  "step": 3200
300
  },
301
  {
302
  "epoch": 1.8518518518518519,
303
+ "grad_norm": 0.7535395622253418,
304
  "learning_rate": 6.995884773662552e-05,
305
+ "loss": 3.5779,
306
  "step": 3300
307
  },
308
  {
309
  "epoch": 1.9079685746352413,
310
+ "grad_norm": 0.7545406222343445,
311
  "learning_rate": 6.871180945255021e-05,
312
+ "loss": 3.571,
313
  "step": 3400
314
  },
315
  {
316
  "epoch": 1.964085297418631,
317
+ "grad_norm": 0.716988742351532,
318
  "learning_rate": 6.746477116847487e-05,
319
  "loss": 3.5527,
320
  "step": 3500
321
  },
322
  {
323
  "epoch": 1.964085297418631,
324
+ "eval_accuracy": 0.3861560960314176,
325
+ "eval_bleu": 0.13491971280952403,
326
+ "eval_loss": 3.505063772201538,
327
+ "eval_perplexity": 33.283566351026074,
328
+ "eval_runtime": 11.0804,
329
+ "eval_samples_per_second": 102.974,
330
+ "eval_steps_per_second": 1.624,
331
  "step": 3500
332
  },
333
  {
334
  "epoch": 2.0202020202020203,
335
+ "grad_norm": 0.6990429759025574,
336
  "learning_rate": 6.621773288439955e-05,
337
+ "loss": 3.517,
338
  "step": 3600
339
  },
340
  {
341
  "epoch": 2.0763187429854097,
342
+ "grad_norm": 0.7404659390449524,
343
  "learning_rate": 6.497069460032424e-05,
344
+ "loss": 3.4636,
345
  "step": 3700
346
  },
347
  {
348
  "epoch": 2.132435465768799,
349
+ "grad_norm": 0.7441074848175049,
350
  "learning_rate": 6.372365631624892e-05,
351
+ "loss": 3.4411,
352
  "step": 3800
353
  },
354
  {
355
  "epoch": 2.1885521885521886,
356
+ "grad_norm": 0.701506495475769,
357
  "learning_rate": 6.247661803217359e-05,
358
  "loss": 3.4414,
359
  "step": 3900
360
  },
361
  {
362
  "epoch": 2.244668911335578,
363
+ "grad_norm": 0.7056393027305603,
364
  "learning_rate": 6.122957974809826e-05,
365
+ "loss": 3.4346,
366
  "step": 4000
367
  },
368
  {
369
  "epoch": 2.244668911335578,
370
+ "eval_accuracy": 0.39188926384651696,
371
+ "eval_bleu": 0.1334943569993854,
372
+ "eval_loss": 3.4409983158111572,
373
+ "eval_perplexity": 31.218108147922273,
374
+ "eval_runtime": 11.0871,
375
+ "eval_samples_per_second": 102.912,
376
+ "eval_steps_per_second": 1.624,
377
  "step": 4000
378
  },
379
  {
380
  "epoch": 2.3007856341189674,
381
+ "grad_norm": 0.7123896479606628,
382
  "learning_rate": 5.998254146402295e-05,
383
+ "loss": 3.4223,
384
  "step": 4100
385
  },
386
  {
387
  "epoch": 2.356902356902357,
388
+ "grad_norm": 0.7008007168769836,
389
  "learning_rate": 5.8735503179947625e-05,
390
+ "loss": 3.4019,
391
  "step": 4200
392
  },
393
  {
394
  "epoch": 2.4130190796857462,
395
+ "grad_norm": 0.7202064394950867,
396
  "learning_rate": 5.748846489587231e-05,
397
  "loss": 3.3989,
398
  "step": 4300
399
  },
400
  {
401
  "epoch": 2.4691358024691357,
402
+ "grad_norm": 0.7087401151657104,
403
  "learning_rate": 5.624142661179699e-05,
404
+ "loss": 3.3828,
405
  "step": 4400
406
  },
407
  {
408
  "epoch": 2.525252525252525,
409
+ "grad_norm": 0.7096906304359436,
410
  "learning_rate": 5.4994388327721666e-05,
411
+ "loss": 3.374,
412
  "step": 4500
413
  },
414
  {
415
  "epoch": 2.525252525252525,
416
+ "eval_accuracy": 0.3972454750210539,
417
+ "eval_bleu": 0.13536753244366495,
418
+ "eval_loss": 3.3866658210754395,
419
+ "eval_perplexity": 29.56720538889618,
420
+ "eval_runtime": 11.1089,
421
+ "eval_samples_per_second": 102.71,
422
+ "eval_steps_per_second": 1.62,
423
  "step": 4500
424
  },
425
  {
426
  "epoch": 2.581369248035915,
427
+ "grad_norm": 0.7192590832710266,
428
  "learning_rate": 5.374735004364634e-05,
429
+ "loss": 3.3785,
430
  "step": 4600
431
  },
432
  {
433
  "epoch": 2.637485970819304,
434
+ "grad_norm": 0.6848036646842957,
435
  "learning_rate": 5.250031175957102e-05,
436
+ "loss": 3.3642,
437
  "step": 4700
438
  },
439
  {
440
  "epoch": 2.6936026936026938,
441
+ "grad_norm": 0.7250380516052246,
442
  "learning_rate": 5.12532734754957e-05,
443
  "loss": 3.3589,
444
  "step": 4800
445
  },
446
  {
447
  "epoch": 2.749719416386083,
448
+ "grad_norm": 0.6998625993728638,
449
  "learning_rate": 5.000623519142038e-05,
450
+ "loss": 3.3491,
451
  "step": 4900
452
  },
453
  {
454
  "epoch": 2.8058361391694726,
455
+ "grad_norm": 0.7044927477836609,
456
  "learning_rate": 4.8759196907345056e-05,
457
+ "loss": 3.3442,
458
  "step": 5000
459
  },
460
  {
461
  "epoch": 2.8058361391694726,
462
+ "eval_accuracy": 0.40168071258512583,
463
+ "eval_bleu": 0.14053869067204397,
464
+ "eval_loss": 3.340980052947998,
465
+ "eval_perplexity": 28.24679650044353,
466
+ "eval_runtime": 11.0994,
467
+ "eval_samples_per_second": 102.798,
468
+ "eval_steps_per_second": 1.622,
469
  "step": 5000
470
  },
471
  {
472
  "epoch": 2.861952861952862,
473
+ "grad_norm": 0.7032910585403442,
474
  "learning_rate": 4.751215862326974e-05,
475
+ "loss": 3.342,
476
  "step": 5100
477
  },
478
  {
479
  "epoch": 2.9180695847362514,
480
+ "grad_norm": 0.6985570788383484,
481
  "learning_rate": 4.626512033919442e-05,
482
+ "loss": 3.3205,
483
  "step": 5200
484
  },
485
  {
486
  "epoch": 2.974186307519641,
487
+ "grad_norm": 0.7079312801361084,
488
  "learning_rate": 4.5018082055119096e-05,
489
+ "loss": 3.3244,
490
  "step": 5300
491
  },
492
  {
493
  "epoch": 3.0303030303030303,
494
+ "grad_norm": 0.7505896687507629,
495
  "learning_rate": 4.3771043771043774e-05,
496
+ "loss": 3.2727,
497
  "step": 5400
498
  },
499
  {
500
  "epoch": 3.0864197530864197,
501
+ "grad_norm": 0.7251821756362915,
502
  "learning_rate": 4.252400548696845e-05,
503
+ "loss": 3.2251,
504
  "step": 5500
505
  },
506
  {
507
  "epoch": 3.0864197530864197,
508
+ "eval_accuracy": 0.40551710312248607,
509
+ "eval_bleu": 0.14035240790883957,
510
+ "eval_loss": 3.3072268962860107,
511
+ "eval_perplexity": 27.3092888787174,
512
+ "eval_runtime": 11.1046,
513
+ "eval_samples_per_second": 102.75,
514
+ "eval_steps_per_second": 1.621,
515
  "step": 5500
516
  },
517
  {
518
  "epoch": 3.142536475869809,
519
+ "grad_norm": 0.7410485744476318,
520
  "learning_rate": 4.127696720289313e-05,
521
+ "loss": 3.2314,
522
  "step": 5600
523
  },
524
  {
525
  "epoch": 3.1986531986531985,
526
+ "grad_norm": 0.7442547678947449,
527
  "learning_rate": 4.002992891881781e-05,
528
+ "loss": 3.2277,
529
  "step": 5700
530
  },
531
  {
532
  "epoch": 3.254769921436588,
533
+ "grad_norm": 0.7330621480941772,
534
  "learning_rate": 3.8782890634742486e-05,
535
+ "loss": 3.2246,
536
  "step": 5800
537
  },
538
  {
539
  "epoch": 3.3108866442199774,
540
+ "grad_norm": 0.7348044514656067,
541
  "learning_rate": 3.7535852350667164e-05,
542
+ "loss": 3.2255,
543
  "step": 5900
544
  },
545
  {
546
  "epoch": 3.3670033670033668,
547
+ "grad_norm": 0.7259684801101685,
548
  "learning_rate": 3.628881406659185e-05,
549
+ "loss": 3.2187,
550
  "step": 6000
551
  },
552
  {
553
  "epoch": 3.3670033670033668,
554
+ "eval_accuracy": 0.40877263774552514,
555
+ "eval_bleu": 0.14006482323692013,
556
+ "eval_loss": 3.278057813644409,
557
+ "eval_perplexity": 26.524207687387797,
558
+ "eval_runtime": 11.0977,
559
+ "eval_samples_per_second": 102.814,
560
+ "eval_steps_per_second": 1.622,
561
  "step": 6000
562
  },
563
  {
564
  "epoch": 3.4231200897867566,
565
+ "grad_norm": 0.7419958710670471,
566
  "learning_rate": 3.504177578251652e-05,
567
+ "loss": 3.1992,
568
  "step": 6100
569
  },
570
  {
571
  "epoch": 3.479236812570146,
572
+ "grad_norm": 0.7455360889434814,
573
  "learning_rate": 3.3794737498441205e-05,
574
+ "loss": 3.2099,
575
  "step": 6200
576
  },
577
  {
578
  "epoch": 3.5353535353535355,
579
+ "grad_norm": 0.7269027829170227,
580
  "learning_rate": 3.254769921436588e-05,
581
+ "loss": 3.1998,
582
  "step": 6300
583
  },
584
  {
585
  "epoch": 3.591470258136925,
586
+ "grad_norm": 0.7311801910400391,
587
  "learning_rate": 3.130066093029056e-05,
588
+ "loss": 3.2011,
589
  "step": 6400
590
  },
591
  {
592
  "epoch": 3.6475869809203143,
593
+ "grad_norm": 0.7334641218185425,
594
  "learning_rate": 3.0053622646215242e-05,
595
+ "loss": 3.1975,
596
  "step": 6500
597
  },
598
  {
599
  "epoch": 3.6475869809203143,
600
+ "eval_accuracy": 0.41177629679509753,
601
+ "eval_bleu": 0.14331769757778526,
602
+ "eval_loss": 3.2494168281555176,
603
+ "eval_perplexity": 25.775304101751964,
604
+ "eval_runtime": 11.123,
605
+ "eval_samples_per_second": 102.581,
606
+ "eval_steps_per_second": 1.618,
607
  "step": 6500
608
  },
609
  {
610
  "epoch": 3.7037037037037037,
611
+ "grad_norm": 0.7255159616470337,
612
  "learning_rate": 2.880658436213992e-05,
613
+ "loss": 3.1898,
614
  "step": 6600
615
  },
616
  {
617
  "epoch": 3.759820426487093,
618
+ "grad_norm": 0.730241060256958,
619
  "learning_rate": 2.7559546078064598e-05,
620
+ "loss": 3.1806,
621
  "step": 6700
622
  },
623
  {
624
  "epoch": 3.8159371492704826,
625
+ "grad_norm": 0.7587730884552002,
626
  "learning_rate": 2.6312507793989276e-05,
627
+ "loss": 3.1713,
628
  "step": 6800
629
  },
630
  {
631
  "epoch": 3.872053872053872,
632
+ "grad_norm": 0.7403008937835693,
633
  "learning_rate": 2.5065469509913957e-05,
634
+ "loss": 3.1627,
635
  "step": 6900
636
  },
637
  {
638
  "epoch": 3.9281705948372614,
639
+ "grad_norm": 0.7402880191802979,
640
  "learning_rate": 2.3818431225838632e-05,
641
+ "loss": 3.172,
642
  "step": 7000
643
  },
644
  {
645
  "epoch": 3.9281705948372614,
646
+ "eval_accuracy": 0.41417168490194417,
647
+ "eval_bleu": 0.14449203518513976,
648
+ "eval_loss": 3.227550506591797,
649
+ "eval_perplexity": 25.21781039516413,
650
+ "eval_runtime": 11.0586,
651
+ "eval_samples_per_second": 103.178,
652
+ "eval_steps_per_second": 1.628,
653
  "step": 7000
654
  },
655
  {
656
  "epoch": 3.984287317620651,
657
+ "grad_norm": 0.7368999123573303,
658
  "learning_rate": 2.2571392941763313e-05,
659
+ "loss": 3.1682,
660
  "step": 7100
661
  },
662
  {
663
  "epoch": 4.040404040404041,
664
+ "grad_norm": 0.757147490978241,
665
  "learning_rate": 2.132435465768799e-05,
666
+ "loss": 3.1035,
667
  "step": 7200
668
  },
669
  {
670
  "epoch": 4.09652076318743,
671
+ "grad_norm": 0.7439301609992981,
672
  "learning_rate": 2.007731637361267e-05,
673
+ "loss": 3.0995,
674
  "step": 7300
675
  },
676
  {
677
  "epoch": 4.1526374859708195,
678
+ "grad_norm": 0.7797232866287231,
679
  "learning_rate": 1.883027808953735e-05,
680
+ "loss": 3.1013,
681
  "step": 7400
682
  },
683
  {
684
  "epoch": 4.2087542087542085,
685
+ "grad_norm": 0.7685579657554626,
686
  "learning_rate": 1.758323980546203e-05,
687
+ "loss": 3.1055,
688
  "step": 7500
689
  },
690
  {
691
  "epoch": 4.2087542087542085,
692
+ "eval_accuracy": 0.41630834367822295,
693
+ "eval_bleu": 0.1447371014434901,
694
+ "eval_loss": 3.2109010219573975,
695
+ "eval_perplexity": 24.801422786715616,
696
+ "eval_runtime": 11.144,
697
+ "eval_samples_per_second": 102.387,
698
+ "eval_steps_per_second": 1.615,
699
  "step": 7500
700
  },
701
  {
702
  "epoch": 4.264870931537598,
703
+ "grad_norm": 0.7712327241897583,
704
  "learning_rate": 1.6336201521386706e-05,
705
  "loss": 3.0911,
706
  "step": 7600
707
  },
708
  {
709
  "epoch": 4.320987654320987,
710
+ "grad_norm": 0.7560853958129883,
711
  "learning_rate": 1.5089163237311384e-05,
712
+ "loss": 3.0781,
713
  "step": 7700
714
  },
715
  {
716
  "epoch": 4.377104377104377,
717
+ "grad_norm": 0.7597346901893616,
718
  "learning_rate": 1.3842124953236066e-05,
719
+ "loss": 3.0883,
720
  "step": 7800
721
  },
722
  {
723
  "epoch": 4.433221099887767,
724
+ "grad_norm": 0.7666236162185669,
725
  "learning_rate": 1.2595086669160744e-05,
726
+ "loss": 3.0862,
727
  "step": 7900
728
  },
729
  {
730
  "epoch": 4.489337822671156,
731
+ "grad_norm": 0.7614879608154297,
732
  "learning_rate": 1.1348048385085423e-05,
733
+ "loss": 3.0676,
734
  "step": 8000
735
  },
736
  {
737
  "epoch": 4.489337822671156,
738
+ "eval_accuracy": 0.4178333046332255,
739
+ "eval_bleu": 0.14527019621911616,
740
+ "eval_loss": 3.197705030441284,
741
+ "eval_perplexity": 24.476293342725903,
742
+ "eval_runtime": 11.0951,
743
+ "eval_samples_per_second": 102.838,
744
+ "eval_steps_per_second": 1.622,
745
  "step": 8000
746
  },
747
  {
748
  "epoch": 4.545454545454545,
749
+ "grad_norm": 0.7722211480140686,
750
  "learning_rate": 1.0101010101010101e-05,
751
+ "loss": 3.0831,
752
  "step": 8100
753
  },
754
  {
755
  "epoch": 4.601571268237935,
756
+ "grad_norm": 0.7750692963600159,
757
  "learning_rate": 8.853971816934781e-06,
758
+ "loss": 3.0761,
759
  "step": 8200
760
  },
761
  {
762
  "epoch": 4.657687991021325,
763
+ "grad_norm": 0.7448268532752991,
764
  "learning_rate": 7.606933532859459e-06,
765
+ "loss": 3.0831,
766
  "step": 8300
767
  },
768
  {
769
  "epoch": 4.713804713804714,
770
+ "grad_norm": 0.7668105959892273,
771
  "learning_rate": 6.359895248784138e-06,
772
+ "loss": 3.0747,
773
  "step": 8400
774
  },
775
  {
776
  "epoch": 4.7699214365881035,
777
+ "grad_norm": 0.768974244594574,
778
  "learning_rate": 5.112856964708817e-06,
779
+ "loss": 3.0779,
780
  "step": 8500
781
  },
782
  {
783
  "epoch": 4.7699214365881035,
784
+ "eval_accuracy": 0.41932913712054815,
785
+ "eval_bleu": 0.1439933070551526,
786
+ "eval_loss": 3.1860642433166504,
787
+ "eval_perplexity": 24.19302197197161,
788
+ "eval_runtime": 11.1043,
789
+ "eval_samples_per_second": 102.753,
790
+ "eval_steps_per_second": 1.621,
791
  "step": 8500
792
  },
793
  {
794
  "epoch": 4.8260381593714925,
795
+ "grad_norm": 0.7649372816085815,
796
  "learning_rate": 3.865818680633495e-06,
797
+ "loss": 3.0682,
798
  "step": 8600
799
  },
800
  {
801
  "epoch": 4.882154882154882,
802
+ "grad_norm": 0.7638269066810608,
803
  "learning_rate": 2.6187803965581742e-06,
804
+ "loss": 3.0788,
805
  "step": 8700
806
  },
807
  {
808
  "epoch": 4.938271604938271,
809
+ "grad_norm": 0.7597787380218506,
810
  "learning_rate": 1.3717421124828533e-06,
811
+ "loss": 3.0747,
812
  "step": 8800
813
  },
814
  {
815
  "epoch": 4.994388327721661,
816
+ "grad_norm": 0.7531468272209167,
817
  "learning_rate": 1.2470382840753213e-07,
818
+ "loss": 3.0651,
819
  "step": 8900
820
  },
821
  {
822
  "epoch": 5.0,
823
  "step": 8910,
824
  "total_flos": 1.0586630697202483e+18,
825
+ "train_loss": 3.7704881070840237,
826
+ "train_runtime": 13409.8958,
827
+ "train_samples_per_second": 42.504,
828
+ "train_steps_per_second": 0.664
829
  }
830
  ],
831
  "logging_steps": 100,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3d7d0589deb26966fccca9777312b972822cbcf2baece33a62c79e8ceaa6615
3
  size 5560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25a6ddb6db8b6570d677716f3e185e04b016980664f78710bc5a660dee677933
3
  size 5560