ellabettison commited on
Commit
04ef989
·
verified ·
1 Parent(s): 99730dd

Training in progress, epoch 2

Browse files
all_results.json CHANGED
@@ -5,9 +5,9 @@
5
  "eval_runtime": 3.9702,
6
  "eval_samples_per_second": 73.548,
7
  "eval_steps_per_second": 9.319,
8
- "total_flos": 3.431561927399424e+17,
9
- "train_loss": 0.02884543471570526,
10
- "train_runtime": 336.86,
11
- "train_samples_per_second": 13.121,
12
- "train_steps_per_second": 0.831
13
  }
 
5
  "eval_runtime": 3.9702,
6
  "eval_samples_per_second": 73.548,
7
  "eval_steps_per_second": 9.319,
8
+ "total_flos": 5.916629591779738e+17,
9
+ "train_loss": 0.020844636825805014,
10
+ "train_runtime": 401.6875,
11
+ "train_samples_per_second": 18.621,
12
+ "train_steps_per_second": 1.17
13
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d874bb4c4bd029cdb2a0ba42bf8b83f0f9b96e8d9b9e0b699d2b4ff8eea7b322
3
  size 350497036
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04baf4507a8e1db4fa09c84c382f7ffb623cdc85a6428e220660a802d73b633b
3
  size 350497036
runs/Jan15_20-00-25_a99b99d67614/events.out.tfevents.1736971226.a99b99d67614.505.23 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4136644bdda2b28ecc7c8056dd4a0c62f1045ced8fd4fc4f446d34b810c915ea
3
+ size 16665
runs/Jan15_20-03-54_a99b99d67614/events.out.tfevents.1736971435.a99b99d67614.505.24 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:07e4d97f320983e3d0ffbdc2a4bed50d41e082851563e5df7520a88e89cf137c
3
+ size 16693
runs/Jan15_20-04-21_a99b99d67614/events.out.tfevents.1736971462.a99b99d67614.505.25 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:104574891a791fd31d7546229a10d159f6261f828d19f2d9687d194ea2bf6129
3
+ size 16693
runs/Jan15_20-04-45_a99b99d67614/events.out.tfevents.1736971487.a99b99d67614.505.26 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6fa2a8e661341099c7d0c3e9e7ddaff51111c99cced0dedd6ac68ed019b1078
3
+ size 16665
runs/Jan15_20-06-01_a99b99d67614/events.out.tfevents.1736971563.a99b99d67614.505.27 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83f3cf9beb83507f16417ee9009bd49ed7af0b3dd56599fde0a4b45e961f6129
3
+ size 17521
runs/Jan15_20-09-35_a99b99d67614/events.out.tfevents.1736971776.a99b99d67614.505.28 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8b01367ff3620586805a4a4a2292429adf9396baa9a64913c564bf7875218a5
3
+ size 17569
runs/Jan15_20-11-17_a99b99d67614/events.out.tfevents.1736971878.a99b99d67614.505.29 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ea36559b72d0b9b6e09066ac12a0b159775173f187db37dba87bfbea5db232e
3
+ size 17569
runs/Jan15_20-12-32_a99b99d67614/events.out.tfevents.1736971953.a99b99d67614.505.30 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc4e6eb67a94ecee1c6c3de970aa120f2264f7be6bf7359830688274bbd8d47d
3
+ size 17569
runs/Jan15_20-13-32_a99b99d67614/events.out.tfevents.1736972014.a99b99d67614.505.31 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe679ed8609dd968498732b3a3b28ab9e6ec8a8eba64769fee4109ccc20b0fd4
3
+ size 17569
runs/Jan15_20-14-36_a99b99d67614/events.out.tfevents.1736972077.a99b99d67614.505.32 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:351ebe821b37bf1d768caf8f0c7678bc5d68d54ffe37ec935636c01eec50d914
3
+ size 19238
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "total_flos": 3.431561927399424e+17,
4
- "train_loss": 0.02884543471570526,
5
- "train_runtime": 336.86,
6
- "train_samples_per_second": 13.121,
7
- "train_steps_per_second": 0.831
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "total_flos": 5.916629591779738e+17,
4
+ "train_loss": 0.020844636825805014,
5
+ "train_runtime": 401.6875,
6
+ "train_samples_per_second": 18.621,
7
+ "train_steps_per_second": 1.17
8
  }
trainer_state.json CHANGED
@@ -1,311 +1,444 @@
1
  {
2
- "best_metric": 0.01529290433973074,
3
- "best_model_checkpoint": "./logo-matching-base/checkpoint-252",
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
- "global_step": 280,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.35714285714285715,
13
- "grad_norm": 0.28576332330703735,
14
- "learning_rate": 0.00019285714285714286,
15
- "loss": 0.3081,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.7142857142857143,
20
- "grad_norm": 0.11291749030351639,
21
- "learning_rate": 0.00018571428571428572,
22
- "loss": 0.0994,
23
  "step": 20
24
  },
25
  {
26
- "epoch": 1.0,
27
- "eval_accuracy": 0.5886287625418061,
28
- "eval_loss": 0.03771600499749184,
29
- "eval_runtime": 4.5177,
30
- "eval_samples_per_second": 66.184,
31
- "eval_steps_per_second": 8.411,
32
- "step": 28
33
- },
34
- {
35
- "epoch": 1.0714285714285714,
36
- "grad_norm": 0.057985421270132065,
37
- "learning_rate": 0.0001785714285714286,
38
- "loss": 0.0461,
39
  "step": 30
40
  },
41
  {
42
- "epoch": 1.4285714285714286,
43
- "grad_norm": 0.03649172931909561,
44
- "learning_rate": 0.00017142857142857143,
45
- "loss": 0.0282,
46
  "step": 40
47
  },
48
  {
49
- "epoch": 1.7857142857142856,
50
- "grad_norm": 0.03120347298681736,
51
- "learning_rate": 0.00016428571428571428,
52
- "loss": 0.0222,
 
 
 
 
 
 
 
 
 
53
  "step": 50
54
  },
55
  {
56
- "epoch": 2.0,
57
- "eval_accuracy": 0.5886287625418061,
58
- "eval_loss": 0.020119963213801384,
59
- "eval_runtime": 3.2248,
60
- "eval_samples_per_second": 92.718,
61
- "eval_steps_per_second": 11.784,
62
- "step": 56
63
- },
64
- {
65
- "epoch": 2.142857142857143,
66
- "grad_norm": 0.02560759335756302,
67
- "learning_rate": 0.00015714285714285716,
68
- "loss": 0.0174,
69
  "step": 60
70
  },
71
  {
72
- "epoch": 2.5,
73
- "grad_norm": 0.03040032461285591,
74
- "learning_rate": 0.00015000000000000001,
75
- "loss": 0.0178,
76
  "step": 70
77
  },
78
  {
79
- "epoch": 2.857142857142857,
80
- "grad_norm": 0.023641686886548996,
81
- "learning_rate": 0.00014285714285714287,
82
- "loss": 0.015,
83
  "step": 80
84
  },
85
  {
86
- "epoch": 3.0,
87
- "eval_accuracy": 0.5886287625418061,
88
- "eval_loss": 0.017308849841356277,
89
- "eval_runtime": 2.905,
90
- "eval_samples_per_second": 102.924,
91
- "eval_steps_per_second": 13.081,
92
- "step": 84
93
- },
94
- {
95
- "epoch": 3.2142857142857144,
96
- "grad_norm": 0.019254567101597786,
97
- "learning_rate": 0.00013571428571428572,
98
- "loss": 0.0145,
99
  "step": 90
100
  },
101
  {
102
- "epoch": 3.571428571428571,
103
- "grad_norm": 0.019579825922846794,
104
- "learning_rate": 0.00012857142857142858,
105
- "loss": 0.0139,
 
 
 
 
 
 
 
 
 
106
  "step": 100
107
  },
108
  {
109
- "epoch": 3.928571428571429,
110
- "grad_norm": 0.019746659323573112,
111
- "learning_rate": 0.00012142857142857143,
112
- "loss": 0.0148,
113
  "step": 110
114
  },
115
  {
116
- "epoch": 4.0,
117
- "eval_accuracy": 0.5886287625418061,
118
- "eval_loss": 0.016309447586536407,
119
- "eval_runtime": 4.4763,
120
- "eval_samples_per_second": 66.797,
121
- "eval_steps_per_second": 8.489,
122
- "step": 112
123
- },
124
- {
125
- "epoch": 4.285714285714286,
126
- "grad_norm": 0.02770456299185753,
127
- "learning_rate": 0.00011428571428571428,
128
- "loss": 0.0145,
129
  "step": 120
130
  },
131
  {
132
- "epoch": 4.642857142857143,
133
- "grad_norm": 0.019606683403253555,
134
- "learning_rate": 0.00010714285714285715,
135
- "loss": 0.013,
136
  "step": 130
137
  },
138
  {
139
- "epoch": 5.0,
140
- "grad_norm": 0.017767397686839104,
141
- "learning_rate": 0.0001,
142
- "loss": 0.0128,
143
  "step": 140
144
  },
145
  {
146
- "epoch": 5.0,
147
- "eval_accuracy": 0.5886287625418061,
148
- "eval_loss": 0.015752054750919342,
149
- "eval_runtime": 3.1756,
150
- "eval_samples_per_second": 94.156,
151
- "eval_steps_per_second": 11.966,
152
- "step": 140
153
- },
154
- {
155
- "epoch": 5.357142857142857,
156
- "grad_norm": 0.01708129048347473,
157
- "learning_rate": 9.285714285714286e-05,
158
- "loss": 0.0113,
159
  "step": 150
160
  },
161
  {
162
- "epoch": 5.714285714285714,
163
- "grad_norm": 0.01555562112480402,
164
- "learning_rate": 8.571428571428571e-05,
165
- "loss": 0.0148,
166
  "step": 160
167
  },
168
  {
169
- "epoch": 6.0,
170
- "eval_accuracy": 0.5886287625418061,
171
- "eval_loss": 0.01554945856332779,
172
- "eval_runtime": 2.7646,
173
- "eval_samples_per_second": 108.154,
174
- "eval_steps_per_second": 13.745,
175
- "step": 168
176
- },
177
- {
178
- "epoch": 6.071428571428571,
179
- "grad_norm": 0.026734082028269768,
180
- "learning_rate": 7.857142857142858e-05,
181
- "loss": 0.0129,
182
  "step": 170
183
  },
184
  {
185
- "epoch": 6.428571428571429,
186
- "grad_norm": 0.01421839464455843,
187
- "learning_rate": 7.142857142857143e-05,
188
- "loss": 0.0111,
189
  "step": 180
190
  },
191
  {
192
- "epoch": 6.785714285714286,
193
- "grad_norm": 0.011912204325199127,
194
- "learning_rate": 6.428571428571429e-05,
195
- "loss": 0.013,
 
 
 
 
 
 
 
 
 
196
  "step": 190
197
  },
198
  {
199
- "epoch": 7.0,
200
- "eval_accuracy": 0.5886287625418061,
201
- "eval_loss": 0.015377058647572994,
202
- "eval_runtime": 3.7025,
203
- "eval_samples_per_second": 80.757,
204
- "eval_steps_per_second": 10.263,
205
- "step": 196
206
- },
207
- {
208
- "epoch": 7.142857142857143,
209
- "grad_norm": 0.023527879267930984,
210
- "learning_rate": 5.714285714285714e-05,
211
- "loss": 0.0116,
212
  "step": 200
213
  },
214
  {
215
- "epoch": 7.5,
216
- "grad_norm": 0.017122577875852585,
217
- "learning_rate": 5e-05,
218
- "loss": 0.0113,
219
  "step": 210
220
  },
221
  {
222
- "epoch": 7.857142857142857,
223
- "grad_norm": 0.018162399530410767,
224
- "learning_rate": 4.2857142857142856e-05,
225
- "loss": 0.0131,
226
  "step": 220
227
  },
228
  {
229
- "epoch": 8.0,
230
- "eval_accuracy": 0.5886287625418061,
231
- "eval_loss": 0.015300475060939789,
232
- "eval_runtime": 2.8162,
233
- "eval_samples_per_second": 106.173,
234
- "eval_steps_per_second": 13.494,
235
- "step": 224
236
- },
237
- {
238
- "epoch": 8.214285714285714,
239
- "grad_norm": 0.023050658404827118,
240
- "learning_rate": 3.571428571428572e-05,
241
- "loss": 0.0124,
242
  "step": 230
243
  },
244
  {
245
- "epoch": 8.571428571428571,
246
- "grad_norm": 0.01859254017472267,
247
- "learning_rate": 2.857142857142857e-05,
 
 
 
 
 
 
 
 
 
248
  "loss": 0.0118,
249
  "step": 240
250
  },
251
  {
252
- "epoch": 8.928571428571429,
253
- "grad_norm": 0.020781200379133224,
254
- "learning_rate": 2.1428571428571428e-05,
255
- "loss": 0.0111,
256
  "step": 250
257
  },
258
  {
259
- "epoch": 9.0,
260
- "eval_accuracy": 0.5886287625418061,
261
- "eval_loss": 0.01529290433973074,
262
- "eval_runtime": 2.8264,
263
- "eval_samples_per_second": 105.789,
264
- "eval_steps_per_second": 13.445,
265
- "step": 252
266
- },
267
- {
268
- "epoch": 9.285714285714286,
269
- "grad_norm": 0.02203691191971302,
270
- "learning_rate": 1.4285714285714285e-05,
271
- "loss": 0.0126,
272
  "step": 260
273
  },
274
  {
275
- "epoch": 9.642857142857142,
276
- "grad_norm": 0.016590403392910957,
277
- "learning_rate": 7.142857142857143e-06,
278
- "loss": 0.0104,
279
  "step": 270
280
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  {
282
  "epoch": 10.0,
283
- "grad_norm": 0.025512348860502243,
284
  "learning_rate": 0.0,
285
- "loss": 0.0127,
286
- "step": 280
287
  },
288
  {
289
  "epoch": 10.0,
290
- "eval_accuracy": 0.5886287625418061,
291
- "eval_loss": 0.015296942554414272,
292
- "eval_runtime": 4.6262,
293
- "eval_samples_per_second": 64.633,
294
- "eval_steps_per_second": 8.214,
295
- "step": 280
296
  },
297
  {
298
  "epoch": 10.0,
299
- "step": 280,
300
- "total_flos": 3.431561927399424e+17,
301
- "train_loss": 0.02884543471570526,
302
- "train_runtime": 336.86,
303
- "train_samples_per_second": 13.121,
304
- "train_steps_per_second": 0.831
305
  }
306
  ],
307
  "logging_steps": 10,
308
- "max_steps": 280,
309
  "num_input_tokens_seen": 0,
310
  "num_train_epochs": 10,
311
  "save_steps": 500,
@@ -321,7 +454,7 @@
321
  "attributes": {}
322
  }
323
  },
324
- "total_flos": 3.431561927399424e+17,
325
  "train_batch_size": 16,
326
  "trial_name": null,
327
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.03154107183218002,
3
+ "best_model_checkpoint": "./logo-matching-base/checkpoint-141",
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 470,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.2127659574468085,
13
+ "grad_norm": 0.20348502695560455,
14
+ "learning_rate": 0.00019574468085106384,
15
+ "loss": 0.2136,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.425531914893617,
20
+ "grad_norm": 0.20952267944812775,
21
+ "learning_rate": 0.00019148936170212768,
22
+ "loss": 0.0284,
23
  "step": 20
24
  },
25
  {
26
+ "epoch": 0.6382978723404256,
27
+ "grad_norm": 0.21442832052707672,
28
+ "learning_rate": 0.0001872340425531915,
29
+ "loss": 0.183,
 
 
 
 
 
 
 
 
 
30
  "step": 30
31
  },
32
  {
33
+ "epoch": 0.851063829787234,
34
+ "grad_norm": 0.38694852590560913,
35
+ "learning_rate": 0.00018297872340425532,
36
+ "loss": 0.0225,
37
  "step": 40
38
  },
39
  {
40
+ "epoch": 1.0,
41
+ "eval_accuracy": 0.3229357798165138,
42
+ "eval_loss": 0.032035112380981445,
43
+ "eval_runtime": 6.0654,
44
+ "eval_samples_per_second": 89.854,
45
+ "eval_steps_per_second": 11.376,
46
+ "step": 47
47
+ },
48
+ {
49
+ "epoch": 1.0638297872340425,
50
+ "grad_norm": 0.11812812834978104,
51
+ "learning_rate": 0.00017872340425531915,
52
+ "loss": 0.02,
53
  "step": 50
54
  },
55
  {
56
+ "epoch": 1.2765957446808511,
57
+ "grad_norm": 0.1575324535369873,
58
+ "learning_rate": 0.00017446808510638298,
59
+ "loss": 0.0213,
 
 
 
 
 
 
 
 
 
60
  "step": 60
61
  },
62
  {
63
+ "epoch": 1.4893617021276595,
64
+ "grad_norm": 0.12748625874519348,
65
+ "learning_rate": 0.00017021276595744682,
66
+ "loss": 0.0207,
67
  "step": 70
68
  },
69
  {
70
+ "epoch": 1.702127659574468,
71
+ "grad_norm": 0.11703016608953476,
72
+ "learning_rate": 0.00016595744680851065,
73
+ "loss": 0.0178,
74
  "step": 80
75
  },
76
  {
77
+ "epoch": 1.9148936170212765,
78
+ "grad_norm": 0.09279447048902512,
79
+ "learning_rate": 0.00016170212765957446,
80
+ "loss": 0.0184,
 
 
 
 
 
 
 
 
 
81
  "step": 90
82
  },
83
  {
84
+ "epoch": 2.0,
85
+ "eval_accuracy": 0.27706422018348625,
86
+ "eval_loss": 0.03198189660906792,
87
+ "eval_runtime": 4.7267,
88
+ "eval_samples_per_second": 115.302,
89
+ "eval_steps_per_second": 14.598,
90
+ "step": 94
91
+ },
92
+ {
93
+ "epoch": 2.127659574468085,
94
+ "grad_norm": 0.09347425401210785,
95
+ "learning_rate": 0.00015744680851063832,
96
+ "loss": 0.0161,
97
  "step": 100
98
  },
99
  {
100
+ "epoch": 2.3404255319148937,
101
+ "grad_norm": 0.09166835993528366,
102
+ "learning_rate": 0.00015319148936170213,
103
+ "loss": 0.0169,
104
  "step": 110
105
  },
106
  {
107
+ "epoch": 2.5531914893617023,
108
+ "grad_norm": 0.12563753128051758,
109
+ "learning_rate": 0.00014893617021276596,
110
+ "loss": 0.0181,
 
 
 
 
 
 
 
 
 
111
  "step": 120
112
  },
113
  {
114
+ "epoch": 2.7659574468085104,
115
+ "grad_norm": 0.10614734888076782,
116
+ "learning_rate": 0.0001446808510638298,
117
+ "loss": 0.0166,
118
  "step": 130
119
  },
120
  {
121
+ "epoch": 2.978723404255319,
122
+ "grad_norm": 0.10630539804697037,
123
+ "learning_rate": 0.00014042553191489363,
124
+ "loss": 0.0177,
125
  "step": 140
126
  },
127
  {
128
+ "epoch": 3.0,
129
+ "eval_accuracy": 0.26788990825688075,
130
+ "eval_loss": 0.03154107183218002,
131
+ "eval_runtime": 6.1724,
132
+ "eval_samples_per_second": 88.296,
133
+ "eval_steps_per_second": 11.179,
134
+ "step": 141
135
+ },
136
+ {
137
+ "epoch": 3.1914893617021276,
138
+ "grad_norm": 0.08008704334497452,
139
+ "learning_rate": 0.00013617021276595746,
140
+ "loss": 0.0175,
141
  "step": 150
142
  },
143
  {
144
+ "epoch": 3.404255319148936,
145
+ "grad_norm": 0.1296830177307129,
146
+ "learning_rate": 0.00013191489361702127,
147
+ "loss": 0.0174,
148
  "step": 160
149
  },
150
  {
151
+ "epoch": 3.617021276595745,
152
+ "grad_norm": 0.11523136496543884,
153
+ "learning_rate": 0.00012765957446808513,
154
+ "loss": 0.014,
 
 
 
 
 
 
 
 
 
155
  "step": 170
156
  },
157
  {
158
+ "epoch": 3.829787234042553,
159
+ "grad_norm": 0.10043615847826004,
160
+ "learning_rate": 0.00012340425531914893,
161
+ "loss": 0.0144,
162
  "step": 180
163
  },
164
  {
165
+ "epoch": 4.0,
166
+ "eval_accuracy": 0.25321100917431194,
167
+ "eval_loss": 0.03288332372903824,
168
+ "eval_runtime": 5.7194,
169
+ "eval_samples_per_second": 95.289,
170
+ "eval_steps_per_second": 12.064,
171
+ "step": 188
172
+ },
173
+ {
174
+ "epoch": 4.042553191489362,
175
+ "grad_norm": 0.1341152787208557,
176
+ "learning_rate": 0.00011914893617021277,
177
+ "loss": 0.0143,
178
  "step": 190
179
  },
180
  {
181
+ "epoch": 4.25531914893617,
182
+ "grad_norm": 0.11163297295570374,
183
+ "learning_rate": 0.00011489361702127661,
184
+ "loss": 0.0147,
 
 
 
 
 
 
 
 
 
185
  "step": 200
186
  },
187
  {
188
+ "epoch": 4.468085106382979,
189
+ "grad_norm": 0.14031550288200378,
190
+ "learning_rate": 0.00011063829787234043,
191
+ "loss": 0.0129,
192
  "step": 210
193
  },
194
  {
195
+ "epoch": 4.680851063829787,
196
+ "grad_norm": 0.11095025390386581,
197
+ "learning_rate": 0.00010638297872340425,
198
+ "loss": 0.0159,
199
  "step": 220
200
  },
201
  {
202
+ "epoch": 4.8936170212765955,
203
+ "grad_norm": 0.09944932162761688,
204
+ "learning_rate": 0.00010212765957446809,
205
+ "loss": 0.0133,
 
 
 
 
 
 
 
 
 
206
  "step": 230
207
  },
208
  {
209
+ "epoch": 5.0,
210
+ "eval_accuracy": 0.30825688073394497,
211
+ "eval_loss": 0.03211478143930435,
212
+ "eval_runtime": 4.8718,
213
+ "eval_samples_per_second": 111.869,
214
+ "eval_steps_per_second": 14.163,
215
+ "step": 235
216
+ },
217
+ {
218
+ "epoch": 5.1063829787234045,
219
+ "grad_norm": 0.07057506591081619,
220
+ "learning_rate": 9.787234042553192e-05,
221
  "loss": 0.0118,
222
  "step": 240
223
  },
224
  {
225
+ "epoch": 5.319148936170213,
226
+ "grad_norm": 0.11148671805858612,
227
+ "learning_rate": 9.361702127659576e-05,
228
+ "loss": 0.0123,
229
  "step": 250
230
  },
231
  {
232
+ "epoch": 5.531914893617021,
233
+ "grad_norm": 0.1193113625049591,
234
+ "learning_rate": 8.936170212765958e-05,
235
+ "loss": 0.0128,
 
 
 
 
 
 
 
 
 
236
  "step": 260
237
  },
238
  {
239
+ "epoch": 5.74468085106383,
240
+ "grad_norm": 0.15620863437652588,
241
+ "learning_rate": 8.510638297872341e-05,
242
+ "loss": 0.013,
243
  "step": 270
244
  },
245
+ {
246
+ "epoch": 5.957446808510638,
247
+ "grad_norm": 0.09126376360654831,
248
+ "learning_rate": 8.085106382978723e-05,
249
+ "loss": 0.0108,
250
+ "step": 280
251
+ },
252
+ {
253
+ "epoch": 6.0,
254
+ "eval_accuracy": 0.3137614678899083,
255
+ "eval_loss": 0.03295579180121422,
256
+ "eval_runtime": 4.8988,
257
+ "eval_samples_per_second": 111.251,
258
+ "eval_steps_per_second": 14.085,
259
+ "step": 282
260
+ },
261
+ {
262
+ "epoch": 6.170212765957447,
263
+ "grad_norm": 0.11392467468976974,
264
+ "learning_rate": 7.659574468085106e-05,
265
+ "loss": 0.0115,
266
+ "step": 290
267
+ },
268
+ {
269
+ "epoch": 6.382978723404255,
270
+ "grad_norm": 0.07867500931024551,
271
+ "learning_rate": 7.23404255319149e-05,
272
+ "loss": 0.0105,
273
+ "step": 300
274
+ },
275
+ {
276
+ "epoch": 6.595744680851064,
277
+ "grad_norm": 0.07005509734153748,
278
+ "learning_rate": 6.808510638297873e-05,
279
+ "loss": 0.0099,
280
+ "step": 310
281
+ },
282
+ {
283
+ "epoch": 6.808510638297872,
284
+ "grad_norm": 0.10206523537635803,
285
+ "learning_rate": 6.382978723404256e-05,
286
+ "loss": 0.0097,
287
+ "step": 320
288
+ },
289
+ {
290
+ "epoch": 7.0,
291
+ "eval_accuracy": 0.25688073394495414,
292
+ "eval_loss": 0.034237515181303024,
293
+ "eval_runtime": 5.8682,
294
+ "eval_samples_per_second": 92.873,
295
+ "eval_steps_per_second": 11.758,
296
+ "step": 329
297
+ },
298
+ {
299
+ "epoch": 7.0212765957446805,
300
+ "grad_norm": 0.15130436420440674,
301
+ "learning_rate": 5.9574468085106384e-05,
302
+ "loss": 0.0106,
303
+ "step": 330
304
+ },
305
+ {
306
+ "epoch": 7.23404255319149,
307
+ "grad_norm": 0.07350896298885345,
308
+ "learning_rate": 5.531914893617022e-05,
309
+ "loss": 0.0086,
310
+ "step": 340
311
+ },
312
+ {
313
+ "epoch": 7.446808510638298,
314
+ "grad_norm": 0.10417384654283524,
315
+ "learning_rate": 5.1063829787234044e-05,
316
+ "loss": 0.0108,
317
+ "step": 350
318
+ },
319
+ {
320
+ "epoch": 7.659574468085106,
321
+ "grad_norm": 0.12092699855566025,
322
+ "learning_rate": 4.680851063829788e-05,
323
+ "loss": 0.0079,
324
+ "step": 360
325
+ },
326
+ {
327
+ "epoch": 7.872340425531915,
328
+ "grad_norm": 0.11693856120109558,
329
+ "learning_rate": 4.2553191489361704e-05,
330
+ "loss": 0.0086,
331
+ "step": 370
332
+ },
333
+ {
334
+ "epoch": 8.0,
335
+ "eval_accuracy": 0.3192660550458716,
336
+ "eval_loss": 0.03412623330950737,
337
+ "eval_runtime": 6.8764,
338
+ "eval_samples_per_second": 79.257,
339
+ "eval_steps_per_second": 10.034,
340
+ "step": 376
341
+ },
342
+ {
343
+ "epoch": 8.085106382978724,
344
+ "grad_norm": 0.07324172556400299,
345
+ "learning_rate": 3.829787234042553e-05,
346
+ "loss": 0.0076,
347
+ "step": 380
348
+ },
349
+ {
350
+ "epoch": 8.297872340425531,
351
+ "grad_norm": 0.1128627359867096,
352
+ "learning_rate": 3.4042553191489365e-05,
353
+ "loss": 0.0074,
354
+ "step": 390
355
+ },
356
+ {
357
+ "epoch": 8.51063829787234,
358
+ "grad_norm": 0.10660151392221451,
359
+ "learning_rate": 2.9787234042553192e-05,
360
+ "loss": 0.0067,
361
+ "step": 400
362
+ },
363
+ {
364
+ "epoch": 8.72340425531915,
365
+ "grad_norm": 0.0888807401061058,
366
+ "learning_rate": 2.5531914893617022e-05,
367
+ "loss": 0.0076,
368
+ "step": 410
369
+ },
370
+ {
371
+ "epoch": 8.936170212765958,
372
+ "grad_norm": 0.07239257544279099,
373
+ "learning_rate": 2.1276595744680852e-05,
374
+ "loss": 0.0068,
375
+ "step": 420
376
+ },
377
+ {
378
+ "epoch": 9.0,
379
+ "eval_accuracy": 0.3155963302752294,
380
+ "eval_loss": 0.03542930632829666,
381
+ "eval_runtime": 5.9234,
382
+ "eval_samples_per_second": 92.008,
383
+ "eval_steps_per_second": 11.649,
384
+ "step": 423
385
+ },
386
+ {
387
+ "epoch": 9.148936170212766,
388
+ "grad_norm": 0.08497753739356995,
389
+ "learning_rate": 1.7021276595744682e-05,
390
+ "loss": 0.0067,
391
+ "step": 430
392
+ },
393
+ {
394
+ "epoch": 9.361702127659575,
395
+ "grad_norm": 0.06389721482992172,
396
+ "learning_rate": 1.2765957446808511e-05,
397
+ "loss": 0.0062,
398
+ "step": 440
399
+ },
400
+ {
401
+ "epoch": 9.574468085106384,
402
+ "grad_norm": 0.06799926608800888,
403
+ "learning_rate": 8.510638297872341e-06,
404
+ "loss": 0.0054,
405
+ "step": 450
406
+ },
407
+ {
408
+ "epoch": 9.787234042553191,
409
+ "grad_norm": 0.08153792470693588,
410
+ "learning_rate": 4.255319148936171e-06,
411
+ "loss": 0.0053,
412
+ "step": 460
413
+ },
414
  {
415
  "epoch": 10.0,
416
+ "grad_norm": 0.1752246469259262,
417
  "learning_rate": 0.0,
418
+ "loss": 0.0056,
419
+ "step": 470
420
  },
421
  {
422
  "epoch": 10.0,
423
+ "eval_accuracy": 0.3339449541284404,
424
+ "eval_loss": 0.03554193675518036,
425
+ "eval_runtime": 6.6038,
426
+ "eval_samples_per_second": 82.529,
427
+ "eval_steps_per_second": 10.449,
428
+ "step": 470
429
  },
430
  {
431
  "epoch": 10.0,
432
+ "step": 470,
433
+ "total_flos": 5.916629591779738e+17,
434
+ "train_loss": 0.020844636825805014,
435
+ "train_runtime": 401.6875,
436
+ "train_samples_per_second": 18.621,
437
+ "train_steps_per_second": 1.17
438
  }
439
  ],
440
  "logging_steps": 10,
441
+ "max_steps": 470,
442
  "num_input_tokens_seen": 0,
443
  "num_train_epochs": 10,
444
  "save_steps": 500,
 
454
  "attributes": {}
455
  }
456
  },
457
+ "total_flos": 5.916629591779738e+17,
458
  "train_batch_size": 16,
459
  "trial_name": null,
460
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71319c6b4a4532c672c778ea1f888794e3f01a4eacd84321c42d8d203f79a3a2
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a54f12faa92c2188517ce8459d7a38b184f5075cca10aea02a378d12b2444aa4
3
  size 5368