ngwgsang commited on
Commit
7298db5
·
verified ·
1 Parent(s): 1e9d124

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73bbaf13257e8852680c22d11642ea0013f2247fef35d8272beebba796d36512
3
  size 442668636
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:737bff9e53771f919fd8e65ceb4d4c2d6f17ab4d48951f73831dec9db5e974f5
3
  size 442668636
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf26cb804bb88b36cd19298767e57275ff1af51ef66a60b94073c0dcb74bb3c9
3
  size 885457146
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24c15aed075594926bdfb8b0a3fd65422193e443f12843c149dfd9d5c185c0f0
3
  size 885457146
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:049c6f40a328629846cab1b27e3807d44ea469304a69ff0f3d676cc813cde6b3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2477d3715e68f2549c9ecd6a18f4a17a0bfb0a625f50ce4fafa0aa2652affb1c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7668389514d10a3d53f140c85ff46df71dcd9dc34fbc1ed6530f2d1a175df2a0
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00b7251b468e4d3cb44eba0757f056754012975e532dc253bb53666972923e5b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,312 +1,87 @@
1
  {
2
- "best_metric": 5.853533426920573,
3
- "best_model_checkpoint": "./results/checkpoint-3664",
4
- "epoch": 4.0,
5
  "eval_steps": 500,
6
- "global_step": 3664,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.1091703056768559,
13
- "grad_norm": 23.31266975402832,
14
  "learning_rate": 2.959061135371179e-05,
15
- "loss": 57.4375,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.2183406113537118,
20
- "grad_norm": 36.15751266479492,
21
  "learning_rate": 2.918122270742358e-05,
22
- "loss": 48.4816,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.32751091703056767,
27
- "grad_norm": 44.80302047729492,
28
  "learning_rate": 2.877183406113537e-05,
29
- "loss": 37.1763,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.4366812227074236,
34
- "grad_norm": 29.177419662475586,
35
  "learning_rate": 2.8362445414847164e-05,
36
- "loss": 25.613,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.5458515283842795,
41
- "grad_norm": 42.395999908447266,
42
  "learning_rate": 2.7953056768558954e-05,
43
- "loss": 16.5535,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.6550218340611353,
48
- "grad_norm": 41.26911926269531,
49
  "learning_rate": 2.7543668122270742e-05,
50
- "loss": 12.1195,
51
  "step": 600
52
  },
53
  {
54
  "epoch": 0.7641921397379913,
55
- "grad_norm": 85.68878173828125,
56
  "learning_rate": 2.7134279475982533e-05,
57
- "loss": 9.5989,
58
  "step": 700
59
  },
60
  {
61
  "epoch": 0.8733624454148472,
62
- "grad_norm": 29.508316040039062,
63
  "learning_rate": 2.6724890829694323e-05,
64
- "loss": 8.008,
65
  "step": 800
66
  },
67
  {
68
  "epoch": 0.982532751091703,
69
- "grad_norm": 47.07627868652344,
70
  "learning_rate": 2.6315502183406114e-05,
71
- "loss": 7.4156,
72
  "step": 900
73
  },
74
  {
75
  "epoch": 1.0,
76
- "eval_avg_mae": 7.66294542948405,
77
- "eval_loss": 7.66294527053833,
78
- "eval_mae_lex": 7.354457378387451,
79
- "eval_mae_sem": 5.690549373626709,
80
- "eval_mae_syn": 9.943828582763672,
81
- "eval_runtime": 26.9061,
82
- "eval_samples_per_second": 272.318,
83
- "eval_steps_per_second": 8.511,
84
  "step": 916
85
- },
86
- {
87
- "epoch": 1.091703056768559,
88
- "grad_norm": 34.63822555541992,
89
- "learning_rate": 2.5906113537117905e-05,
90
- "loss": 6.9324,
91
- "step": 1000
92
- },
93
- {
94
- "epoch": 1.2008733624454149,
95
- "grad_norm": 25.136709213256836,
96
- "learning_rate": 2.5496724890829696e-05,
97
- "loss": 6.6809,
98
- "step": 1100
99
- },
100
- {
101
- "epoch": 1.3100436681222707,
102
- "grad_norm": 29.977298736572266,
103
- "learning_rate": 2.5087336244541486e-05,
104
- "loss": 6.6569,
105
- "step": 1200
106
- },
107
- {
108
- "epoch": 1.4192139737991267,
109
- "grad_norm": 50.923553466796875,
110
- "learning_rate": 2.4677947598253277e-05,
111
- "loss": 6.5877,
112
- "step": 1300
113
- },
114
- {
115
- "epoch": 1.5283842794759825,
116
- "grad_norm": 24.49920654296875,
117
- "learning_rate": 2.4268558951965064e-05,
118
- "loss": 6.5709,
119
- "step": 1400
120
- },
121
- {
122
- "epoch": 1.6375545851528384,
123
- "grad_norm": 36.14987564086914,
124
- "learning_rate": 2.3859170305676855e-05,
125
- "loss": 6.4067,
126
- "step": 1500
127
- },
128
- {
129
- "epoch": 1.7467248908296944,
130
- "grad_norm": 22.3398380279541,
131
- "learning_rate": 2.344978165938865e-05,
132
- "loss": 6.3692,
133
- "step": 1600
134
- },
135
- {
136
- "epoch": 1.8558951965065502,
137
- "grad_norm": 23.658458709716797,
138
- "learning_rate": 2.3040393013100437e-05,
139
- "loss": 6.3785,
140
- "step": 1700
141
- },
142
- {
143
- "epoch": 1.965065502183406,
144
- "grad_norm": 31.021987915039062,
145
- "learning_rate": 2.2631004366812227e-05,
146
- "loss": 6.2296,
147
- "step": 1800
148
- },
149
- {
150
- "epoch": 2.0,
151
- "eval_avg_mae": 6.929315567016602,
152
- "eval_loss": 6.92931604385376,
153
- "eval_mae_lex": 6.660251617431641,
154
- "eval_mae_sem": 4.739748001098633,
155
- "eval_mae_syn": 9.387948036193848,
156
- "eval_runtime": 26.931,
157
- "eval_samples_per_second": 272.065,
158
- "eval_steps_per_second": 8.503,
159
- "step": 1832
160
- },
161
- {
162
- "epoch": 2.074235807860262,
163
- "grad_norm": 40.95843505859375,
164
- "learning_rate": 2.2221615720524018e-05,
165
- "loss": 6.1465,
166
- "step": 1900
167
- },
168
- {
169
- "epoch": 2.183406113537118,
170
- "grad_norm": 26.046171188354492,
171
- "learning_rate": 2.181222707423581e-05,
172
- "loss": 5.9925,
173
- "step": 2000
174
- },
175
- {
176
- "epoch": 2.2925764192139737,
177
- "grad_norm": 36.05866622924805,
178
- "learning_rate": 2.1402838427947596e-05,
179
- "loss": 5.8884,
180
- "step": 2100
181
- },
182
- {
183
- "epoch": 2.4017467248908297,
184
- "grad_norm": 23.126216888427734,
185
- "learning_rate": 2.099344978165939e-05,
186
- "loss": 5.9357,
187
- "step": 2200
188
- },
189
- {
190
- "epoch": 2.5109170305676853,
191
- "grad_norm": 29.862232208251953,
192
- "learning_rate": 2.058406113537118e-05,
193
- "loss": 5.8846,
194
- "step": 2300
195
- },
196
- {
197
- "epoch": 2.6200873362445414,
198
- "grad_norm": 30.4029541015625,
199
- "learning_rate": 2.0174672489082972e-05,
200
- "loss": 5.8334,
201
- "step": 2400
202
- },
203
- {
204
- "epoch": 2.7292576419213974,
205
- "grad_norm": 30.72637367248535,
206
- "learning_rate": 1.976528384279476e-05,
207
- "loss": 5.8922,
208
- "step": 2500
209
- },
210
- {
211
- "epoch": 2.8384279475982535,
212
- "grad_norm": 24.41779136657715,
213
- "learning_rate": 1.935589519650655e-05,
214
- "loss": 5.912,
215
- "step": 2600
216
- },
217
- {
218
- "epoch": 2.947598253275109,
219
- "grad_norm": 27.00792121887207,
220
- "learning_rate": 1.894650655021834e-05,
221
- "loss": 5.655,
222
- "step": 2700
223
- },
224
- {
225
- "epoch": 3.0,
226
- "eval_avg_mae": 6.088820139567058,
227
- "eval_loss": 6.088819980621338,
228
- "eval_mae_lex": 5.295498847961426,
229
- "eval_mae_sem": 4.145097255706787,
230
- "eval_mae_syn": 8.8258638381958,
231
- "eval_runtime": 26.9401,
232
- "eval_samples_per_second": 271.974,
233
- "eval_steps_per_second": 8.5,
234
- "step": 2748
235
- },
236
- {
237
- "epoch": 3.056768558951965,
238
- "grad_norm": 25.13582420349121,
239
- "learning_rate": 1.8537117903930135e-05,
240
- "loss": 5.5729,
241
- "step": 2800
242
- },
243
- {
244
- "epoch": 3.165938864628821,
245
- "grad_norm": 26.646804809570312,
246
- "learning_rate": 1.8127729257641922e-05,
247
- "loss": 5.6211,
248
- "step": 2900
249
- },
250
- {
251
- "epoch": 3.2751091703056767,
252
- "grad_norm": 28.627378463745117,
253
- "learning_rate": 1.7718340611353713e-05,
254
- "loss": 5.5775,
255
- "step": 3000
256
- },
257
- {
258
- "epoch": 3.3842794759825328,
259
- "grad_norm": 28.54901123046875,
260
- "learning_rate": 1.7308951965065504e-05,
261
- "loss": 5.3086,
262
- "step": 3100
263
- },
264
- {
265
- "epoch": 3.493449781659389,
266
- "grad_norm": 27.549345016479492,
267
- "learning_rate": 1.689956331877729e-05,
268
- "loss": 5.5229,
269
- "step": 3200
270
- },
271
- {
272
- "epoch": 3.6026200873362444,
273
- "grad_norm": 29.306232452392578,
274
- "learning_rate": 1.649017467248908e-05,
275
- "loss": 5.6348,
276
- "step": 3300
277
- },
278
- {
279
- "epoch": 3.7117903930131004,
280
- "grad_norm": 29.256425857543945,
281
- "learning_rate": 1.6080786026200872e-05,
282
- "loss": 5.3936,
283
- "step": 3400
284
- },
285
- {
286
- "epoch": 3.8209606986899565,
287
- "grad_norm": 31.816057205200195,
288
- "learning_rate": 1.5671397379912666e-05,
289
- "loss": 5.431,
290
- "step": 3500
291
- },
292
- {
293
- "epoch": 3.930131004366812,
294
- "grad_norm": 25.876789093017578,
295
- "learning_rate": 1.5262008733624454e-05,
296
- "loss": 5.4981,
297
- "step": 3600
298
- },
299
- {
300
- "epoch": 4.0,
301
- "eval_avg_mae": 5.853533426920573,
302
- "eval_loss": 5.8535332679748535,
303
- "eval_mae_lex": 5.371854305267334,
304
- "eval_mae_sem": 3.812947988510132,
305
- "eval_mae_syn": 8.375797271728516,
306
- "eval_runtime": 26.9386,
307
- "eval_samples_per_second": 271.989,
308
- "eval_steps_per_second": 8.501,
309
- "step": 3664
310
  }
311
  ],
312
  "logging_steps": 100,
@@ -326,7 +101,7 @@
326
  "attributes": {}
327
  }
328
  },
329
- "total_flos": 7711064933354496.0,
330
  "train_batch_size": 32,
331
  "trial_name": null,
332
  "trial_params": null
 
1
  {
2
+ "best_metric": 7.529487609863281,
3
+ "best_model_checkpoint": "./results/checkpoint-916",
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 916,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.1091703056768559,
13
+ "grad_norm": 25.480337142944336,
14
  "learning_rate": 2.959061135371179e-05,
15
+ "loss": 56.923,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.2183406113537118,
20
+ "grad_norm": 39.87223815917969,
21
  "learning_rate": 2.918122270742358e-05,
22
+ "loss": 46.6475,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.32751091703056767,
27
+ "grad_norm": 48.05048751831055,
28
  "learning_rate": 2.877183406113537e-05,
29
+ "loss": 33.6867,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.4366812227074236,
34
+ "grad_norm": 31.941883087158203,
35
  "learning_rate": 2.8362445414847164e-05,
36
+ "loss": 21.1084,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.5458515283842795,
41
+ "grad_norm": 55.025856018066406,
42
  "learning_rate": 2.7953056768558954e-05,
43
+ "loss": 12.9495,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.6550218340611353,
48
+ "grad_norm": 34.957523345947266,
49
  "learning_rate": 2.7543668122270742e-05,
50
+ "loss": 10.0745,
51
  "step": 600
52
  },
53
  {
54
  "epoch": 0.7641921397379913,
55
+ "grad_norm": 24.020906448364258,
56
  "learning_rate": 2.7134279475982533e-05,
57
+ "loss": 8.3541,
58
  "step": 700
59
  },
60
  {
61
  "epoch": 0.8733624454148472,
62
+ "grad_norm": 32.709571838378906,
63
  "learning_rate": 2.6724890829694323e-05,
64
+ "loss": 7.5128,
65
  "step": 800
66
  },
67
  {
68
  "epoch": 0.982532751091703,
69
+ "grad_norm": 38.94672393798828,
70
  "learning_rate": 2.6315502183406114e-05,
71
+ "loss": 7.2241,
72
  "step": 900
73
  },
74
  {
75
  "epoch": 1.0,
76
+ "eval_avg_mae": 7.529487609863281,
77
+ "eval_loss": 7.529487609863281,
78
+ "eval_mae_lex": 6.992014408111572,
79
+ "eval_mae_sem": 5.432034492492676,
80
+ "eval_mae_syn": 10.164413452148438,
81
+ "eval_runtime": 27.1764,
82
+ "eval_samples_per_second": 269.609,
83
+ "eval_steps_per_second": 8.426,
84
  "step": 916
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  }
86
  ],
87
  "logging_steps": 100,
 
101
  "attributes": {}
102
  }
103
  },
104
+ "total_flos": 1927766233338624.0,
105
  "train_batch_size": 32,
106
  "trial_name": null,
107
  "trial_params": null