winnieyangwannan committed on
Commit
8bd0368
·
verified ·
1 Parent(s): ee92f97

Training in progress, step 200, checkpoint

Browse files
checkpoint-200/adapter_config.json CHANGED
@@ -24,12 +24,12 @@
24
  "revision": null,
25
  "target_modules": [
26
  "gate_proj",
27
- "q_proj",
28
  "k_proj",
29
- "up_proj",
30
  "o_proj",
31
- "down_proj",
32
- "v_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
24
  "revision": null,
25
  "target_modules": [
26
  "gate_proj",
27
+ "v_proj",
28
  "k_proj",
29
+ "q_proj",
30
  "o_proj",
31
+ "up_proj",
32
+ "down_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-200/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2150d7fcd41f36cde2dad101837fdaced78d5a7b3214074c0f26ec493f048a26
3
  size 216151256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2a84317654a196ed8df6e09a25614537e583919d403f1e3c38f2443c97e90cf
3
  size 216151256
checkpoint-200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:419d4c36022ec31cf0129a8bc118faa3db4c994beb353d46485016486c60651b
3
  size 432640054
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:140fa2c0878462470552e343fb151a98eebf709e8cda433088c757bab2c1c72d
3
  size 432640054
checkpoint-200/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33bb7ca93eba165d5dbde5336ffdac4e369756c81f66fbfa83ee5f59ea4c5215
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a49951da982a98d02a2a003574ee2422f1a7ca4cf2cfefdda4f84b07b79c6aae
3
  size 14244
checkpoint-200/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.25316455696202533,
5
- "eval_steps": 50,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
@@ -10,174 +10,302 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.012658227848101266,
13
- "grad_norm": 16.027029037475586,
14
  "learning_rate": 4.9789029535864986e-05,
15
- "loss": 2.6925,
 
 
 
 
 
 
 
 
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.02531645569620253,
20
- "grad_norm": 1.0948777198791504,
21
  "learning_rate": 4.957805907172996e-05,
22
- "loss": 1.386,
 
 
 
 
 
 
 
 
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.0379746835443038,
27
- "grad_norm": 1.1341983079910278,
28
  "learning_rate": 4.936708860759494e-05,
29
- "loss": 1.108,
 
 
 
 
 
 
 
 
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.05063291139240506,
34
- "grad_norm": 1.152009129524231,
35
  "learning_rate": 4.9156118143459915e-05,
36
- "loss": 0.9436,
 
 
 
 
 
 
 
 
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.06329113924050633,
41
- "grad_norm": 1.0990614891052246,
42
  "learning_rate": 4.89451476793249e-05,
43
- "loss": 0.7499,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.06329113924050633,
48
- "eval_loss": 0.7157873511314392,
49
- "eval_runtime": 12.1594,
50
- "eval_samples_per_second": 39.476,
51
- "eval_steps_per_second": 2.467,
52
  "step": 50
53
  },
54
  {
55
  "epoch": 0.0759493670886076,
56
- "grad_norm": 1.0284477472305298,
57
  "learning_rate": 4.8734177215189874e-05,
58
- "loss": 0.6095,
 
 
 
 
 
 
 
 
59
  "step": 60
60
  },
61
  {
62
  "epoch": 0.08860759493670886,
63
- "grad_norm": 0.8609589338302612,
64
  "learning_rate": 4.852320675105486e-05,
65
- "loss": 0.5355,
 
 
 
 
 
 
 
 
66
  "step": 70
67
  },
68
  {
69
  "epoch": 0.10126582278481013,
70
- "grad_norm": 0.9162376523017883,
71
  "learning_rate": 4.8312236286919834e-05,
72
- "loss": 0.5705,
 
 
 
 
 
 
 
 
73
  "step": 80
74
  },
75
  {
76
  "epoch": 0.11392405063291139,
77
- "grad_norm": 0.9415847659111023,
78
  "learning_rate": 4.810126582278481e-05,
79
- "loss": 0.5449,
 
 
 
 
 
 
 
 
80
  "step": 90
81
  },
82
  {
83
  "epoch": 0.12658227848101267,
84
- "grad_norm": 0.8756884336471558,
85
  "learning_rate": 4.789029535864979e-05,
86
- "loss": 0.5157,
87
  "step": 100
88
  },
89
  {
90
  "epoch": 0.12658227848101267,
91
- "eval_loss": 0.563517153263092,
92
- "eval_runtime": 12.1236,
93
- "eval_samples_per_second": 39.592,
94
- "eval_steps_per_second": 2.475,
95
  "step": 100
96
  },
97
  {
98
  "epoch": 0.13924050632911392,
99
- "grad_norm": 0.8702118396759033,
100
  "learning_rate": 4.767932489451477e-05,
101
- "loss": 0.53,
 
 
 
 
 
 
 
 
102
  "step": 110
103
  },
104
  {
105
  "epoch": 0.1518987341772152,
106
- "grad_norm": 0.8843992352485657,
107
  "learning_rate": 4.7468354430379746e-05,
108
- "loss": 0.4923,
 
 
 
 
 
 
 
 
109
  "step": 120
110
  },
111
  {
112
  "epoch": 0.16455696202531644,
113
- "grad_norm": 0.8294386863708496,
114
  "learning_rate": 4.725738396624473e-05,
115
- "loss": 0.4936,
 
 
 
 
 
 
 
 
116
  "step": 130
117
  },
118
  {
119
  "epoch": 0.17721518987341772,
120
- "grad_norm": 0.8978216648101807,
121
  "learning_rate": 4.704641350210971e-05,
122
- "loss": 0.4581,
 
 
 
 
 
 
 
 
123
  "step": 140
124
  },
125
  {
126
  "epoch": 0.189873417721519,
127
- "grad_norm": 0.8757727742195129,
128
  "learning_rate": 4.683544303797468e-05,
129
- "loss": 0.4571,
130
  "step": 150
131
  },
132
  {
133
  "epoch": 0.189873417721519,
134
- "eval_loss": 0.5096740126609802,
135
- "eval_runtime": 12.0892,
136
- "eval_samples_per_second": 39.705,
137
- "eval_steps_per_second": 2.482,
138
  "step": 150
139
  },
140
  {
141
  "epoch": 0.20253164556962025,
142
- "grad_norm": 0.8959233164787292,
143
  "learning_rate": 4.6624472573839666e-05,
144
- "loss": 0.4429,
 
 
 
 
 
 
 
 
145
  "step": 160
146
  },
147
  {
148
  "epoch": 0.21518987341772153,
149
- "grad_norm": 0.9160757660865784,
150
  "learning_rate": 4.641350210970464e-05,
151
- "loss": 0.4167,
 
 
 
 
 
 
 
 
152
  "step": 170
153
  },
154
  {
155
  "epoch": 0.22784810126582278,
156
- "grad_norm": 0.8140855431556702,
157
  "learning_rate": 4.6202531645569625e-05,
158
- "loss": 0.4249,
 
 
 
 
 
 
 
 
159
  "step": 180
160
  },
161
  {
162
  "epoch": 0.24050632911392406,
163
- "grad_norm": 0.8790073990821838,
164
  "learning_rate": 4.59915611814346e-05,
165
- "loss": 0.4198,
 
 
 
 
 
 
 
 
166
  "step": 190
167
  },
168
  {
169
  "epoch": 0.25316455696202533,
170
- "grad_norm": 0.8366842269897461,
171
  "learning_rate": 4.5780590717299585e-05,
172
- "loss": 0.4148,
173
  "step": 200
174
  },
175
  {
176
  "epoch": 0.25316455696202533,
177
- "eval_loss": 0.47525277733802795,
178
- "eval_runtime": 12.0592,
179
- "eval_samples_per_second": 39.803,
180
- "eval_steps_per_second": 2.488,
181
  "step": 200
182
  }
183
  ],
@@ -185,7 +313,7 @@
185
  "max_steps": 2370,
186
  "num_input_tokens_seen": 0,
187
  "num_train_epochs": 3,
188
- "save_steps": 100,
189
  "stateful_callbacks": {
190
  "TrainerControl": {
191
  "args": {
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 0.25316455696202533,
5
+ "eval_steps": 10,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.012658227848101266,
13
+ "grad_norm": 12.934767723083496,
14
  "learning_rate": 4.9789029535864986e-05,
15
+ "loss": 2.6869,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.012658227848101266,
20
+ "eval_loss": 1.651185393333435,
21
+ "eval_runtime": 11.831,
22
+ "eval_samples_per_second": 40.571,
23
+ "eval_steps_per_second": 2.536,
24
  "step": 10
25
  },
26
  {
27
  "epoch": 0.02531645569620253,
28
+ "grad_norm": 1.104798674583435,
29
  "learning_rate": 4.957805907172996e-05,
30
+ "loss": 1.3694,
31
+ "step": 20
32
+ },
33
+ {
34
+ "epoch": 0.02531645569620253,
35
+ "eval_loss": 1.2200205326080322,
36
+ "eval_runtime": 11.8928,
37
+ "eval_samples_per_second": 40.361,
38
+ "eval_steps_per_second": 2.523,
39
  "step": 20
40
  },
41
  {
42
  "epoch": 0.0379746835443038,
43
+ "grad_norm": 1.1069142818450928,
44
  "learning_rate": 4.936708860759494e-05,
45
+ "loss": 1.1029,
46
+ "step": 30
47
+ },
48
+ {
49
+ "epoch": 0.0379746835443038,
50
+ "eval_loss": 1.0691736936569214,
51
+ "eval_runtime": 11.9127,
52
+ "eval_samples_per_second": 40.293,
53
+ "eval_steps_per_second": 2.518,
54
  "step": 30
55
  },
56
  {
57
  "epoch": 0.05063291139240506,
58
+ "grad_norm": 1.1594161987304688,
59
  "learning_rate": 4.9156118143459915e-05,
60
+ "loss": 0.9395,
61
+ "step": 40
62
+ },
63
+ {
64
+ "epoch": 0.05063291139240506,
65
+ "eval_loss": 0.9162012934684753,
66
+ "eval_runtime": 11.9373,
67
+ "eval_samples_per_second": 40.21,
68
+ "eval_steps_per_second": 2.513,
69
  "step": 40
70
  },
71
  {
72
  "epoch": 0.06329113924050633,
73
+ "grad_norm": 1.1133538484573364,
74
  "learning_rate": 4.89451476793249e-05,
75
+ "loss": 0.7489,
76
  "step": 50
77
  },
78
  {
79
  "epoch": 0.06329113924050633,
80
+ "eval_loss": 0.713701605796814,
81
+ "eval_runtime": 11.9661,
82
+ "eval_samples_per_second": 40.113,
83
+ "eval_steps_per_second": 2.507,
84
  "step": 50
85
  },
86
  {
87
  "epoch": 0.0759493670886076,
88
+ "grad_norm": 1.0406183004379272,
89
  "learning_rate": 4.8734177215189874e-05,
90
+ "loss": 0.6096,
91
+ "step": 60
92
+ },
93
+ {
94
+ "epoch": 0.0759493670886076,
95
+ "eval_loss": 0.6309535503387451,
96
+ "eval_runtime": 11.9895,
97
+ "eval_samples_per_second": 40.035,
98
+ "eval_steps_per_second": 2.502,
99
  "step": 60
100
  },
101
  {
102
  "epoch": 0.08860759493670886,
103
+ "grad_norm": 0.8599340915679932,
104
  "learning_rate": 4.852320675105486e-05,
105
+ "loss": 0.5357,
106
+ "step": 70
107
+ },
108
+ {
109
+ "epoch": 0.08860759493670886,
110
+ "eval_loss": 0.6159886717796326,
111
+ "eval_runtime": 12.0107,
112
+ "eval_samples_per_second": 39.965,
113
+ "eval_steps_per_second": 2.498,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.10126582278481013,
118
+ "grad_norm": 0.9128267168998718,
119
  "learning_rate": 4.8312236286919834e-05,
120
+ "loss": 0.5703,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.10126582278481013,
125
+ "eval_loss": 0.5933937430381775,
126
+ "eval_runtime": 11.9716,
127
+ "eval_samples_per_second": 40.095,
128
+ "eval_steps_per_second": 2.506,
129
  "step": 80
130
  },
131
  {
132
  "epoch": 0.11392405063291139,
133
+ "grad_norm": 0.9396541118621826,
134
  "learning_rate": 4.810126582278481e-05,
135
+ "loss": 0.5445,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.11392405063291139,
140
+ "eval_loss": 0.5727818608283997,
141
+ "eval_runtime": 11.9685,
142
+ "eval_samples_per_second": 40.105,
143
+ "eval_steps_per_second": 2.507,
144
  "step": 90
145
  },
146
  {
147
  "epoch": 0.12658227848101267,
148
+ "grad_norm": 0.8805290460586548,
149
  "learning_rate": 4.789029535864979e-05,
150
+ "loss": 0.5151,
151
  "step": 100
152
  },
153
  {
154
  "epoch": 0.12658227848101267,
155
+ "eval_loss": 0.5640087127685547,
156
+ "eval_runtime": 11.9824,
157
+ "eval_samples_per_second": 40.059,
158
+ "eval_steps_per_second": 2.504,
159
  "step": 100
160
  },
161
  {
162
  "epoch": 0.13924050632911392,
163
+ "grad_norm": 0.8829126954078674,
164
  "learning_rate": 4.767932489451477e-05,
165
+ "loss": 0.5301,
166
+ "step": 110
167
+ },
168
+ {
169
+ "epoch": 0.13924050632911392,
170
+ "eval_loss": 0.5558986067771912,
171
+ "eval_runtime": 11.9942,
172
+ "eval_samples_per_second": 40.019,
173
+ "eval_steps_per_second": 2.501,
174
  "step": 110
175
  },
176
  {
177
  "epoch": 0.1518987341772152,
178
+ "grad_norm": 0.8889341950416565,
179
  "learning_rate": 4.7468354430379746e-05,
180
+ "loss": 0.4929,
181
+ "step": 120
182
+ },
183
+ {
184
+ "epoch": 0.1518987341772152,
185
+ "eval_loss": 0.5422877073287964,
186
+ "eval_runtime": 11.9624,
187
+ "eval_samples_per_second": 40.126,
188
+ "eval_steps_per_second": 2.508,
189
  "step": 120
190
  },
191
  {
192
  "epoch": 0.16455696202531644,
193
+ "grad_norm": 0.8428446054458618,
194
  "learning_rate": 4.725738396624473e-05,
195
+ "loss": 0.4932,
196
+ "step": 130
197
+ },
198
+ {
199
+ "epoch": 0.16455696202531644,
200
+ "eval_loss": 0.5370256900787354,
201
+ "eval_runtime": 11.9786,
202
+ "eval_samples_per_second": 40.071,
203
+ "eval_steps_per_second": 2.504,
204
  "step": 130
205
  },
206
  {
207
  "epoch": 0.17721518987341772,
208
+ "grad_norm": 0.8985374569892883,
209
  "learning_rate": 4.704641350210971e-05,
210
+ "loss": 0.4589,
211
+ "step": 140
212
+ },
213
+ {
214
+ "epoch": 0.17721518987341772,
215
+ "eval_loss": 0.5208094716072083,
216
+ "eval_runtime": 11.9711,
217
+ "eval_samples_per_second": 40.097,
218
+ "eval_steps_per_second": 2.506,
219
  "step": 140
220
  },
221
  {
222
  "epoch": 0.189873417721519,
223
+ "grad_norm": 0.8704663515090942,
224
  "learning_rate": 4.683544303797468e-05,
225
+ "loss": 0.4585,
226
  "step": 150
227
  },
228
  {
229
  "epoch": 0.189873417721519,
230
+ "eval_loss": 0.5105039477348328,
231
+ "eval_runtime": 11.9751,
232
+ "eval_samples_per_second": 40.083,
233
+ "eval_steps_per_second": 2.505,
234
  "step": 150
235
  },
236
  {
237
  "epoch": 0.20253164556962025,
238
+ "grad_norm": 0.8930565714836121,
239
  "learning_rate": 4.6624472573839666e-05,
240
+ "loss": 0.4438,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 0.20253164556962025,
245
+ "eval_loss": 0.4988265931606293,
246
+ "eval_runtime": 11.9819,
247
+ "eval_samples_per_second": 40.06,
248
+ "eval_steps_per_second": 2.504,
249
  "step": 160
250
  },
251
  {
252
  "epoch": 0.21518987341772153,
253
+ "grad_norm": 0.9236075282096863,
254
  "learning_rate": 4.641350210970464e-05,
255
+ "loss": 0.4171,
256
+ "step": 170
257
+ },
258
+ {
259
+ "epoch": 0.21518987341772153,
260
+ "eval_loss": 0.4941176176071167,
261
+ "eval_runtime": 11.9797,
262
+ "eval_samples_per_second": 40.068,
263
+ "eval_steps_per_second": 2.504,
264
  "step": 170
265
  },
266
  {
267
  "epoch": 0.22784810126582278,
268
+ "grad_norm": 0.8057528138160706,
269
  "learning_rate": 4.6202531645569625e-05,
270
+ "loss": 0.4244,
271
+ "step": 180
272
+ },
273
+ {
274
+ "epoch": 0.22784810126582278,
275
+ "eval_loss": 0.489634245634079,
276
+ "eval_runtime": 11.9664,
277
+ "eval_samples_per_second": 40.112,
278
+ "eval_steps_per_second": 2.507,
279
  "step": 180
280
  },
281
  {
282
  "epoch": 0.24050632911392406,
283
+ "grad_norm": 0.8802728056907654,
284
  "learning_rate": 4.59915611814346e-05,
285
+ "loss": 0.4196,
286
+ "step": 190
287
+ },
288
+ {
289
+ "epoch": 0.24050632911392406,
290
+ "eval_loss": 0.48033541440963745,
291
+ "eval_runtime": 11.9754,
292
+ "eval_samples_per_second": 40.082,
293
+ "eval_steps_per_second": 2.505,
294
  "step": 190
295
  },
296
  {
297
  "epoch": 0.25316455696202533,
298
+ "grad_norm": 0.8316253423690796,
299
  "learning_rate": 4.5780590717299585e-05,
300
+ "loss": 0.4144,
301
  "step": 200
302
  },
303
  {
304
  "epoch": 0.25316455696202533,
305
+ "eval_loss": 0.4757111966609955,
306
+ "eval_runtime": 11.9744,
307
+ "eval_samples_per_second": 40.086,
308
+ "eval_steps_per_second": 2.505,
309
  "step": 200
310
  }
311
  ],
 
313
  "max_steps": 2370,
314
  "num_input_tokens_seen": 0,
315
  "num_train_epochs": 3,
316
+ "save_steps": 10,
317
  "stateful_callbacks": {
318
  "TrainerControl": {
319
  "args": {
checkpoint-200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8ef5fa4aad3a350c14df025074931ad8a003d4b851f4886f3b2f66ae6653e4b
3
  size 5880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1e7aae8b855413d55586dd498c7d7d805796f0c02067ce9d8ccb1ef37f72d29
3
  size 5880