felipeoes committed on
Commit
d57d830
·
verified ·
1 Parent(s): 3900b73

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/felipealumni-usp/huggingface/runs/14hhk8v2)
31
 
32
  This model was trained with SFT.
33
 
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/felipealumni-usp/huggingface/runs/j5fc5zrz)
31
 
32
  This model was trained with SFT.
33
 
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa6314d207d082c24a0086630dde03167d9e0a9590cf9bf2e427b3b4e8752117
3
  size 2185294624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a76f4c4267621f6756d7efc35412a52e1c431c8020c37ec87679dbaf899e34be
3
  size 2185294624
all_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9992542878448919,
3
+ "eval_samples": 26,
4
+ "total_flos": 7.44129634982953e+16,
5
+ "train_loss": 0.2547340336130626,
6
+ "train_runtime": 9634.561,
7
+ "train_samples": 25295,
8
+ "train_samples_per_second": 0.556,
9
+ "train_steps_per_second": 0.07
10
+ }
train_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9992542878448919,
3
+ "eval_samples": 26,
4
+ "total_flos": 7.44129634982953e+16,
5
+ "train_loss": 0.2547340336130626,
6
+ "train_runtime": 9634.561,
7
+ "train_samples": 25295,
8
+ "train_samples_per_second": 0.556,
9
+ "train_steps_per_second": 0.07
10
+ }
trainer_state.json ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.4858,
3
+ "best_model_checkpoint": "runs/cocoruta2-llama3-1-8b-regex-only-valid/checkpoint-350",
4
+ "epoch": 0.9992542878448919,
5
+ "eval_steps": 25,
6
+ "global_step": 670,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.037285607755406416,
13
+ "grad_norm": 0.5538629931679852,
14
+ "learning_rate": 7.46268656716418e-05,
15
+ "loss": 0.8987,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.037285607755406416,
20
+ "eval_loss": 0.6527890563011169,
21
+ "eval_runtime": 4.7198,
22
+ "eval_samples_per_second": 1.059,
23
+ "eval_steps_per_second": 0.636,
24
+ "step": 25
25
+ },
26
+ {
27
+ "epoch": 0.07457121551081283,
28
+ "grad_norm": 0.2341721772038344,
29
+ "learning_rate": 0.0001492537313432836,
30
+ "loss": 0.6394,
31
+ "step": 50
32
+ },
33
+ {
34
+ "epoch": 0.07457121551081283,
35
+ "eval_loss": 0.5486425757408142,
36
+ "eval_runtime": 4.7295,
37
+ "eval_samples_per_second": 1.057,
38
+ "eval_steps_per_second": 0.634,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.11185682326621924,
43
+ "grad_norm": 0.19912920431393782,
44
+ "learning_rate": 0.00019991315351855748,
45
+ "loss": 0.6007,
46
+ "step": 75
47
+ },
48
+ {
49
+ "epoch": 0.11185682326621924,
50
+ "eval_loss": 0.5331323146820068,
51
+ "eval_runtime": 4.6724,
52
+ "eval_samples_per_second": 1.07,
53
+ "eval_steps_per_second": 0.642,
54
+ "step": 75
55
+ },
56
+ {
57
+ "epoch": 0.14914243102162567,
58
+ "grad_norm": 0.22498739924316974,
59
+ "learning_rate": 0.0001985256759242359,
60
+ "loss": 0.5696,
61
+ "step": 100
62
+ },
63
+ {
64
+ "epoch": 0.14914243102162567,
65
+ "eval_loss": 0.49856358766555786,
66
+ "eval_runtime": 4.7327,
67
+ "eval_samples_per_second": 1.056,
68
+ "eval_steps_per_second": 0.634,
69
+ "step": 100
70
+ },
71
+ {
72
+ "epoch": 0.18642803877703207,
73
+ "grad_norm": 0.14018935803156768,
74
+ "learning_rate": 0.00019546910545535558,
75
+ "loss": 0.5523,
76
+ "step": 125
77
+ },
78
+ {
79
+ "epoch": 0.18642803877703207,
80
+ "eval_loss": 0.48138752579689026,
81
+ "eval_runtime": 4.6757,
82
+ "eval_samples_per_second": 1.069,
83
+ "eval_steps_per_second": 0.642,
84
+ "step": 125
85
+ },
86
+ {
87
+ "epoch": 0.22371364653243847,
88
+ "grad_norm": 0.16571925714609734,
89
+ "learning_rate": 0.00019079522252288386,
90
+ "loss": 0.5779,
91
+ "step": 150
92
+ },
93
+ {
94
+ "epoch": 0.22371364653243847,
95
+ "eval_loss": 0.472788006067276,
96
+ "eval_runtime": 4.7083,
97
+ "eval_samples_per_second": 1.062,
98
+ "eval_steps_per_second": 0.637,
99
+ "step": 150
100
+ },
101
+ {
102
+ "epoch": 0.2609992542878449,
103
+ "grad_norm": 0.14926948884435004,
104
+ "learning_rate": 0.00018458320592590975,
105
+ "loss": 0.5351,
106
+ "step": 175
107
+ },
108
+ {
109
+ "epoch": 0.2609992542878449,
110
+ "eval_loss": 0.467672735452652,
111
+ "eval_runtime": 4.6956,
112
+ "eval_samples_per_second": 1.065,
113
+ "eval_steps_per_second": 0.639,
114
+ "step": 175
115
+ },
116
+ {
117
+ "epoch": 0.29828486204325133,
118
+ "grad_norm": 0.13621513300078802,
119
+ "learning_rate": 0.00017693829150820068,
120
+ "loss": 0.5251,
121
+ "step": 200
122
+ },
123
+ {
124
+ "epoch": 0.29828486204325133,
125
+ "eval_loss": 0.4527561664581299,
126
+ "eval_runtime": 4.7174,
127
+ "eval_samples_per_second": 1.06,
128
+ "eval_steps_per_second": 0.636,
129
+ "step": 200
130
+ },
131
+ {
132
+ "epoch": 0.33557046979865773,
133
+ "grad_norm": 0.14343687732505184,
134
+ "learning_rate": 0.00016798998939045895,
135
+ "loss": 0.5301,
136
+ "step": 225
137
+ },
138
+ {
139
+ "epoch": 0.33557046979865773,
140
+ "eval_loss": 0.45074066519737244,
141
+ "eval_runtime": 4.7651,
142
+ "eval_samples_per_second": 1.049,
143
+ "eval_steps_per_second": 0.63,
144
+ "step": 225
145
+ },
146
+ {
147
+ "epoch": 0.37285607755406414,
148
+ "grad_norm": 0.14384455598548593,
149
+ "learning_rate": 0.00015788988997959114,
150
+ "loss": 0.5554,
151
+ "step": 250
152
+ },
153
+ {
154
+ "epoch": 0.37285607755406414,
155
+ "eval_loss": 0.45150551199913025,
156
+ "eval_runtime": 4.7234,
157
+ "eval_samples_per_second": 1.059,
158
+ "eval_steps_per_second": 0.635,
159
+ "step": 250
160
+ },
161
+ {
162
+ "epoch": 0.41014168530947054,
163
+ "grad_norm": 0.11661007763800872,
164
+ "learning_rate": 0.0001468090959227082,
165
+ "loss": 0.5198,
166
+ "step": 275
167
+ },
168
+ {
169
+ "epoch": 0.41014168530947054,
170
+ "eval_loss": 0.4460136294364929,
171
+ "eval_runtime": 4.7242,
172
+ "eval_samples_per_second": 1.058,
173
+ "eval_steps_per_second": 0.635,
174
+ "step": 275
175
+ },
176
+ {
177
+ "epoch": 0.44742729306487694,
178
+ "grad_norm": 0.1321962416990341,
179
+ "learning_rate": 0.0001349353235103232,
180
+ "loss": 0.5484,
181
+ "step": 300
182
+ },
183
+ {
184
+ "epoch": 0.44742729306487694,
185
+ "eval_loss": 0.4421899914741516,
186
+ "eval_runtime": 4.7511,
187
+ "eval_samples_per_second": 1.052,
188
+ "eval_steps_per_second": 0.631,
189
+ "step": 300
190
+ },
191
+ {
192
+ "epoch": 0.48471290082028334,
193
+ "grad_norm": 0.13900994098830932,
194
+ "learning_rate": 0.0001224697226329772,
195
+ "loss": 0.5223,
196
+ "step": 325
197
+ },
198
+ {
199
+ "epoch": 0.48471290082028334,
200
+ "eval_loss": 0.4396364092826843,
201
+ "eval_runtime": 4.705,
202
+ "eval_samples_per_second": 1.063,
203
+ "eval_steps_per_second": 0.638,
204
+ "step": 325
205
+ },
206
+ {
207
+ "epoch": 0.5219985085756897,
208
+ "grad_norm": 0.15825647169367713,
209
+ "learning_rate": 0.00010962346916341903,
210
+ "loss": 0.4858,
211
+ "step": 350
212
+ },
213
+ {
214
+ "epoch": 0.5219985085756897,
215
+ "eval_loss": 0.43353357911109924,
216
+ "eval_runtime": 4.7849,
217
+ "eval_samples_per_second": 1.045,
218
+ "eval_steps_per_second": 0.627,
219
+ "step": 350
220
+ },
221
+ {
222
+ "epoch": 0.5592841163310962,
223
+ "grad_norm": 0.11937908855087626,
224
+ "learning_rate": 9.661418749173467e-05,
225
+ "loss": 0.5051,
226
+ "step": 375
227
+ },
228
+ {
229
+ "epoch": 0.5592841163310962,
230
+ "eval_loss": 0.42787012457847595,
231
+ "eval_runtime": 4.7427,
232
+ "eval_samples_per_second": 1.054,
233
+ "eval_steps_per_second": 0.633,
234
+ "step": 375
235
+ },
236
+ {
237
+ "epoch": 0.5965697240865027,
238
+ "grad_norm": 0.1236665974221879,
239
+ "learning_rate": 8.366226381814697e-05,
240
+ "loss": 0.489,
241
+ "step": 400
242
+ },
243
+ {
244
+ "epoch": 0.5965697240865027,
245
+ "eval_loss": 0.4265735149383545,
246
+ "eval_runtime": 4.7586,
247
+ "eval_samples_per_second": 1.051,
248
+ "eval_steps_per_second": 0.63,
249
+ "step": 400
250
+ },
251
+ {
252
+ "epoch": 0.633855331841909,
253
+ "grad_norm": 0.1340681564316269,
254
+ "learning_rate": 7.09871126588481e-05,
255
+ "loss": 0.4992,
256
+ "step": 425
257
+ },
258
+ {
259
+ "epoch": 0.633855331841909,
260
+ "eval_loss": 0.41966643929481506,
261
+ "eval_runtime": 4.725,
262
+ "eval_samples_per_second": 1.058,
263
+ "eval_steps_per_second": 0.635,
264
+ "step": 425
265
+ },
266
+ {
267
+ "epoch": 0.6711409395973155,
268
+ "grad_norm": 0.14955138964358772,
269
+ "learning_rate": 5.880345981282876e-05,
270
+ "loss": 0.4985,
271
+ "step": 450
272
+ },
273
+ {
274
+ "epoch": 0.6711409395973155,
275
+ "eval_loss": 0.41805362701416016,
276
+ "eval_runtime": 4.6928,
277
+ "eval_samples_per_second": 1.065,
278
+ "eval_steps_per_second": 0.639,
279
+ "step": 450
280
+ },
281
+ {
282
+ "epoch": 0.7084265473527218,
283
+ "grad_norm": 0.13281003649358977,
284
+ "learning_rate": 4.7317704758809946e-05,
285
+ "loss": 0.511,
286
+ "step": 475
287
+ },
288
+ {
289
+ "epoch": 0.7084265473527218,
290
+ "eval_loss": 0.4160347878932953,
291
+ "eval_runtime": 4.6998,
292
+ "eval_samples_per_second": 1.064,
293
+ "eval_steps_per_second": 0.638,
294
+ "step": 475
295
+ },
296
+ {
297
+ "epoch": 0.7457121551081283,
298
+ "grad_norm": 0.13212294340504707,
299
+ "learning_rate": 3.672442410577965e-05,
300
+ "loss": 0.5103,
301
+ "step": 500
302
+ },
303
+ {
304
+ "epoch": 0.7457121551081283,
305
+ "eval_loss": 0.41526561975479126,
306
+ "eval_runtime": 4.7196,
307
+ "eval_samples_per_second": 1.059,
308
+ "eval_steps_per_second": 0.636,
309
+ "step": 500
310
+ },
311
+ {
312
+ "epoch": 0.7829977628635347,
313
+ "grad_norm": 0.13255473695033904,
314
+ "learning_rate": 2.7203075331094017e-05,
315
+ "loss": 0.4953,
316
+ "step": 525
317
+ },
318
+ {
319
+ "epoch": 0.7829977628635347,
320
+ "eval_loss": 0.41363996267318726,
321
+ "eval_runtime": 4.6994,
322
+ "eval_samples_per_second": 1.064,
323
+ "eval_steps_per_second": 0.638,
324
+ "step": 525
325
+ },
326
+ {
327
+ "epoch": 0.8202833706189411,
328
+ "grad_norm": 0.11553089711715571,
329
+ "learning_rate": 1.89149566470915e-05,
330
+ "loss": 0.5017,
331
+ "step": 550
332
+ },
333
+ {
334
+ "epoch": 0.8202833706189411,
335
+ "eval_loss": 0.4126836359500885,
336
+ "eval_runtime": 4.7366,
337
+ "eval_samples_per_second": 1.056,
338
+ "eval_steps_per_second": 0.633,
339
+ "step": 550
340
+ },
341
+ {
342
+ "epoch": 0.8575689783743475,
343
+ "grad_norm": 0.1205660312265152,
344
+ "learning_rate": 1.2000474498175552e-05,
345
+ "loss": 0.488,
346
+ "step": 575
347
+ },
348
+ {
349
+ "epoch": 0.8575689783743475,
350
+ "eval_loss": 0.41148170828819275,
351
+ "eval_runtime": 4.6986,
352
+ "eval_samples_per_second": 1.064,
353
+ "eval_steps_per_second": 0.638,
354
+ "step": 575
355
+ },
356
+ {
357
+ "epoch": 0.8948545861297539,
358
+ "grad_norm": 0.11357983134630366,
359
+ "learning_rate": 6.576764978849004e-06,
360
+ "loss": 0.4862,
361
+ "step": 600
362
+ },
363
+ {
364
+ "epoch": 0.8948545861297539,
365
+ "eval_loss": 0.4110669493675232,
366
+ "eval_runtime": 4.7813,
367
+ "eval_samples_per_second": 1.046,
368
+ "eval_steps_per_second": 0.627,
369
+ "step": 600
370
+ },
371
+ {
372
+ "epoch": 0.9321401938851603,
373
+ "grad_norm": 0.11689127563900782,
374
+ "learning_rate": 2.735709467518699e-06,
375
+ "loss": 0.4866,
376
+ "step": 625
377
+ },
378
+ {
379
+ "epoch": 0.9321401938851603,
380
+ "eval_loss": 0.41068965196609497,
381
+ "eval_runtime": 4.7375,
382
+ "eval_samples_per_second": 1.055,
383
+ "eval_steps_per_second": 0.633,
384
+ "step": 625
385
+ },
386
+ {
387
+ "epoch": 0.9694258016405667,
388
+ "grad_norm": 0.12155258793935453,
389
+ "learning_rate": 5.42378092601481e-07,
390
+ "loss": 0.4902,
391
+ "step": 650
392
+ },
393
+ {
394
+ "epoch": 0.9694258016405667,
395
+ "eval_loss": 0.4103812277317047,
396
+ "eval_runtime": 4.6763,
397
+ "eval_samples_per_second": 1.069,
398
+ "eval_steps_per_second": 0.642,
399
+ "step": 650
400
+ },
401
+ {
402
+ "epoch": 0.9992542878448919,
403
+ "step": 670,
404
+ "total_flos": 7.44129634982953e+16,
405
+ "train_loss": 0.2547340336130626,
406
+ "train_runtime": 9634.561,
407
+ "train_samples_per_second": 0.556,
408
+ "train_steps_per_second": 0.07
409
+ }
410
+ ],
411
+ "logging_steps": 25,
412
+ "max_steps": 670,
413
+ "num_input_tokens_seen": 0,
414
+ "num_train_epochs": 1,
415
+ "save_steps": 25,
416
+ "stateful_callbacks": {
417
+ "TrainerControl": {
418
+ "args": {
419
+ "should_epoch_stop": false,
420
+ "should_evaluate": false,
421
+ "should_log": false,
422
+ "should_save": true,
423
+ "should_training_stop": true
424
+ },
425
+ "attributes": {}
426
+ }
427
+ },
428
+ "total_flos": 7.44129634982953e+16,
429
+ "train_batch_size": 2,
430
+ "trial_name": null,
431
+ "trial_params": null
432
+ }