iHateNLP committed on
Commit
eb9372b
·
verified ·
1 Parent(s): d4cda17

Training Examples: 10-20k

Browse files
Files changed (6) hide show
  1. README.md +0 -2
  2. optimizer.pt +1 -1
  3. scaler.pt +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +199 -199
  6. training_args.bin +1 -1
README.md CHANGED
@@ -1,8 +1,6 @@
1
  ---
2
  base_model: unsloth/Phi-3.5-mini-instruct
3
  library_name: peft
4
- tags:
5
- - unsloth
6
  ---
7
 
8
  # Model Card for Model ID
 
1
  ---
2
  base_model: unsloth/Phi-3.5-mini-instruct
3
  library_name: peft
 
 
4
  ---
5
 
6
  # Model Card for Model ID
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f863b9077713f0e757fcdcacee68ba1dcf99ea980774246b3554039ed33fffdb
3
  size 61227348
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f343725d809da4ffaf684a244a4100eecaa26a54d067b93505a4ea66f68b319
3
  size 61227348
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c93c597eb03d2515505b488c57afff1198972d636568623b2424b16127e41db5
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57acdb1f05c51c1c33f6ebed56a52d6cf566d8a89c725b24e17cf2ddc63a1a1f
3
  size 988
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f40383c44c7538547fd9cf2b2d066e1e62a7c77bd533c66118ed64f4b2c883d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65471f2a90c88eeb3b60677a5a2bfebe038a7e13f123565e2531cb20a70783a
3
  size 1064
trainer_state.json CHANGED
@@ -3,357 +3,357 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 4820,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0002074688796680498,
13
- "grad_norm": 0.655322790145874,
14
- "learning_rate": 2.074688796680498e-07,
15
- "loss": 1.8621,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.02074688796680498,
20
- "grad_norm": 0.2854801118373871,
21
- "learning_rate": 2.0746887966804982e-05,
22
- "loss": 1.2556,
23
  "step": 100
24
  },
25
  {
26
- "epoch": 0.04149377593360996,
27
- "grad_norm": 0.3198237419128418,
28
- "learning_rate": 4.1493775933609964e-05,
29
- "loss": 1.003,
30
  "step": 200
31
  },
32
  {
33
- "epoch": 0.06224066390041494,
34
- "grad_norm": 0.40545061230659485,
35
- "learning_rate": 6.224066390041494e-05,
36
- "loss": 0.9348,
37
  "step": 300
38
  },
39
  {
40
- "epoch": 0.08298755186721991,
41
- "grad_norm": 0.18410639464855194,
42
- "learning_rate": 8.298755186721993e-05,
43
- "loss": 0.9333,
44
  "step": 400
45
  },
46
  {
47
- "epoch": 0.1037344398340249,
48
- "grad_norm": 0.21944168210029602,
49
- "learning_rate": 0.00010373443983402491,
50
- "loss": 0.9156,
51
  "step": 500
52
  },
53
  {
54
- "epoch": 0.12448132780082988,
55
- "grad_norm": 0.20224255323410034,
56
- "learning_rate": 0.00012448132780082987,
57
- "loss": 0.9411,
58
  "step": 600
59
  },
60
  {
61
- "epoch": 0.14522821576763487,
62
- "grad_norm": 0.33025670051574707,
63
- "learning_rate": 0.00014522821576763486,
64
- "loss": 0.8919,
65
  "step": 700
66
  },
67
  {
68
- "epoch": 0.16597510373443983,
69
- "grad_norm": 0.13862663507461548,
70
- "learning_rate": 0.00016597510373443986,
71
- "loss": 0.8931,
72
  "step": 800
73
  },
74
  {
75
- "epoch": 0.18672199170124482,
76
- "grad_norm": 0.41121917963027954,
77
- "learning_rate": 0.00018672199170124482,
78
- "loss": 0.912,
79
  "step": 900
80
  },
81
  {
82
- "epoch": 0.2074688796680498,
83
- "grad_norm": 0.25411322712898254,
84
- "learning_rate": 0.00019995698998770956,
85
- "loss": 0.9075,
86
  "step": 1000
87
  },
88
  {
89
- "epoch": 0.22821576763485477,
90
- "grad_norm": 0.3547718822956085,
91
- "learning_rate": 0.00019938676188562863,
92
- "loss": 0.9199,
93
  "step": 1100
94
  },
95
  {
96
- "epoch": 0.24896265560165975,
97
- "grad_norm": 0.4020107388496399,
98
- "learning_rate": 0.00019815718684822688,
99
- "loss": 0.9164,
100
  "step": 1200
101
  },
102
  {
103
- "epoch": 0.2697095435684647,
104
- "grad_norm": 0.21474340558052063,
105
- "learning_rate": 0.00019627642206381863,
106
- "loss": 0.881,
107
  "step": 1300
108
  },
109
  {
110
- "epoch": 0.29045643153526973,
111
- "grad_norm": 0.18409068882465363,
112
- "learning_rate": 0.00019375694481280965,
113
- "loss": 0.8793,
114
  "step": 1400
115
  },
116
  {
117
- "epoch": 0.3112033195020747,
118
- "grad_norm": 0.4158700704574585,
119
- "learning_rate": 0.0001906154696915157,
120
- "loss": 0.8633,
121
  "step": 1500
122
  },
123
  {
124
- "epoch": 0.33195020746887965,
125
- "grad_norm": 0.3469320237636566,
126
- "learning_rate": 0.00018687283772498206,
127
- "loss": 0.897,
128
  "step": 1600
129
  },
130
  {
131
- "epoch": 0.35269709543568467,
132
- "grad_norm": 0.4698736369609833,
133
- "learning_rate": 0.00018255387810444448,
134
- "loss": 0.8829,
135
  "step": 1700
136
  },
137
  {
138
- "epoch": 0.37344398340248963,
139
- "grad_norm": 0.3711629807949066,
140
- "learning_rate": 0.0001776872434666882,
141
- "loss": 0.8862,
142
  "step": 1800
143
  },
144
  {
145
- "epoch": 0.3941908713692946,
146
- "grad_norm": 0.2403649389743805,
147
- "learning_rate": 0.000172305219808086,
148
- "loss": 0.8799,
149
  "step": 1900
150
  },
151
  {
152
- "epoch": 0.4149377593360996,
153
- "grad_norm": 0.45258045196533203,
154
- "learning_rate": 0.00016644351229437416,
155
- "loss": 0.8671,
156
  "step": 2000
157
  },
158
  {
159
- "epoch": 0.43568464730290457,
160
- "grad_norm": 0.19893667101860046,
161
- "learning_rate": 0.00016014100838713797,
162
- "loss": 0.8391,
163
  "step": 2100
164
  },
165
  {
166
- "epoch": 0.45643153526970953,
167
- "grad_norm": 0.44036027789115906,
168
- "learning_rate": 0.00015343951985846095,
169
- "loss": 0.847,
170
  "step": 2200
171
  },
172
  {
173
- "epoch": 0.47717842323651455,
174
- "grad_norm": 0.17648863792419434,
175
- "learning_rate": 0.00014638350540525246,
176
- "loss": 0.8721,
177
  "step": 2300
178
  },
179
  {
180
- "epoch": 0.4979253112033195,
181
- "grad_norm": 0.39910203218460083,
182
- "learning_rate": 0.0001390197757034721,
183
- "loss": 0.8791,
184
  "step": 2400
185
  },
186
  {
187
- "epoch": 0.5186721991701245,
188
- "grad_norm": 0.39837658405303955,
189
- "learning_rate": 0.00013139718285896655,
190
- "loss": 0.8557,
191
  "step": 2500
192
  },
193
  {
194
- "epoch": 0.5394190871369294,
195
- "grad_norm": 0.3242895305156708,
196
- "learning_rate": 0.00012356629631514929,
197
- "loss": 0.8634,
198
  "step": 2600
199
  },
200
  {
201
- "epoch": 0.5601659751037344,
202
- "grad_norm": 0.260079562664032,
203
- "learning_rate": 0.0001156595402508126,
204
- "loss": 0.8228,
205
  "step": 2700
206
  },
207
  {
208
- "epoch": 0.5809128630705395,
209
- "grad_norm": 0.19493329524993896,
210
- "learning_rate": 0.00010756972609502109,
211
- "loss": 0.8483,
212
  "step": 2800
213
  },
214
  {
215
- "epoch": 0.6016597510373444,
216
- "grad_norm": 0.4152539372444153,
217
- "learning_rate": 9.942969322189461e-05,
218
- "loss": 0.7983,
219
  "step": 2900
220
  },
221
  {
222
- "epoch": 0.6224066390041494,
223
- "grad_norm": 0.28217726945877075,
224
- "learning_rate": 9.129344385090869e-05,
225
- "loss": 0.8174,
226
  "step": 3000
227
  },
228
  {
229
- "epoch": 0.6431535269709544,
230
- "grad_norm": 0.25863170623779297,
231
- "learning_rate": 8.321495510120858e-05,
232
- "loss": 0.829,
233
  "step": 3100
234
  },
235
  {
236
- "epoch": 0.6639004149377593,
237
- "grad_norm": 0.33922916650772095,
238
- "learning_rate": 7.524782089916924e-05,
239
- "loss": 0.828,
240
  "step": 3200
241
  },
242
  {
243
- "epoch": 0.6846473029045643,
244
- "grad_norm": 0.2833387851715088,
245
- "learning_rate": 6.744489642811432e-05,
246
- "loss": 0.8211,
247
  "step": 3300
248
  },
249
  {
250
- "epoch": 0.7053941908713693,
251
- "grad_norm": 0.44776320457458496,
252
- "learning_rate": 5.9857947478969156e-05,
253
- "loss": 0.8534,
254
  "step": 3400
255
  },
256
  {
257
- "epoch": 0.7261410788381742,
258
- "grad_norm": 0.4343676269054413,
259
- "learning_rate": 5.253730702810957e-05,
260
- "loss": 0.8126,
261
  "step": 3500
262
  },
263
  {
264
- "epoch": 0.7468879668049793,
265
- "grad_norm": 0.2133251577615738,
266
- "learning_rate": 4.553154132072044e-05,
267
- "loss": 0.8514,
268
  "step": 3600
269
  },
270
  {
271
- "epoch": 0.7676348547717843,
272
- "grad_norm": 0.4406150281429291,
273
- "learning_rate": 3.88871276749183e-05,
274
- "loss": 0.8379,
275
  "step": 3700
276
  },
277
  {
278
- "epoch": 0.7883817427385892,
279
- "grad_norm": 0.48208776116371155,
280
- "learning_rate": 3.264814614413537e-05,
281
- "loss": 0.8325,
282
  "step": 3800
283
  },
284
  {
285
- "epoch": 0.8091286307053942,
286
- "grad_norm": 0.2793184220790863,
287
- "learning_rate": 2.68559870833223e-05,
288
- "loss": 0.8116,
289
  "step": 3900
290
  },
291
  {
292
- "epoch": 0.8298755186721992,
293
- "grad_norm": 0.354342520236969,
294
- "learning_rate": 2.154907655902131e-05,
295
- "loss": 0.806,
296
  "step": 4000
297
  },
298
  {
299
- "epoch": 0.8506224066390041,
300
- "grad_norm": 0.2756762206554413,
301
- "learning_rate": 1.6762621424980718e-05,
302
- "loss": 0.8254,
303
  "step": 4100
304
  },
305
  {
306
- "epoch": 0.8713692946058091,
307
- "grad_norm": 0.551670491695404,
308
- "learning_rate": 1.252837575451774e-05,
309
- "loss": 0.8275,
310
  "step": 4200
311
  },
312
  {
313
- "epoch": 0.8921161825726142,
314
- "grad_norm": 0.3675282597541809,
315
- "learning_rate": 8.874430179153592e-06,
316
- "loss": 0.7922,
317
  "step": 4300
318
  },
319
  {
320
- "epoch": 0.9128630705394191,
321
- "grad_norm": 0.24929411709308624,
322
- "learning_rate": 5.825025531079997e-06,
323
- "loss": 0.8274,
324
  "step": 4400
325
  },
326
  {
327
- "epoch": 0.9336099585062241,
328
- "grad_norm": 0.296157568693161,
329
- "learning_rate": 3.4003920257815068e-06,
330
- "loss": 0.8172,
331
  "step": 4500
332
  },
333
  {
334
- "epoch": 0.9543568464730291,
335
- "grad_norm": 0.3886050879955292,
336
- "learning_rate": 1.6166150517008316e-06,
337
- "loss": 0.7862,
338
  "step": 4600
339
  },
340
  {
341
- "epoch": 0.975103734439834,
342
- "grad_norm": 0.21001334488391876,
343
- "learning_rate": 4.85528457318829e-07,
344
- "loss": 0.8318,
345
  "step": 4700
346
  },
347
  {
348
- "epoch": 0.995850622406639,
349
- "grad_norm": 0.42969635128974915,
350
- "learning_rate": 1.4636043598914572e-08,
351
- "loss": 0.8309,
352
  "step": 4800
353
  }
354
  ],
355
  "logging_steps": 100,
356
- "max_steps": 4820,
357
  "num_input_tokens_seen": 0,
358
  "num_train_epochs": 1,
359
  "save_steps": 500,
@@ -369,7 +369,7 @@
369
  "attributes": {}
370
  }
371
  },
372
- "total_flos": 1.298519682425856e+17,
373
  "train_batch_size": 2,
374
  "trial_name": null,
375
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 4825,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0002072538860103627,
13
+ "grad_norm": 0.5270228385925293,
14
+ "learning_rate": 4.140786749482402e-07,
15
+ "loss": 0.7597,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.02072538860103627,
20
+ "grad_norm": 0.3120118975639343,
21
+ "learning_rate": 4.140786749482402e-05,
22
+ "loss": 0.8058,
23
  "step": 100
24
  },
25
  {
26
+ "epoch": 0.04145077720207254,
27
+ "grad_norm": 0.3838038146495819,
28
+ "learning_rate": 8.281573498964804e-05,
29
+ "loss": 0.8085,
30
  "step": 200
31
  },
32
  {
33
+ "epoch": 0.06217616580310881,
34
+ "grad_norm": 0.33677420020103455,
35
+ "learning_rate": 0.00012422360248447205,
36
+ "loss": 0.8452,
37
  "step": 300
38
  },
39
  {
40
+ "epoch": 0.08290155440414508,
41
+ "grad_norm": 0.2833797335624695,
42
+ "learning_rate": 0.00016563146997929608,
43
+ "loss": 0.7836,
44
  "step": 400
45
  },
46
  {
47
+ "epoch": 0.10362694300518134,
48
+ "grad_norm": 0.3029078245162964,
49
+ "learning_rate": 0.0001999924354607825,
50
+ "loss": 0.8032,
51
  "step": 500
52
  },
53
  {
54
+ "epoch": 0.12435233160621761,
55
+ "grad_norm": 0.30878347158432007,
56
+ "learning_rate": 0.00019964190153093613,
57
+ "loss": 0.8034,
58
  "step": 600
59
  },
60
  {
61
+ "epoch": 0.14507772020725387,
62
+ "grad_norm": 0.1962185502052307,
63
+ "learning_rate": 0.00019878125322990773,
64
+ "loss": 0.8069,
65
  "step": 700
66
  },
67
  {
68
+ "epoch": 0.16580310880829016,
69
+ "grad_norm": 0.2113143652677536,
70
+ "learning_rate": 0.00019739761494902327,
71
+ "loss": 0.8239,
72
  "step": 800
73
  },
74
  {
75
+ "epoch": 0.18652849740932642,
76
+ "grad_norm": 0.36496707797050476,
77
+ "learning_rate": 0.00019550431853565577,
78
+ "loss": 0.7952,
79
  "step": 900
80
  },
81
  {
82
+ "epoch": 0.20725388601036268,
83
+ "grad_norm": 0.4117984175682068,
84
+ "learning_rate": 0.00019311127115144138,
85
+ "loss": 0.8159,
86
  "step": 1000
87
  },
88
  {
89
+ "epoch": 0.22797927461139897,
90
+ "grad_norm": 0.25097981095314026,
91
+ "learning_rate": 0.00019023099503382319,
92
+ "loss": 0.8295,
93
  "step": 1100
94
  },
95
  {
96
+ "epoch": 0.24870466321243523,
97
+ "grad_norm": 0.2996521592140198,
98
+ "learning_rate": 0.00018687856197021518,
99
+ "loss": 0.8071,
100
  "step": 1200
101
  },
102
  {
103
+ "epoch": 0.2694300518134715,
104
+ "grad_norm": 0.2897460162639618,
105
+ "learning_rate": 0.0001830715144309886,
106
+ "loss": 0.8338,
107
  "step": 1300
108
  },
109
  {
110
+ "epoch": 0.29015544041450775,
111
+ "grad_norm": 0.5588517785072327,
112
+ "learning_rate": 0.0001788297737739727,
113
+ "loss": 0.7572,
114
  "step": 1400
115
  },
116
  {
117
+ "epoch": 0.31088082901554404,
118
+ "grad_norm": 0.47002148628234863,
119
+ "learning_rate": 0.00017417553600081358,
120
+ "loss": 0.809,
121
  "step": 1500
122
  },
123
  {
124
+ "epoch": 0.3316062176165803,
125
+ "grad_norm": 0.3224999010562897,
126
+ "learning_rate": 0.00016913315561067264,
127
+ "loss": 0.7942,
128
  "step": 1600
129
  },
130
  {
131
+ "epoch": 0.35233160621761656,
132
+ "grad_norm": 0.5584061145782471,
133
+ "learning_rate": 0.0001637290181590304,
134
+ "loss": 0.8329,
135
  "step": 1700
136
  },
137
  {
138
+ "epoch": 0.37305699481865284,
139
+ "grad_norm": 0.3566969633102417,
140
+ "learning_rate": 0.00015799140218846435,
141
+ "loss": 0.7901,
142
  "step": 1800
143
  },
144
  {
145
+ "epoch": 0.39378238341968913,
146
+ "grad_norm": 0.27286580204963684,
147
+ "learning_rate": 0.00015195033125388395,
148
+ "loss": 0.8138,
149
  "step": 1900
150
  },
151
  {
152
+ "epoch": 0.41450777202072536,
153
+ "grad_norm": 0.3148711621761322,
154
+ "learning_rate": 0.00014563741681653824,
155
+ "loss": 0.7867,
156
  "step": 2000
157
  },
158
  {
159
+ "epoch": 0.43523316062176165,
160
+ "grad_norm": 0.3646707832813263,
161
+ "learning_rate": 0.0001390856928288946,
162
+ "loss": 0.7681,
163
  "step": 2100
164
  },
165
  {
166
+ "epoch": 0.45595854922279794,
167
+ "grad_norm": 0.32302677631378174,
168
+ "learning_rate": 0.00013232944287596522,
169
+ "loss": 0.8134,
170
  "step": 2200
171
  },
172
  {
173
+ "epoch": 0.47668393782383417,
174
+ "grad_norm": 0.7468949556350708,
175
+ "learning_rate": 0.000125404020777609,
176
+ "loss": 0.7862,
177
  "step": 2300
178
  },
179
  {
180
+ "epoch": 0.49740932642487046,
181
+ "grad_norm": 0.36616334319114685,
182
+ "learning_rate": 0.00011834566559055394,
183
+ "loss": 0.7675,
184
  "step": 2400
185
  },
186
  {
187
+ "epoch": 0.5181347150259067,
188
+ "grad_norm": 0.3042179346084595,
189
+ "learning_rate": 0.00011119131197818904,
190
+ "loss": 0.7824,
191
  "step": 2500
192
  },
193
  {
194
+ "epoch": 0.538860103626943,
195
+ "grad_norm": 0.6282315850257874,
196
+ "learning_rate": 0.0001039783969404153,
197
+ "loss": 0.8008,
198
  "step": 2600
199
  },
200
  {
201
+ "epoch": 0.5595854922279793,
202
+ "grad_norm": 0.309589147567749,
203
+ "learning_rate": 9.674466391489112e-05,
204
+ "loss": 0.7918,
205
  "step": 2700
206
  },
207
  {
208
+ "epoch": 0.5803108808290155,
209
+ "grad_norm": 0.6195072531700134,
210
+ "learning_rate": 8.952796527476341e-05,
211
+ "loss": 0.7714,
212
  "step": 2800
213
  },
214
  {
215
+ "epoch": 0.6010362694300518,
216
+ "grad_norm": 0.5037879943847656,
217
+ "learning_rate": 8.236606425636553e-05,
218
+ "loss": 0.8098,
219
  "step": 2900
220
  },
221
  {
222
+ "epoch": 0.6217616580310881,
223
+ "grad_norm": 0.3202759325504303,
224
+ "learning_rate": 7.529643735334646e-05,
225
+ "loss": 0.7303,
226
  "step": 3000
227
  },
228
  {
229
+ "epoch": 0.6424870466321243,
230
+ "grad_norm": 0.4311729669570923,
231
+ "learning_rate": 6.835607821125519e-05,
232
+ "loss": 0.7717,
233
  "step": 3100
234
  },
235
  {
236
+ "epoch": 0.6632124352331606,
237
+ "grad_norm": 0.28787970542907715,
238
+ "learning_rate": 6.158130404875231e-05,
239
+ "loss": 0.7748,
240
  "step": 3200
241
  },
242
  {
243
+ "epoch": 0.6839378238341969,
244
+ "grad_norm": 0.28564584255218506,
245
+ "learning_rate": 5.5007565618399506e-05,
246
+ "loss": 0.8161,
247
  "step": 3300
248
  },
249
  {
250
+ "epoch": 0.7046632124352331,
251
+ "grad_norm": 0.5357826352119446,
252
+ "learning_rate": 4.873136932795313e-05,
253
+ "loss": 0.7594,
254
  "step": 3400
255
  },
256
  {
257
+ "epoch": 0.7253886010362695,
258
+ "grad_norm": 0.292579710483551,
259
+ "learning_rate": 4.265882101960175e-05,
260
+ "loss": 0.7572,
261
  "step": 3500
262
  },
263
  {
264
+ "epoch": 0.7461139896373057,
265
+ "grad_norm": 0.29901111125946045,
266
+ "learning_rate": 3.688632521467985e-05,
267
+ "loss": 0.7514,
268
  "step": 3600
269
  },
270
  {
271
+ "epoch": 0.7668393782383419,
272
+ "grad_norm": 0.3171032667160034,
273
+ "learning_rate": 3.144408798582336e-05,
274
+ "loss": 0.7862,
275
  "step": 3700
276
  },
277
  {
278
+ "epoch": 0.7875647668393783,
279
+ "grad_norm": 0.43368250131607056,
280
+ "learning_rate": 2.636058724251739e-05,
281
+ "loss": 0.7994,
282
  "step": 3800
283
  },
284
  {
285
+ "epoch": 0.8082901554404145,
286
+ "grad_norm": 0.4301137924194336,
287
+ "learning_rate": 2.1662423713092516e-05,
288
+ "loss": 0.7646,
289
  "step": 3900
290
  },
291
  {
292
+ "epoch": 0.8290155440414507,
293
+ "grad_norm": 0.20049989223480225,
294
+ "learning_rate": 1.737418174955542e-05,
295
+ "loss": 0.7742,
296
  "step": 4000
297
  },
298
  {
299
+ "epoch": 0.8497409326424871,
300
+ "grad_norm": 0.5148625373840332,
301
+ "learning_rate": 1.3518300683627982e-05,
302
+ "loss": 0.7357,
303
  "step": 4100
304
  },
305
  {
306
+ "epoch": 0.8704663212435233,
307
+ "grad_norm": 0.38820308446884155,
308
+ "learning_rate": 1.011495740715882e-05,
309
+ "loss": 0.7536,
310
  "step": 4200
311
  },
312
  {
313
+ "epoch": 0.8911917098445595,
314
+ "grad_norm": 0.6004698276519775,
315
+ "learning_rate": 7.1819607913342745e-06,
316
+ "loss": 0.7238,
317
  "step": 4300
318
  },
319
  {
320
+ "epoch": 0.9119170984455959,
321
+ "grad_norm": 0.30640721321105957,
322
+ "learning_rate": 4.734658497168276e-06,
323
+ "loss": 0.7669,
324
  "step": 4400
325
  },
326
  {
327
+ "epoch": 0.9326424870466321,
328
+ "grad_norm": 0.23962052166461945,
329
+ "learning_rate": 2.7858566649088814e-06,
330
+ "loss": 0.7329,
331
  "step": 4500
332
  },
333
  {
334
+ "epoch": 0.9533678756476683,
335
+ "grad_norm": 0.2379899024963379,
336
+ "learning_rate": 1.3457529026076777e-06,
337
+ "loss": 0.7845,
338
  "step": 4600
339
  },
340
  {
341
+ "epoch": 0.9740932642487047,
342
+ "grad_norm": 0.291648268699646,
343
+ "learning_rate": 4.218829245063227e-07,
344
+ "loss": 0.773,
345
  "step": 4700
346
  },
347
  {
348
+ "epoch": 0.9948186528497409,
349
+ "grad_norm": 0.7486833930015564,
350
+ "learning_rate": 1.9081118468888824e-08,
351
+ "loss": 0.8057,
352
  "step": 4800
353
  }
354
  ],
355
  "logging_steps": 100,
356
+ "max_steps": 4825,
357
  "num_input_tokens_seen": 0,
358
  "num_train_epochs": 1,
359
  "save_steps": 500,
 
369
  "attributes": {}
370
  }
371
  },
372
+ "total_flos": 1.3151301866366976e+17,
373
  "train_batch_size": 2,
374
  "trial_name": null,
375
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee3f58fe7d64d267d98f39cbabdd040fd1cebc479f214ef573c983f87829e472
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97682e56ca9cafdee97b6d5ed5984ff75c9fc1563ca7b651bdb1cd346643b028
3
  size 5624