iHateNLP committed on
Commit
753a64e
·
verified ·
1 Parent(s): a1ca017

Training Examples: 10-20k

Browse files
adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "k_proj",
27
  "up_proj",
 
28
  "down_proj",
 
 
29
  "o_proj",
30
- "q_proj",
31
- "gate_proj",
32
- "v_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
26
  "up_proj",
27
+ "q_proj",
28
  "down_proj",
29
+ "v_proj",
30
+ "k_proj",
31
  "o_proj",
32
+ "gate_proj"
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e5825365e8f35ad2be0fba4551dd0bb1a89c29127006b550f42dbe48103b58c
3
  size 159967880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7484e5ee8793b6a9d815a49cb85f1c89dceb2e005e94a3dbc5624f35beeabc5b
3
  size 159967880
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6572214ae96790947225891855adcce0a29f6be3999bd0eb846418c1764c449e
3
  size 81730644
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f880ede74f46f09098edd37f094e2664bc1894201387437a09efebd43caab9ad
3
  size 81730644
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c801982aae9be06d302403c1fff693e53dedf89c1d3b689ee29fedad84a96d23
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c7c5a23cf686c3374465b35601286066a0bb9c35d2746ea32f4f663c7337c6a
3
  size 988
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5836d5feabbfa4a8a0dfb2d2daf51efcfd7a4705772f290bfa6fbba9841feacb
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:950d7b202a8b603a9f2053cab9955a76c4fc0bb96742fce47216d10adb705966
3
  size 1064
trainer_state.json CHANGED
@@ -3,357 +3,357 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 4849,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.00020622808826562179,
13
- "grad_norm": 0.23989851772785187,
14
  "learning_rate": 2.061855670103093e-07,
15
- "loss": 0.3507,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.02062280882656218,
20
- "grad_norm": 0.46579787135124207,
21
  "learning_rate": 2.0618556701030927e-05,
22
- "loss": 0.6141,
23
  "step": 100
24
  },
25
  {
26
- "epoch": 0.04124561765312436,
27
- "grad_norm": 0.20079994201660156,
28
  "learning_rate": 4.1237113402061855e-05,
29
- "loss": 0.6044,
30
  "step": 200
31
  },
32
  {
33
- "epoch": 0.06186842647968653,
34
- "grad_norm": 0.33593931794166565,
35
  "learning_rate": 6.185567010309279e-05,
36
- "loss": 0.5863,
37
  "step": 300
38
  },
39
  {
40
- "epoch": 0.08249123530624872,
41
- "grad_norm": 0.5399625897407532,
42
  "learning_rate": 8.247422680412371e-05,
43
- "loss": 0.5629,
44
  "step": 400
45
  },
46
  {
47
- "epoch": 0.10311404413281089,
48
- "grad_norm": 0.5116047859191895,
49
  "learning_rate": 0.00010309278350515463,
50
- "loss": 0.4852,
51
  "step": 500
52
  },
53
  {
54
- "epoch": 0.12373685295937306,
55
- "grad_norm": 0.39057251811027527,
56
  "learning_rate": 0.00012371134020618558,
57
- "loss": 0.5038,
58
  "step": 600
59
  },
60
  {
61
- "epoch": 0.14435966178593523,
62
- "grad_norm": 0.35319826006889343,
63
  "learning_rate": 0.0001443298969072165,
64
- "loss": 0.4892,
65
  "step": 700
66
  },
67
  {
68
- "epoch": 0.16498247061249743,
69
- "grad_norm": 0.2662217319011688,
70
  "learning_rate": 0.00016494845360824742,
71
- "loss": 0.5089,
72
  "step": 800
73
  },
74
  {
75
- "epoch": 0.1856052794390596,
76
- "grad_norm": 0.535358190536499,
77
  "learning_rate": 0.00018556701030927837,
78
- "loss": 0.4885,
79
  "step": 900
80
  },
81
  {
82
- "epoch": 0.20622808826562178,
83
- "grad_norm": 0.2716768682003021,
84
- "learning_rate": 0.00019997048441912246,
85
- "loss": 0.5124,
86
  "step": 1000
87
  },
88
  {
89
- "epoch": 0.22685089709218395,
90
- "grad_norm": 0.4777716100215912,
91
- "learning_rate": 0.00019944624754044668,
92
- "loss": 0.4911,
93
  "step": 1100
94
  },
95
  {
96
- "epoch": 0.24747370591874612,
97
- "grad_norm": 0.2679508626461029,
98
- "learning_rate": 0.00019827006532530193,
99
- "loss": 0.4793,
100
  "step": 1200
101
  },
102
  {
103
- "epoch": 0.2680965147453083,
104
- "grad_norm": 0.4718882739543915,
105
- "learning_rate": 0.00019644964853733152,
106
- "loss": 0.4712,
107
  "step": 1300
108
  },
109
  {
110
- "epoch": 0.28871932357187047,
111
- "grad_norm": 0.2879483699798584,
112
- "learning_rate": 0.00019399693138486107,
113
- "loss": 0.5119,
114
  "step": 1400
115
  },
116
  {
117
- "epoch": 0.30934213239843267,
118
- "grad_norm": 0.5847165584564209,
119
- "learning_rate": 0.0001909279932831403,
120
- "loss": 0.4973,
121
  "step": 1500
122
  },
123
  {
124
- "epoch": 0.32996494122499487,
125
- "grad_norm": 0.3073234558105469,
126
- "learning_rate": 0.0001872629534416197,
127
- "loss": 0.4996,
128
  "step": 1600
129
  },
130
  {
131
- "epoch": 0.350587750051557,
132
- "grad_norm": 0.2527044713497162,
133
- "learning_rate": 0.00018302583896732187,
134
- "loss": 0.4805,
135
  "step": 1700
136
  },
137
  {
138
- "epoch": 0.3712105588781192,
139
- "grad_norm": 0.2731375992298126,
140
- "learning_rate": 0.00017824442734898997,
141
- "loss": 0.4934,
142
  "step": 1800
143
  },
144
  {
145
- "epoch": 0.39183336770468136,
146
- "grad_norm": 0.4609294533729553,
147
- "learning_rate": 0.00017295006435464848,
148
- "loss": 0.4947,
149
  "step": 1900
150
  },
151
  {
152
- "epoch": 0.41245617653124356,
153
- "grad_norm": 0.4430903494358063,
154
- "learning_rate": 0.0001671774585363957,
155
- "loss": 0.4581,
156
  "step": 2000
157
  },
158
  {
159
- "epoch": 0.43307898535780576,
160
- "grad_norm": 0.39363059401512146,
161
- "learning_rate": 0.00016096445368960415,
162
- "loss": 0.4923,
163
  "step": 2100
164
  },
165
  {
166
- "epoch": 0.4537017941843679,
167
- "grad_norm": 0.3113062083721161,
168
- "learning_rate": 0.000154351780758231,
169
- "loss": 0.5157,
170
  "step": 2200
171
  },
172
  {
173
- "epoch": 0.4743246030109301,
174
- "grad_norm": 0.5598001480102539,
175
- "learning_rate": 0.00014738279081268692,
176
- "loss": 0.4948,
177
  "step": 2300
178
  },
179
  {
180
- "epoch": 0.49494741183749225,
181
- "grad_norm": 0.5283440351486206,
182
- "learning_rate": 0.00014010317085079503,
183
- "loss": 0.4905,
184
  "step": 2400
185
  },
186
  {
187
- "epoch": 0.5155702206640544,
188
- "grad_norm": 0.6192511916160583,
189
- "learning_rate": 0.00013256064428497966,
190
- "loss": 0.4947,
191
  "step": 2500
192
  },
193
  {
194
- "epoch": 0.5361930294906166,
195
- "grad_norm": 0.385551393032074,
196
- "learning_rate": 0.00012480465807921773,
197
- "loss": 0.5283,
198
  "step": 2600
199
  },
200
  {
201
- "epoch": 0.5568158383171788,
202
- "grad_norm": 0.40230363607406616,
203
- "learning_rate": 0.00011688605858680692,
204
- "loss": 0.5069,
205
  "step": 2700
206
  },
207
  {
208
- "epoch": 0.5774386471437409,
209
- "grad_norm": 0.4681544005870819,
210
- "learning_rate": 0.00010885675821407844,
211
- "loss": 0.5028,
212
  "step": 2800
213
  },
214
  {
215
- "epoch": 0.5980614559703031,
216
- "grad_norm": 0.35956326127052307,
217
- "learning_rate": 0.00010076939509532679,
218
- "loss": 0.5358,
219
  "step": 2900
220
  },
221
  {
222
- "epoch": 0.6186842647968653,
223
- "grad_norm": 0.4002954065799713,
224
- "learning_rate": 9.267698801004341e-05,
225
- "loss": 0.4941,
226
  "step": 3000
227
  },
228
  {
229
- "epoch": 0.6393070736234275,
230
- "grad_norm": 0.5405189990997314,
231
- "learning_rate": 8.463258880473373e-05,
232
- "loss": 0.5329,
233
  "step": 3100
234
  },
235
  {
236
- "epoch": 0.6599298824499897,
237
- "grad_norm": 0.3197394609451294,
238
- "learning_rate": 7.668893459795486e-05,
239
- "loss": 0.5125,
240
  "step": 3200
241
  },
242
  {
243
- "epoch": 0.6805526912765518,
244
- "grad_norm": 0.3258844017982483,
245
- "learning_rate": 6.889810204863274e-05,
246
- "loss": 0.5353,
247
  "step": 3300
248
  },
249
  {
250
- "epoch": 0.701175500103114,
251
- "grad_norm": 0.4742829203605652,
252
- "learning_rate": 6.131116595419178e-05,
253
- "loss": 0.5276,
254
  "step": 3400
255
  },
256
  {
257
- "epoch": 0.7217983089296762,
258
- "grad_norm": 0.2836833894252777,
259
- "learning_rate": 5.397786441664373e-05,
260
- "loss": 0.5076,
261
  "step": 3500
262
  },
263
  {
264
- "epoch": 0.7424211177562384,
265
- "grad_norm": 0.3171875476837158,
266
- "learning_rate": 4.6946272771725984e-05,
267
- "loss": 0.5163,
268
  "step": 3600
269
  },
270
  {
271
- "epoch": 0.7630439265828006,
272
- "grad_norm": 0.24590329825878143,
273
- "learning_rate": 4.026248841872946e-05,
274
- "loss": 0.5259,
275
  "step": 3700
276
  },
277
  {
278
- "epoch": 0.7836667354093627,
279
- "grad_norm": 0.6508501768112183,
280
- "learning_rate": 3.397032861719556e-05,
281
- "loss": 0.5589,
282
  "step": 3800
283
  },
284
  {
285
- "epoch": 0.8042895442359249,
286
- "grad_norm": 0.4867592751979828,
287
- "learning_rate": 2.811104323165301e-05,
288
- "loss": 0.5646,
289
  "step": 3900
290
  },
291
  {
292
- "epoch": 0.8249123530624871,
293
- "grad_norm": 0.312364399433136,
294
- "learning_rate": 2.2723044307569775e-05,
295
- "loss": 0.5721,
296
  "step": 4000
297
  },
298
  {
299
- "epoch": 0.8455351618890493,
300
- "grad_norm": 0.4810425341129303,
301
- "learning_rate": 1.7887852500751822e-05,
302
- "loss": 0.5181,
303
  "step": 4100
304
  },
305
  {
306
- "epoch": 0.8661579707156115,
307
- "grad_norm": 0.3198936879634857,
308
- "learning_rate": 1.3539539439376515e-05,
309
- "loss": 0.5756,
310
  "step": 4200
311
  },
312
  {
313
- "epoch": 0.8867807795421736,
314
- "grad_norm": 0.4786675274372101,
315
- "learning_rate": 9.75804006323886e-06,
316
- "loss": 0.5487,
317
  "step": 4300
318
  },
319
  {
320
- "epoch": 0.9074035883687358,
321
- "grad_norm": 0.4790056347846985,
322
- "learning_rate": 6.568144959657263e-06,
323
- "loss": 0.5506,
324
  "step": 4400
325
  },
326
  {
327
- "epoch": 0.928026397195298,
328
- "grad_norm": 0.587523877620697,
329
- "learning_rate": 4.013449264074187e-06,
330
- "loss": 0.5741,
331
  "step": 4500
332
  },
333
  {
334
- "epoch": 0.9486492060218602,
335
- "grad_norm": 0.41147923469543457,
336
- "learning_rate": 2.059119419840494e-06,
337
- "loss": 0.5414,
338
  "step": 4600
339
  },
340
  {
341
- "epoch": 0.9692720148484224,
342
- "grad_norm": 0.5501502752304077,
343
- "learning_rate": 7.468660935561755e-07,
344
- "loss": 0.6101,
345
  "step": 4700
346
  },
347
  {
348
- "epoch": 0.9898948236749845,
349
- "grad_norm": 0.45654186606407166,
350
- "learning_rate": 8.529209787123682e-08,
351
- "loss": 0.5645,
352
  "step": 4800
353
  }
354
  ],
355
  "logging_steps": 100,
356
- "max_steps": 4849,
357
  "num_input_tokens_seen": 0,
358
  "num_train_epochs": 1,
359
  "save_steps": 500,
@@ -369,7 +369,7 @@
369
  "attributes": {}
370
  }
371
  },
372
- "total_flos": 2.3135703548790374e+17,
373
  "train_batch_size": 2,
374
  "trial_name": null,
375
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 4850,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0002061855670103093,
13
+ "grad_norm": 0.5331169366836548,
14
  "learning_rate": 2.061855670103093e-07,
15
+ "loss": 0.5976,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.020618556701030927,
20
+ "grad_norm": 0.42875760793685913,
21
  "learning_rate": 2.0618556701030927e-05,
22
+ "loss": 0.5552,
23
  "step": 100
24
  },
25
  {
26
+ "epoch": 0.041237113402061855,
27
+ "grad_norm": 0.44043704867362976,
28
  "learning_rate": 4.1237113402061855e-05,
29
+ "loss": 0.5911,
30
  "step": 200
31
  },
32
  {
33
+ "epoch": 0.061855670103092786,
34
+ "grad_norm": 0.39276885986328125,
35
  "learning_rate": 6.185567010309279e-05,
36
+ "loss": 0.5676,
37
  "step": 300
38
  },
39
  {
40
+ "epoch": 0.08247422680412371,
41
+ "grad_norm": 0.3910420835018158,
42
  "learning_rate": 8.247422680412371e-05,
43
+ "loss": 0.5516,
44
  "step": 400
45
  },
46
  {
47
+ "epoch": 0.10309278350515463,
48
+ "grad_norm": 0.3953075408935547,
49
  "learning_rate": 0.00010309278350515463,
50
+ "loss": 0.5785,
51
  "step": 500
52
  },
53
  {
54
+ "epoch": 0.12371134020618557,
55
+ "grad_norm": 0.38498565554618835,
56
  "learning_rate": 0.00012371134020618558,
57
+ "loss": 0.6043,
58
  "step": 600
59
  },
60
  {
61
+ "epoch": 0.14432989690721648,
62
+ "grad_norm": 0.36671698093414307,
63
  "learning_rate": 0.0001443298969072165,
64
+ "loss": 0.579,
65
  "step": 700
66
  },
67
  {
68
+ "epoch": 0.16494845360824742,
69
+ "grad_norm": 0.3693369925022125,
70
  "learning_rate": 0.00016494845360824742,
71
+ "loss": 0.5443,
72
  "step": 800
73
  },
74
  {
75
+ "epoch": 0.18556701030927836,
76
+ "grad_norm": 0.3997150659561157,
77
  "learning_rate": 0.00018556701030927837,
78
+ "loss": 0.5798,
79
  "step": 900
80
  },
81
  {
82
+ "epoch": 0.20618556701030927,
83
+ "grad_norm": 0.4547862708568573,
84
+ "learning_rate": 0.0001999704996306308,
85
+ "loss": 0.6046,
86
  "step": 1000
87
  },
88
  {
89
+ "epoch": 0.2268041237113402,
90
+ "grad_norm": 0.34885653853416443,
91
+ "learning_rate": 0.00019944653267951504,
92
+ "loss": 0.5694,
93
  "step": 1100
94
  },
95
  {
96
+ "epoch": 0.24742268041237114,
97
+ "grad_norm": 0.5895730257034302,
98
+ "learning_rate": 0.00019827095435098925,
99
+ "loss": 0.5858,
100
  "step": 1200
101
  },
102
  {
103
+ "epoch": 0.26804123711340205,
104
+ "grad_norm": 0.3080294132232666,
105
+ "learning_rate": 0.0001964514674798659,
106
+ "loss": 0.592,
107
  "step": 1300
108
  },
109
  {
110
+ "epoch": 0.28865979381443296,
111
+ "grad_norm": 0.2972188889980316,
112
+ "learning_rate": 0.00019399999403380266,
113
+ "loss": 0.5726,
114
  "step": 1400
115
  },
116
  {
117
+ "epoch": 0.30927835051546393,
118
+ "grad_norm": 0.28474223613739014,
119
+ "learning_rate": 0.00019093259699605125,
120
+ "loss": 0.5729,
121
  "step": 1500
122
  },
123
  {
124
+ "epoch": 0.32989690721649484,
125
+ "grad_norm": 0.39205971360206604,
126
+ "learning_rate": 0.00018726937511470246,
127
+ "loss": 0.5815,
128
  "step": 1600
129
  },
130
  {
131
+ "epoch": 0.35051546391752575,
132
+ "grad_norm": 0.43681225180625916,
133
+ "learning_rate": 0.0001830343312080704,
134
+ "loss": 0.5555,
135
  "step": 1700
136
  },
137
  {
138
+ "epoch": 0.3711340206185567,
139
+ "grad_norm": 0.3381674885749817,
140
+ "learning_rate": 0.0001782552148891283,
141
+ "loss": 0.5404,
142
  "step": 1800
143
  },
144
  {
145
+ "epoch": 0.3917525773195876,
146
+ "grad_norm": 0.3750503361225128,
147
+ "learning_rate": 0.00017296334073952605,
148
+ "loss": 0.5795,
149
  "step": 1900
150
  },
151
  {
152
+ "epoch": 0.41237113402061853,
153
+ "grad_norm": 0.45220425724983215,
154
+ "learning_rate": 0.00016719338312458124,
155
+ "loss": 0.5602,
156
  "step": 2000
157
  },
158
  {
159
+ "epoch": 0.4329896907216495,
160
+ "grad_norm": 0.43047139048576355,
161
+ "learning_rate": 0.00016098314899369446,
162
+ "loss": 0.5571,
163
  "step": 2100
164
  },
165
  {
166
+ "epoch": 0.4536082474226804,
167
+ "grad_norm": 0.7484344840049744,
168
+ "learning_rate": 0.00015437333015488587,
169
+ "loss": 0.5526,
170
  "step": 2200
171
  },
172
  {
173
+ "epoch": 0.4742268041237113,
174
+ "grad_norm": 0.39876288175582886,
175
+ "learning_rate": 0.00014747851305720256,
176
+ "loss": 0.5495,
177
  "step": 2300
178
  },
179
  {
180
+ "epoch": 0.4948453608247423,
181
+ "grad_norm": 0.2668340802192688,
182
+ "learning_rate": 0.00014020466278830452,
183
+ "loss": 0.5603,
184
  "step": 2400
185
  },
186
  {
187
+ "epoch": 0.5154639175257731,
188
+ "grad_norm": 0.2944701611995697,
189
+ "learning_rate": 0.0001326673763292055,
190
+ "loss": 0.5728,
191
  "step": 2500
192
  },
193
  {
194
+ "epoch": 0.5360824742268041,
195
+ "grad_norm": 0.5645434856414795,
196
+ "learning_rate": 0.0001249160408378004,
197
+ "loss": 0.552,
198
  "step": 2600
199
  },
200
  {
201
+ "epoch": 0.5567010309278351,
202
+ "grad_norm": 0.3050793707370758,
203
+ "learning_rate": 0.0001170014460023793,
204
+ "loss": 0.5735,
205
  "step": 2700
206
  },
207
  {
208
+ "epoch": 0.5773195876288659,
209
+ "grad_norm": 0.4726704955101013,
210
+ "learning_rate": 0.00010897545124833783,
211
+ "loss": 0.5672,
212
  "step": 2800
213
  },
214
  {
215
+ "epoch": 0.5979381443298969,
216
+ "grad_norm": 0.5890568494796753,
217
+ "learning_rate": 0.00010089064593556474,
218
+ "loss": 0.5377,
219
  "step": 2900
220
  },
221
  {
222
+ "epoch": 0.6185567010309279,
223
+ "grad_norm": 0.33398109674453735,
224
+ "learning_rate": 9.280000477302173e-05,
225
+ "loss": 0.5624,
226
  "step": 3000
227
  },
228
  {
229
+ "epoch": 0.6391752577319587,
230
+ "grad_norm": 0.43066734075546265,
231
+ "learning_rate": 8.48365683413172e-05,
232
+ "loss": 0.5712,
233
  "step": 3100
234
  },
235
  {
236
+ "epoch": 0.6597938144329897,
237
+ "grad_norm": 0.41674667596817017,
238
+ "learning_rate": 7.689172737117389e-05,
239
+ "loss": 0.5458,
240
  "step": 3200
241
  },
242
  {
243
+ "epoch": 0.6804123711340206,
244
+ "grad_norm": 0.4635225832462311,
245
+ "learning_rate": 6.909830056250527e-05,
246
+ "loss": 0.5449,
247
  "step": 3300
248
  },
249
  {
250
+ "epoch": 0.7010309278350515,
251
+ "grad_norm": 1.0104267597198486,
252
+ "learning_rate": 6.15073534018669e-05,
253
+ "loss": 0.5475,
254
  "step": 3400
255
  },
256
  {
257
+ "epoch": 0.7216494845360825,
258
+ "grad_norm": 0.24354267120361328,
259
+ "learning_rate": 5.416862465241033e-05,
260
+ "loss": 0.516,
261
  "step": 3500
262
  },
263
  {
264
+ "epoch": 0.7422680412371134,
265
+ "grad_norm": 0.500800609588623,
266
+ "learning_rate": 4.7130200446653475e-05,
267
+ "loss": 0.5138,
268
  "step": 3600
269
  },
270
  {
271
+ "epoch": 0.7628865979381443,
272
+ "grad_norm": 0.26592689752578735,
273
+ "learning_rate": 4.043819920791322e-05,
274
+ "loss": 0.5543,
275
  "step": 3700
276
  },
277
  {
278
+ "epoch": 0.7835051546391752,
279
+ "grad_norm": 0.549505352973938,
280
+ "learning_rate": 3.4136469464914575e-05,
281
+ "loss": 0.5576,
282
  "step": 3800
283
  },
284
  {
285
+ "epoch": 0.8041237113402062,
286
+ "grad_norm": 0.38464662432670593,
287
+ "learning_rate": 2.8266302539609745e-05,
288
+ "loss": 0.5578,
289
  "step": 3900
290
  },
291
  {
292
+ "epoch": 0.8247422680412371,
293
+ "grad_norm": 0.5664867758750916,
294
+ "learning_rate": 2.2866161990785228e-05,
295
+ "loss": 0.5433,
296
  "step": 4000
297
  },
298
  {
299
+ "epoch": 0.845360824742268,
300
+ "grad_norm": 0.3998001217842102,
301
+ "learning_rate": 1.7971431586244815e-05,
302
+ "loss": 0.557,
303
  "step": 4100
304
  },
305
  {
306
+ "epoch": 0.865979381443299,
307
+ "grad_norm": 0.3722541332244873,
308
+ "learning_rate": 1.3614183454950824e-05,
309
+ "loss": 0.5021,
310
  "step": 4200
311
  },
312
  {
313
+ "epoch": 0.8865979381443299,
314
+ "grad_norm": 0.7536567449569702,
315
+ "learning_rate": 9.822967938278171e-06,
316
+ "loss": 0.547,
317
  "step": 4300
318
  },
319
  {
320
+ "epoch": 0.9072164948453608,
321
+ "grad_norm": 0.18647919595241547,
322
+ "learning_rate": 6.622626517355557e-06,
323
+ "loss": 0.5207,
324
  "step": 4400
325
  },
326
  {
327
+ "epoch": 0.9278350515463918,
328
+ "grad_norm": 0.4656095504760742,
329
+ "learning_rate": 4.034129042265066e-06,
330
+ "loss": 0.5498,
331
  "step": 4500
332
  },
333
  {
334
+ "epoch": 0.9484536082474226,
335
+ "grad_norm": 0.544163167476654,
336
+ "learning_rate": 2.074436329635687e-06,
337
+ "loss": 0.5319,
338
  "step": 4600
339
  },
340
  {
341
+ "epoch": 0.9690721649484536,
342
+ "grad_norm": 0.3850492537021637,
343
+ "learning_rate": 7.563890289437825e-07,
344
+ "loss": 0.5569,
345
  "step": 4700
346
  },
347
  {
348
+ "epoch": 0.9896907216494846,
349
+ "grad_norm": 0.5212803483009338,
350
+ "learning_rate": 8.862348571043733e-08,
351
+ "loss": 0.5224,
352
  "step": 4800
353
  }
354
  ],
355
  "logging_steps": 100,
356
+ "max_steps": 4850,
357
  "num_input_tokens_seen": 0,
358
  "num_train_epochs": 1,
359
  "save_steps": 500,
 
369
  "attributes": {}
370
  }
371
  },
372
+ "total_flos": 2.346791597607813e+17,
373
  "train_batch_size": 2,
374
  "trial_name": null,
375
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcdb031000815991decb49e64c1ff52b41b328f05666ceaf823e770ffa5bde97
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9693ae99b42c29fd67ede275c9a27879ef5bb7e8261941cc89ec0f3ccdba0aaf
3
  size 5624