iHateNLP committed on
Commit
c8c6047
·
verified ·
1 Parent(s): 3fc19c7

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,8 +1,6 @@
1
  ---
2
  base_model: unsloth/Phi-3.5-mini-instruct
3
  library_name: peft
4
- tags:
5
- - unsloth
6
  ---
7
 
8
  # Model Card for Model ID
 
1
  ---
2
  base_model: unsloth/Phi-3.5-mini-instruct
3
  library_name: peft
 
 
4
  ---
5
 
6
  # Model Card for Model ID
adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "v_proj",
27
  "o_proj",
 
 
28
  "q_proj",
 
29
  "up_proj",
30
- "k_proj",
31
- "gate_proj",
32
- "down_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
26
  "o_proj",
27
+ "down_proj",
28
+ "gate_proj",
29
  "q_proj",
30
+ "v_proj",
31
  "up_proj",
32
+ "k_proj"
 
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bc341f37cfdca38ef584943c397692be182677a2a5fe708a7c3cbba6a571241
3
  size 119597408
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ee6d6904f3eb2d56b4d0fcad79a1f8b5298ac233d5a72f0e31d8e5135566c43
3
  size 119597408
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f343725d809da4ffaf684a244a4100eecaa26a54d067b93505a4ea66f68b319
3
  size 61227348
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83438eaa9a3c3886c6e226e34f99c9abf71eeff99962b4cc773061653827afbd
3
  size 61227348
scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57acdb1f05c51c1c33f6ebed56a52d6cf566d8a89c725b24e17cf2ddc63a1a1f
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d847fa8b77307ec3f023cf0a917c492c209c15c78747caba0466134193ab7151
3
  size 988
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c65471f2a90c88eeb3b60677a5a2bfebe038a7e13f123565e2531cb20a70783a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1148de79a5a57cf5420c6b78e00ef69a6c7893d5da37cb4fa60ac3f84fedb35
3
  size 1064
trainer_state.json CHANGED
@@ -1,359 +1,360 @@
1
  {
 
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 4825,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0002072538860103627,
13
- "grad_norm": 0.5270228385925293,
14
- "learning_rate": 4.140786749482402e-07,
15
- "loss": 0.7597,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.02072538860103627,
20
- "grad_norm": 0.3120118975639343,
21
- "learning_rate": 4.140786749482402e-05,
22
- "loss": 0.8058,
23
  "step": 100
24
  },
25
  {
26
- "epoch": 0.04145077720207254,
27
- "grad_norm": 0.3838038146495819,
28
- "learning_rate": 8.281573498964804e-05,
29
- "loss": 0.8085,
30
  "step": 200
31
  },
32
  {
33
- "epoch": 0.06217616580310881,
34
- "grad_norm": 0.33677420020103455,
35
- "learning_rate": 0.00012422360248447205,
36
- "loss": 0.8452,
37
  "step": 300
38
  },
39
  {
40
- "epoch": 0.08290155440414508,
41
- "grad_norm": 0.2833797335624695,
42
- "learning_rate": 0.00016563146997929608,
43
- "loss": 0.7836,
44
  "step": 400
45
  },
46
  {
47
- "epoch": 0.10362694300518134,
48
- "grad_norm": 0.3029078245162964,
49
- "learning_rate": 0.0001999924354607825,
50
- "loss": 0.8032,
51
  "step": 500
52
  },
53
  {
54
- "epoch": 0.12435233160621761,
55
- "grad_norm": 0.30878347158432007,
56
- "learning_rate": 0.00019964190153093613,
57
- "loss": 0.8034,
58
  "step": 600
59
  },
60
  {
61
- "epoch": 0.14507772020725387,
62
- "grad_norm": 0.1962185502052307,
63
- "learning_rate": 0.00019878125322990773,
64
- "loss": 0.8069,
65
  "step": 700
66
  },
67
  {
68
- "epoch": 0.16580310880829016,
69
- "grad_norm": 0.2113143652677536,
70
- "learning_rate": 0.00019739761494902327,
71
- "loss": 0.8239,
72
  "step": 800
73
  },
74
  {
75
- "epoch": 0.18652849740932642,
76
- "grad_norm": 0.36496707797050476,
77
- "learning_rate": 0.00019550431853565577,
78
- "loss": 0.7952,
79
  "step": 900
80
  },
81
  {
82
- "epoch": 0.20725388601036268,
83
- "grad_norm": 0.4117984175682068,
84
- "learning_rate": 0.00019311127115144138,
85
- "loss": 0.8159,
86
  "step": 1000
87
  },
88
  {
89
- "epoch": 0.22797927461139897,
90
- "grad_norm": 0.25097981095314026,
91
- "learning_rate": 0.00019023099503382319,
92
- "loss": 0.8295,
93
  "step": 1100
94
  },
95
  {
96
- "epoch": 0.24870466321243523,
97
- "grad_norm": 0.2996521592140198,
98
- "learning_rate": 0.00018687856197021518,
99
- "loss": 0.8071,
100
  "step": 1200
101
  },
102
  {
103
- "epoch": 0.2694300518134715,
104
- "grad_norm": 0.2897460162639618,
105
- "learning_rate": 0.0001830715144309886,
106
- "loss": 0.8338,
107
  "step": 1300
108
  },
109
  {
110
- "epoch": 0.29015544041450775,
111
- "grad_norm": 0.5588517785072327,
112
- "learning_rate": 0.0001788297737739727,
113
- "loss": 0.7572,
114
  "step": 1400
115
  },
116
  {
117
- "epoch": 0.31088082901554404,
118
- "grad_norm": 0.47002148628234863,
119
- "learning_rate": 0.00017417553600081358,
120
- "loss": 0.809,
121
  "step": 1500
122
  },
123
  {
124
- "epoch": 0.3316062176165803,
125
- "grad_norm": 0.3224999010562897,
126
- "learning_rate": 0.00016913315561067264,
127
- "loss": 0.7942,
128
  "step": 1600
129
  },
130
  {
131
- "epoch": 0.35233160621761656,
132
- "grad_norm": 0.5584061145782471,
133
- "learning_rate": 0.0001637290181590304,
134
- "loss": 0.8329,
135
  "step": 1700
136
  },
137
  {
138
- "epoch": 0.37305699481865284,
139
- "grad_norm": 0.3566969633102417,
140
- "learning_rate": 0.00015799140218846435,
141
- "loss": 0.7901,
142
  "step": 1800
143
  },
144
  {
145
- "epoch": 0.39378238341968913,
146
- "grad_norm": 0.27286580204963684,
147
- "learning_rate": 0.00015195033125388395,
148
- "loss": 0.8138,
149
  "step": 1900
150
  },
151
  {
152
- "epoch": 0.41450777202072536,
153
- "grad_norm": 0.3148711621761322,
154
- "learning_rate": 0.00014563741681653824,
155
- "loss": 0.7867,
156
  "step": 2000
157
  },
158
  {
159
- "epoch": 0.43523316062176165,
160
- "grad_norm": 0.3646707832813263,
161
- "learning_rate": 0.0001390856928288946,
162
- "loss": 0.7681,
163
  "step": 2100
164
  },
165
  {
166
- "epoch": 0.45595854922279794,
167
- "grad_norm": 0.32302677631378174,
168
- "learning_rate": 0.00013232944287596522,
169
- "loss": 0.8134,
170
  "step": 2200
171
  },
172
  {
173
- "epoch": 0.47668393782383417,
174
- "grad_norm": 0.7468949556350708,
175
- "learning_rate": 0.000125404020777609,
176
- "loss": 0.7862,
177
  "step": 2300
178
  },
179
  {
180
- "epoch": 0.49740932642487046,
181
- "grad_norm": 0.36616334319114685,
182
- "learning_rate": 0.00011834566559055394,
183
- "loss": 0.7675,
184
  "step": 2400
185
  },
186
  {
187
- "epoch": 0.5181347150259067,
188
- "grad_norm": 0.3042179346084595,
189
- "learning_rate": 0.00011119131197818904,
190
- "loss": 0.7824,
191
  "step": 2500
192
  },
193
  {
194
- "epoch": 0.538860103626943,
195
- "grad_norm": 0.6282315850257874,
196
- "learning_rate": 0.0001039783969404153,
197
- "loss": 0.8008,
198
  "step": 2600
199
  },
200
  {
201
- "epoch": 0.5595854922279793,
202
- "grad_norm": 0.309589147567749,
203
- "learning_rate": 9.674466391489112e-05,
204
- "loss": 0.7918,
205
  "step": 2700
206
  },
207
  {
208
- "epoch": 0.5803108808290155,
209
- "grad_norm": 0.6195072531700134,
210
- "learning_rate": 8.952796527476341e-05,
211
- "loss": 0.7714,
212
  "step": 2800
213
  },
214
  {
215
- "epoch": 0.6010362694300518,
216
- "grad_norm": 0.5037879943847656,
217
- "learning_rate": 8.236606425636553e-05,
218
- "loss": 0.8098,
219
  "step": 2900
220
  },
221
  {
222
- "epoch": 0.6217616580310881,
223
- "grad_norm": 0.3202759325504303,
224
- "learning_rate": 7.529643735334646e-05,
225
- "loss": 0.7303,
226
  "step": 3000
227
  },
228
  {
229
- "epoch": 0.6424870466321243,
230
- "grad_norm": 0.4311729669570923,
231
- "learning_rate": 6.835607821125519e-05,
232
- "loss": 0.7717,
233
  "step": 3100
234
  },
235
  {
236
- "epoch": 0.6632124352331606,
237
- "grad_norm": 0.28787970542907715,
238
- "learning_rate": 6.158130404875231e-05,
239
- "loss": 0.7748,
240
  "step": 3200
241
  },
242
  {
243
- "epoch": 0.6839378238341969,
244
- "grad_norm": 0.28564584255218506,
245
- "learning_rate": 5.5007565618399506e-05,
246
- "loss": 0.8161,
247
  "step": 3300
248
  },
249
  {
250
- "epoch": 0.7046632124352331,
251
- "grad_norm": 0.5357826352119446,
252
- "learning_rate": 4.873136932795313e-05,
253
- "loss": 0.7594,
254
  "step": 3400
255
  },
256
  {
257
- "epoch": 0.7253886010362695,
258
- "grad_norm": 0.292579710483551,
259
- "learning_rate": 4.265882101960175e-05,
260
- "loss": 0.7572,
261
  "step": 3500
262
  },
263
  {
264
- "epoch": 0.7461139896373057,
265
- "grad_norm": 0.29901111125946045,
266
- "learning_rate": 3.688632521467985e-05,
267
- "loss": 0.7514,
268
  "step": 3600
269
  },
270
  {
271
- "epoch": 0.7668393782383419,
272
- "grad_norm": 0.3171032667160034,
273
- "learning_rate": 3.144408798582336e-05,
274
- "loss": 0.7862,
275
  "step": 3700
276
  },
277
  {
278
- "epoch": 0.7875647668393783,
279
- "grad_norm": 0.43368250131607056,
280
- "learning_rate": 2.636058724251739e-05,
281
- "loss": 0.7994,
282
  "step": 3800
283
  },
284
  {
285
- "epoch": 0.8082901554404145,
286
- "grad_norm": 0.4301137924194336,
287
- "learning_rate": 2.1662423713092516e-05,
288
- "loss": 0.7646,
289
  "step": 3900
290
  },
291
  {
292
- "epoch": 0.8290155440414507,
293
- "grad_norm": 0.20049989223480225,
294
- "learning_rate": 1.737418174955542e-05,
295
- "loss": 0.7742,
296
  "step": 4000
297
  },
298
  {
299
- "epoch": 0.8497409326424871,
300
- "grad_norm": 0.5148625373840332,
301
- "learning_rate": 1.3518300683627982e-05,
302
- "loss": 0.7357,
303
  "step": 4100
304
  },
305
  {
306
- "epoch": 0.8704663212435233,
307
- "grad_norm": 0.38820308446884155,
308
- "learning_rate": 1.011495740715882e-05,
309
- "loss": 0.7536,
310
  "step": 4200
311
  },
312
  {
313
- "epoch": 0.8911917098445595,
314
- "grad_norm": 0.6004698276519775,
315
- "learning_rate": 7.1819607913342745e-06,
316
- "loss": 0.7238,
317
  "step": 4300
318
  },
319
  {
320
- "epoch": 0.9119170984455959,
321
- "grad_norm": 0.30640721321105957,
322
- "learning_rate": 4.734658497168276e-06,
323
- "loss": 0.7669,
324
  "step": 4400
325
  },
326
  {
327
- "epoch": 0.9326424870466321,
328
- "grad_norm": 0.23962052166461945,
329
- "learning_rate": 2.7858566649088814e-06,
330
- "loss": 0.7329,
331
  "step": 4500
332
  },
333
  {
334
- "epoch": 0.9533678756476683,
335
- "grad_norm": 0.2379899024963379,
336
- "learning_rate": 1.3457529026076777e-06,
337
- "loss": 0.7845,
338
  "step": 4600
339
  },
340
  {
341
- "epoch": 0.9740932642487047,
342
- "grad_norm": 0.291648268699646,
343
- "learning_rate": 4.218829245063227e-07,
344
- "loss": 0.773,
345
  "step": 4700
346
  },
347
  {
348
- "epoch": 0.9948186528497409,
349
- "grad_norm": 0.7486833930015564,
350
- "learning_rate": 1.9081118468888824e-08,
351
- "loss": 0.8057,
352
  "step": 4800
353
  }
354
  ],
355
  "logging_steps": 100,
356
- "max_steps": 4825,
357
  "num_input_tokens_seen": 0,
358
  "num_train_epochs": 1,
359
  "save_steps": 500,
@@ -369,7 +370,7 @@
369
  "attributes": {}
370
  }
371
  },
372
- "total_flos": 1.3151301866366976e+17,
373
  "train_batch_size": 2,
374
  "trial_name": null,
375
  "trial_params": null
 
1
  {
2
+ "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 4821,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.00020742584526031943,
14
+ "grad_norm": 0.572188675403595,
15
+ "learning_rate": 0.0,
16
+ "loss": 0.9048,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.020742584526031945,
21
+ "grad_norm": 0.8375779390335083,
22
+ "learning_rate": 2.0518134715025907e-05,
23
+ "loss": 0.6388,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.04148516905206389,
28
+ "grad_norm": 0.80202317237854,
29
+ "learning_rate": 4.1243523316062174e-05,
30
+ "loss": 0.5854,
31
  "step": 200
32
  },
33
  {
34
+ "epoch": 0.06222775357809583,
35
+ "grad_norm": 0.7062171101570129,
36
+ "learning_rate": 6.196891191709845e-05,
37
+ "loss": 0.4707,
38
  "step": 300
39
  },
40
  {
41
+ "epoch": 0.08297033810412778,
42
+ "grad_norm": 0.644052267074585,
43
+ "learning_rate": 8.269430051813471e-05,
44
+ "loss": 0.3773,
45
  "step": 400
46
  },
47
  {
48
+ "epoch": 0.10371292263015972,
49
+ "grad_norm": 0.6500552296638489,
50
+ "learning_rate": 0.000103419689119171,
51
+ "loss": 0.339,
52
  "step": 500
53
  },
54
  {
55
+ "epoch": 0.12445550715619166,
56
+ "grad_norm": 0.9881173968315125,
57
+ "learning_rate": 0.00012414507772020726,
58
+ "loss": 0.3264,
59
  "step": 600
60
  },
61
  {
62
+ "epoch": 0.1451980916822236,
63
+ "grad_norm": 0.8607994318008423,
64
+ "learning_rate": 0.00014487046632124352,
65
+ "loss": 0.3395,
66
  "step": 700
67
  },
68
  {
69
+ "epoch": 0.16594067620825556,
70
+ "grad_norm": 1.1204837560653687,
71
+ "learning_rate": 0.0001655958549222798,
72
+ "loss": 0.356,
73
  "step": 800
74
  },
75
  {
76
+ "epoch": 0.18668326073428748,
77
+ "grad_norm": 1.4915101528167725,
78
+ "learning_rate": 0.00018632124352331608,
79
+ "loss": 0.3399,
80
  "step": 900
81
  },
82
  {
83
+ "epoch": 0.20742584526031943,
84
+ "grad_norm": 1.2340389490127563,
85
+ "learning_rate": 0.00019996163583391267,
86
+ "loss": 0.371,
87
  "step": 1000
88
  },
89
  {
90
+ "epoch": 0.22816842978635138,
91
+ "grad_norm": 0.657108724117279,
92
+ "learning_rate": 0.00019940464789344699,
93
+ "loss": 0.3402,
94
  "step": 1100
95
  },
96
  {
97
+ "epoch": 0.24891101431238333,
98
+ "grad_norm": 1.1355221271514893,
99
+ "learning_rate": 0.00019818819435915685,
100
+ "loss": 0.3604,
101
  "step": 1200
102
  },
103
  {
104
+ "epoch": 0.2696535988384153,
105
+ "grad_norm": 0.8293834924697876,
106
+ "learning_rate": 0.00019632034536930397,
107
+ "loss": 0.355,
108
  "step": 1300
109
  },
110
  {
111
+ "epoch": 0.2903961833644472,
112
+ "grad_norm": 1.1846222877502441,
113
+ "learning_rate": 0.00019381349251894317,
114
+ "loss": 0.3562,
115
  "step": 1400
116
  },
117
  {
118
+ "epoch": 0.3111387678904792,
119
+ "grad_norm": 0.7575041055679321,
120
+ "learning_rate": 0.0001906842666521912,
121
+ "loss": 0.3853,
122
  "step": 1500
123
  },
124
  {
125
+ "epoch": 0.3318813524165111,
126
+ "grad_norm": 0.8805419206619263,
127
+ "learning_rate": 0.0001869534275306705,
128
+ "loss": 0.3789,
129
  "step": 1600
130
  },
131
  {
132
+ "epoch": 0.352623936942543,
133
+ "grad_norm": 0.5712432861328125,
134
+ "learning_rate": 0.00018264572611008452,
135
+ "loss": 0.4197,
136
  "step": 1700
137
  },
138
  {
139
+ "epoch": 0.37336652146857496,
140
+ "grad_norm": 1.414759874343872,
141
+ "learning_rate": 0.00017778974033860325,
142
+ "loss": 0.3831,
143
  "step": 1800
144
  },
145
  {
146
+ "epoch": 0.3941091059946069,
147
+ "grad_norm": 1.0402040481567383,
148
+ "learning_rate": 0.00017241768556639647,
149
+ "loss": 0.4039,
150
  "step": 1900
151
  },
152
  {
153
+ "epoch": 0.41485169052063886,
154
+ "grad_norm": 0.68588787317276,
155
+ "learning_rate": 0.0001665652008240878,
156
+ "loss": 0.3909,
157
  "step": 2000
158
  },
159
  {
160
+ "epoch": 0.4355942750466708,
161
+ "grad_norm": 0.967073380947113,
162
+ "learning_rate": 0.00016027111238799057,
163
+ "loss": 0.4252,
164
  "step": 2100
165
  },
166
  {
167
+ "epoch": 0.45633685957270276,
168
+ "grad_norm": 1.229313850402832,
169
+ "learning_rate": 0.00015357717620066938,
170
+ "loss": 0.4296,
171
  "step": 2200
172
  },
173
  {
174
+ "epoch": 0.4770794440987347,
175
+ "grad_norm": 1.2722011804580688,
176
+ "learning_rate": 0.00014652780085564921,
177
+ "loss": 0.4027,
178
  "step": 2300
179
  },
180
  {
181
+ "epoch": 0.49782202862476665,
182
+ "grad_norm": 0.9985523819923401,
183
+ "learning_rate": 0.00013916975298403346,
184
+ "loss": 0.4733,
185
  "step": 2400
186
  },
187
  {
188
+ "epoch": 0.5185646131507986,
189
+ "grad_norm": 1.0977421998977661,
190
+ "learning_rate": 0.00013155184699754102,
191
+ "loss": 0.4848,
192
  "step": 2500
193
  },
194
  {
195
+ "epoch": 0.5393071976768306,
196
+ "grad_norm": 0.9423943758010864,
197
+ "learning_rate": 0.00012372462124625452,
198
+ "loss": 0.4491,
199
  "step": 2600
200
  },
201
  {
202
+ "epoch": 0.5600497822028625,
203
+ "grad_norm": 1.0384944677352905,
204
+ "learning_rate": 0.00011574000273949858,
205
+ "loss": 0.4421,
206
  "step": 2700
207
  },
208
  {
209
+ "epoch": 0.5807923667288944,
210
+ "grad_norm": 0.6461535692214966,
211
+ "learning_rate": 0.00010765096265414077,
212
+ "loss": 0.4887,
213
  "step": 2800
214
  },
215
  {
216
+ "epoch": 0.6015349512549264,
217
+ "grad_norm": 0.7776329517364502,
218
+ "learning_rate": 9.95111649157258e-05,
219
+ "loss": 0.5306,
220
  "step": 2900
221
  },
222
  {
223
+ "epoch": 0.6222775357809583,
224
+ "grad_norm": 0.6103058457374573,
225
+ "learning_rate": 9.137461018380963e-05,
226
+ "loss": 0.4847,
227
  "step": 3000
228
  },
229
  {
230
+ "epoch": 0.6430201203069903,
231
+ "grad_norm": 1.384641170501709,
232
+ "learning_rate": 8.329527760334861e-05,
233
+ "loss": 0.5293,
234
  "step": 3100
235
  },
236
  {
237
+ "epoch": 0.6637627048330222,
238
+ "grad_norm": 1.2251664400100708,
239
+ "learning_rate": 7.532676669881955e-05,
240
+ "loss": 0.55,
241
  "step": 3200
242
  },
243
  {
244
+ "epoch": 0.6845052893590542,
245
+ "grad_norm": 0.5400375127792358,
246
+ "learning_rate": 6.752194178680041e-05,
247
+ "loss": 0.5329,
248
  "step": 3300
249
  },
250
  {
251
+ "epoch": 0.705247873885086,
252
+ "grad_norm": 1.0325515270233154,
253
+ "learning_rate": 5.9932581266031694e-05,
254
+ "loss": 0.5638,
255
  "step": 3400
256
  },
257
  {
258
+ "epoch": 0.725990458411118,
259
+ "grad_norm": 0.4699115455150604,
260
+ "learning_rate": 5.2680792652421385e-05,
261
+ "loss": 0.5527,
262
  "step": 3500
263
  },
264
  {
265
+ "epoch": 0.7467330429371499,
266
+ "grad_norm": 0.5923639535903931,
267
+ "learning_rate": 4.5668266493661425e-05,
268
+ "loss": 0.5647,
269
  "step": 3600
270
  },
271
  {
272
+ "epoch": 0.7674756274631819,
273
+ "grad_norm": 1.6123884916305542,
274
+ "learning_rate": 3.901618534083994e-05,
275
+ "loss": 0.6208,
276
  "step": 3700
277
  },
278
  {
279
+ "epoch": 0.7882182119892138,
280
+ "grad_norm": 0.882792055606842,
281
+ "learning_rate": 3.2768680114799956e-05,
282
+ "loss": 0.585,
283
  "step": 3800
284
  },
285
  {
286
+ "epoch": 0.8089607965152458,
287
+ "grad_norm": 0.8842360973358154,
288
+ "learning_rate": 2.696719771798648e-05,
289
+ "loss": 0.6046,
290
  "step": 3900
291
  },
292
  {
293
+ "epoch": 0.8297033810412777,
294
+ "grad_norm": 0.9587863087654114,
295
+ "learning_rate": 2.1650226069374525e-05,
296
+ "loss": 0.6127,
297
  "step": 4000
298
  },
299
  {
300
+ "epoch": 0.8504459655673097,
301
+ "grad_norm": 0.6551477909088135,
302
+ "learning_rate": 1.6853038769745467e-05,
303
+ "loss": 0.6291,
304
  "step": 4100
305
  },
306
  {
307
+ "epoch": 0.8711885500933416,
308
+ "grad_norm": 0.7264061570167542,
309
+ "learning_rate": 1.2607461091239803e-05,
310
+ "loss": 0.6627,
311
  "step": 4200
312
  },
313
  {
314
+ "epoch": 0.8919311346193736,
315
+ "grad_norm": 0.40014514327049255,
316
+ "learning_rate": 8.941658843648237e-06,
317
+ "loss": 0.6575,
318
  "step": 4300
319
  },
320
  {
321
+ "epoch": 0.9126737191454055,
322
+ "grad_norm": 1.0279369354248047,
323
+ "learning_rate": 5.879951518134263e-06,
324
+ "loss": 0.7132,
325
  "step": 4400
326
  },
327
  {
328
+ "epoch": 0.9334163036714375,
329
+ "grad_norm": 0.559190034866333,
330
+ "learning_rate": 3.4426509480207646e-06,
331
+ "loss": 0.6866,
332
  "step": 4500
333
  },
334
  {
335
+ "epoch": 0.9541588881974694,
336
+ "grad_norm": 1.0593820810317993,
337
+ "learning_rate": 1.6459265569902738e-06,
338
+ "loss": 0.6781,
339
  "step": 4600
340
  },
341
  {
342
+ "epoch": 0.9749014727235014,
343
+ "grad_norm": 0.7888472080230713,
344
+ "learning_rate": 5.016980886622169e-07,
345
+ "loss": 0.7098,
346
  "step": 4700
347
  },
348
  {
349
+ "epoch": 0.9956440572495333,
350
+ "grad_norm": 1.0247892141342163,
351
+ "learning_rate": 1.755652919597228e-08,
352
+ "loss": 0.6915,
353
  "step": 4800
354
  }
355
  ],
356
  "logging_steps": 100,
357
+ "max_steps": 4821,
358
  "num_input_tokens_seen": 0,
359
  "num_train_epochs": 1,
360
  "save_steps": 500,
 
370
  "attributes": {}
371
  }
372
  },
373
+ "total_flos": 1.2957373058936832e+17,
374
  "train_batch_size": 2,
375
  "trial_name": null,
376
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97682e56ca9cafdee97b6d5ed5984ff75c9fc1563ca7b651bdb1cd346643b028
3
- size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:621a556d1da2455eed4e6cd8bccd4f42a7c4222b1175c833beedf9b39ece534d
3
+ size 5560