superbigtree committed on
Commit
ea20dac
·
verified ·
1 Parent(s): 6b73994

Model save

Browse files
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-1.5B-Instruct
3
- datasets: HuggingFaceH4/Bespoke-Stratos-17k
4
  library_name: transformers
5
  model_name: Qwen2.5-1.5B-Open-R1-Distill
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - sft
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen2.5-1.5B-Open-R1-Distill
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the [HuggingFaceH4/Bespoke-Stratos-17k](https://huggingface.co/datasets/HuggingFaceH4/Bespoke-Stratos-17k) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/xiaojia-song12/huggingface/runs/977ijn5q)
33
 
34
 
35
  This model was trained with SFT.
@@ -38,7 +36,7 @@ This model was trained with SFT.
38
 
39
  - TRL: 0.16.0.dev0
40
  - Transformers: 4.49.0.dev0
41
- - Pytorch: 2.7.0a0+git3a58512
42
  - Datasets: 3.2.0
43
  - Tokenizers: 0.21.0
44
 
 
1
  ---
2
  base_model: Qwen/Qwen2.5-1.5B-Instruct
 
3
  library_name: transformers
4
  model_name: Qwen2.5-1.5B-Open-R1-Distill
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - sft
9
  licence: license
 
11
 
12
  # Model Card for Qwen2.5-1.5B-Open-R1-Distill
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/xiaojia-song12/huggingface/runs/v7vrup4x)
31
 
32
 
33
  This model was trained with SFT.
 
36
 
37
  - TRL: 0.16.0.dev0
38
  - Transformers: 4.49.0.dev0
39
+ - Pytorch: 2.5.1+gitabbfe77
40
  - Datasets: 3.2.0
41
  - Tokenizers: 0.21.0
42
 
all_results.json CHANGED
@@ -5,9 +5,9 @@
5
  "eval_samples_per_second": 56.996,
6
  "eval_steps_per_second": 2.209,
7
  "total_flos": 6.966137809639834e+17,
8
- "train_loss": 0.8673680914929632,
9
- "train_runtime": 795.9713,
10
  "train_samples": 16610,
11
- "train_samples_per_second": 27.166,
12
- "train_steps_per_second": 0.212
13
  }
 
5
  "eval_samples_per_second": 56.996,
6
  "eval_steps_per_second": 2.209,
7
  "total_flos": 6.966137809639834e+17,
8
+ "train_loss": 0.8669900795411781,
9
+ "train_runtime": 827.3439,
10
  "train_samples": 16610,
11
+ "train_samples_per_second": 26.135,
12
+ "train_steps_per_second": 0.204
13
  }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0.dev0",
26
- "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0.dev0",
26
+ "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:43f380d7be9586da69712b78466b8c4247dc743323c393ed4e447e8943bead8b
3
  size 3087467144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c02f702f829ce0c3e115d2150edd0d77022a7f318b9ce882181fe7559afb4a8
3
  size 3087467144
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 6.966137809639834e+17,
3
- "train_loss": 0.8673680914929632,
4
- "train_runtime": 795.9713,
5
  "train_samples": 16610,
6
- "train_samples_per_second": 27.166,
7
- "train_steps_per_second": 0.212
8
  }
 
1
  {
2
  "total_flos": 6.966137809639834e+17,
3
+ "train_loss": 0.8669900795411781,
4
+ "train_runtime": 827.3439,
5
  "train_samples": 16610,
6
+ "train_samples_per_second": 26.135,
7
+ "train_steps_per_second": 0.204
8
  }
trainer_state.json CHANGED
@@ -13,7 +13,7 @@
13
  "grad_norm": 2.40625,
14
  "learning_rate": 5.882352941176471e-06,
15
  "loss": 1.1011,
16
- "mean_token_accuracy": 0.7099346258467769,
17
  "step": 5
18
  },
19
  {
@@ -21,7 +21,7 @@
21
  "grad_norm": 1.46875,
22
  "learning_rate": 1.1764705882352942e-05,
23
  "loss": 1.0735,
24
- "mean_token_accuracy": 0.7143598902988348,
25
  "step": 10
26
  },
27
  {
@@ -29,15 +29,15 @@
29
  "grad_norm": 0.93359375,
30
  "learning_rate": 1.7647058823529414e-05,
31
  "loss": 1.0246,
32
- "mean_token_accuracy": 0.7213492548758127,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.11834319526627218,
37
  "grad_norm": 0.61328125,
38
  "learning_rate": 1.9980782984658682e-05,
39
- "loss": 0.9456,
40
- "mean_token_accuracy": 0.7365624101438417,
41
  "step": 20
42
  },
43
  {
@@ -45,15 +45,15 @@
45
  "grad_norm": 0.51171875,
46
  "learning_rate": 1.9863613034027224e-05,
47
  "loss": 0.9101,
48
- "mean_token_accuracy": 0.7418235078195066,
49
  "step": 25
50
  },
51
  {
52
  "epoch": 0.17751479289940827,
53
- "grad_norm": 0.423828125,
54
  "learning_rate": 1.9641197940012136e-05,
55
- "loss": 0.8958,
56
- "mean_token_accuracy": 0.7445377548880543,
57
  "step": 30
58
  },
59
  {
@@ -61,7 +61,7 @@
61
  "grad_norm": 0.408203125,
62
  "learning_rate": 1.9315910880512792e-05,
63
  "loss": 0.8774,
64
- "mean_token_accuracy": 0.7473939577517185,
65
  "step": 35
66
  },
67
  {
@@ -69,227 +69,227 @@
69
  "grad_norm": 0.390625,
70
  "learning_rate": 1.8891222681391853e-05,
71
  "loss": 0.8628,
72
- "mean_token_accuracy": 0.7510309574773866,
73
  "step": 40
74
  },
75
  {
76
  "epoch": 0.26627218934911245,
77
- "grad_norm": 4.75,
78
  "learning_rate": 1.8371664782625287e-05,
79
  "loss": 0.8562,
80
- "mean_token_accuracy": 0.7519657291179588,
81
  "step": 45
82
  },
83
  {
84
  "epoch": 0.2958579881656805,
85
- "grad_norm": 0.322265625,
86
  "learning_rate": 1.7762780887657576e-05,
87
- "loss": 0.8526,
88
- "mean_token_accuracy": 0.7526042764494737,
89
  "step": 50
90
  },
91
  {
92
  "epoch": 0.3254437869822485,
93
- "grad_norm": 0.33984375,
94
  "learning_rate": 1.7071067811865477e-05,
95
- "loss": 0.8506,
96
- "mean_token_accuracy": 0.7529147428713211,
97
  "step": 55
98
  },
99
  {
100
  "epoch": 0.35502958579881655,
101
- "grad_norm": 0.349609375,
102
  "learning_rate": 1.6303906161279554e-05,
103
- "loss": 0.8627,
104
- "mean_token_accuracy": 0.7494820701814584,
105
  "step": 60
106
  },
107
  {
108
  "epoch": 0.38461538461538464,
109
  "grad_norm": 0.32421875,
110
  "learning_rate": 1.5469481581224274e-05,
111
- "loss": 0.8496,
112
- "mean_token_accuracy": 0.7523080532405824,
113
  "step": 65
114
  },
115
  {
116
  "epoch": 0.41420118343195267,
117
- "grad_norm": 0.3359375,
118
  "learning_rate": 1.4576697415156818e-05,
119
- "loss": 0.8422,
120
- "mean_token_accuracy": 0.7546321496781302,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.4437869822485207,
125
  "grad_norm": 0.3125,
126
  "learning_rate": 1.3635079705638298e-05,
127
- "loss": 0.842,
128
- "mean_token_accuracy": 0.7544247004239197,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 0.47337278106508873,
133
  "grad_norm": 0.322265625,
134
  "learning_rate": 1.2654675551080724e-05,
135
- "loss": 0.8453,
136
- "mean_token_accuracy": 0.7530717828212506,
137
  "step": 80
138
  },
139
  {
140
  "epoch": 0.5029585798816568,
141
  "grad_norm": 0.330078125,
142
  "learning_rate": 1.164594590280734e-05,
143
- "loss": 0.8455,
144
- "mean_token_accuracy": 0.7532910964218957,
145
  "step": 85
146
  },
147
  {
148
  "epoch": 0.5325443786982249,
149
  "grad_norm": 0.333984375,
150
  "learning_rate": 1.0619653946285948e-05,
151
- "loss": 0.8396,
152
- "mean_token_accuracy": 0.7545698270600318,
153
  "step": 90
154
  },
155
  {
156
  "epoch": 0.5621301775147929,
157
  "grad_norm": 0.353515625,
158
  "learning_rate": 9.586750257511868e-06,
159
- "loss": 0.8353,
160
- "mean_token_accuracy": 0.7558067680588229,
161
  "step": 95
162
  },
163
  {
164
  "epoch": 0.591715976331361,
165
- "grad_norm": 0.32421875,
166
  "learning_rate": 8.558255959926533e-06,
167
- "loss": 0.8265,
168
- "mean_token_accuracy": 0.7575928127928315,
169
  "step": 100
170
  },
171
  {
172
  "epoch": 0.591715976331361,
173
- "eval_loss": 0.858591616153717,
174
- "eval_mean_token_accuracy": 0.7338160377061327,
175
- "eval_runtime": 2.2821,
176
- "eval_samples_per_second": 56.526,
177
- "eval_steps_per_second": 2.191,
178
  "step": 100
179
  },
180
  {
181
  "epoch": 0.621301775147929,
182
  "grad_norm": 0.3203125,
183
  "learning_rate": 7.545145128592009e-06,
184
- "loss": 0.8284,
185
- "mean_token_accuracy": 0.7576165643011782,
186
  "step": 105
187
  },
188
  {
189
  "epoch": 0.650887573964497,
190
  "grad_norm": 0.314453125,
191
  "learning_rate": 6.558227696373617e-06,
192
- "loss": 0.8497,
193
- "mean_token_accuracy": 0.7519416205438242,
194
  "step": 110
195
  },
196
  {
197
  "epoch": 0.6804733727810651,
198
  "grad_norm": 0.322265625,
199
  "learning_rate": 5.608034111526298e-06,
200
- "loss": 0.8356,
201
- "mean_token_accuracy": 0.7557316956442416,
202
  "step": 115
203
  },
204
  {
205
  "epoch": 0.7100591715976331,
206
  "grad_norm": 0.328125,
207
  "learning_rate": 4.704702977392914e-06,
208
- "loss": 0.8229,
209
- "mean_token_accuracy": 0.7593895998895455,
210
  "step": 120
211
  },
212
  {
213
  "epoch": 0.7396449704142012,
214
  "grad_norm": 0.314453125,
215
  "learning_rate": 3.857872873103322e-06,
216
- "loss": 0.8322,
217
- "mean_token_accuracy": 0.756736495558991,
218
  "step": 125
219
  },
220
  {
221
  "epoch": 0.7692307692307693,
222
  "grad_norm": 0.326171875,
223
  "learning_rate": 3.0765795095517026e-06,
224
- "loss": 0.8367,
225
- "mean_token_accuracy": 0.7552458819622366,
226
  "step": 130
227
  },
228
  {
229
  "epoch": 0.7988165680473372,
230
  "grad_norm": 0.3125,
231
  "learning_rate": 2.369159318001937e-06,
232
- "loss": 0.8431,
233
- "mean_token_accuracy": 0.753608536029534,
234
  "step": 135
235
  },
236
  {
237
  "epoch": 0.8284023668639053,
238
  "grad_norm": 0.318359375,
239
  "learning_rate": 1.743160500034443e-06,
240
- "loss": 0.8345,
241
- "mean_token_accuracy": 0.7563751731445558,
242
  "step": 140
243
  },
244
  {
245
  "epoch": 0.8579881656804734,
246
- "grad_norm": 0.330078125,
247
  "learning_rate": 1.2052624879351105e-06,
248
- "loss": 0.8346,
249
- "mean_token_accuracy": 0.7562260551940483,
250
  "step": 145
251
  },
252
  {
253
  "epoch": 0.8875739644970414,
254
  "grad_norm": 0.30859375,
255
  "learning_rate": 7.612046748871327e-07,
256
- "loss": 0.8249,
257
- "mean_token_accuracy": 0.7586945500381791,
258
  "step": 150
259
  },
260
  {
261
  "epoch": 0.9171597633136095,
262
  "grad_norm": 0.31640625,
263
  "learning_rate": 4.1572517541747294e-07,
264
- "loss": 0.8153,
265
- "mean_token_accuracy": 0.7613143505011888,
266
  "step": 155
267
  },
268
  {
269
  "epoch": 0.9467455621301775,
270
  "grad_norm": 0.322265625,
271
  "learning_rate": 1.7251026952640583e-07,
272
- "loss": 0.8284,
273
- "mean_token_accuracy": 0.7578062563797692,
274
  "step": 160
275
  },
276
  {
277
  "epoch": 0.9763313609467456,
278
  "grad_norm": 0.3125,
279
  "learning_rate": 3.4155069933301535e-08,
280
- "loss": 0.8211,
281
- "mean_token_accuracy": 0.759621495589895,
282
  "step": 165
283
  },
284
  {
285
  "epoch": 1.0,
286
- "mean_token_accuracy": 0.7546903746827489,
287
  "step": 169,
288
  "total_flos": 6.966137809639834e+17,
289
- "train_loss": 0.8673680914929632,
290
- "train_runtime": 795.9713,
291
- "train_samples_per_second": 27.166,
292
- "train_steps_per_second": 0.212
293
  }
294
  ],
295
  "logging_steps": 5,
 
13
  "grad_norm": 2.40625,
14
  "learning_rate": 5.882352941176471e-06,
15
  "loss": 1.1011,
16
+ "mean_token_accuracy": 0.7099674616006506,
17
  "step": 5
18
  },
19
  {
 
21
  "grad_norm": 1.46875,
22
  "learning_rate": 1.1764705882352942e-05,
23
  "loss": 1.0735,
24
+ "mean_token_accuracy": 0.7143687536382451,
25
  "step": 10
26
  },
27
  {
 
29
  "grad_norm": 0.93359375,
30
  "learning_rate": 1.7647058823529414e-05,
31
  "loss": 1.0246,
32
+ "mean_token_accuracy": 0.7213007797302202,
33
  "step": 15
34
  },
35
  {
36
  "epoch": 0.11834319526627218,
37
  "grad_norm": 0.61328125,
38
  "learning_rate": 1.9980782984658682e-05,
39
+ "loss": 0.9457,
40
+ "mean_token_accuracy": 0.7365057897688324,
41
  "step": 20
42
  },
43
  {
 
45
  "grad_norm": 0.51171875,
46
  "learning_rate": 1.9863613034027224e-05,
47
  "loss": 0.9101,
48
+ "mean_token_accuracy": 0.7418338168299983,
49
  "step": 25
50
  },
51
  {
52
  "epoch": 0.17751479289940827,
53
+ "grad_norm": 0.42578125,
54
  "learning_rate": 1.9641197940012136e-05,
55
+ "loss": 0.8957,
56
+ "mean_token_accuracy": 0.7444947984183858,
57
  "step": 30
58
  },
59
  {
 
61
  "grad_norm": 0.408203125,
62
  "learning_rate": 1.9315910880512792e-05,
63
  "loss": 0.8774,
64
+ "mean_token_accuracy": 0.7473870896289159,
65
  "step": 35
66
  },
67
  {
 
69
  "grad_norm": 0.390625,
70
  "learning_rate": 1.8891222681391853e-05,
71
  "loss": 0.8628,
72
+ "mean_token_accuracy": 0.7510183565770613,
73
  "step": 40
74
  },
75
  {
76
  "epoch": 0.26627218934911245,
77
+ "grad_norm": 0.35546875,
78
  "learning_rate": 1.8371664782625287e-05,
79
  "loss": 0.8562,
80
+ "mean_token_accuracy": 0.7520073491134827,
81
  "step": 45
82
  },
83
  {
84
  "epoch": 0.2958579881656805,
85
+ "grad_norm": 0.32421875,
86
  "learning_rate": 1.7762780887657576e-05,
87
+ "loss": 0.8523,
88
+ "mean_token_accuracy": 0.7526434348741664,
89
  "step": 50
90
  },
91
  {
92
  "epoch": 0.3254437869822485,
93
+ "grad_norm": 0.341796875,
94
  "learning_rate": 1.7071067811865477e-05,
95
+ "loss": 0.8499,
96
+ "mean_token_accuracy": 0.7530957012263093,
97
  "step": 55
98
  },
99
  {
100
  "epoch": 0.35502958579881655,
101
+ "grad_norm": 0.34765625,
102
  "learning_rate": 1.6303906161279554e-05,
103
+ "loss": 0.862,
104
+ "mean_token_accuracy": 0.7495981366725619,
105
  "step": 60
106
  },
107
  {
108
  "epoch": 0.38461538461538464,
109
  "grad_norm": 0.32421875,
110
  "learning_rate": 1.5469481581224274e-05,
111
+ "loss": 0.8489,
112
+ "mean_token_accuracy": 0.7524725022168722,
113
  "step": 65
114
  },
115
  {
116
  "epoch": 0.41420118343195267,
117
+ "grad_norm": 0.337890625,
118
  "learning_rate": 1.4576697415156818e-05,
119
+ "loss": 0.8416,
120
+ "mean_token_accuracy": 0.7547405747257215,
121
  "step": 70
122
  },
123
  {
124
  "epoch": 0.4437869822485207,
125
  "grad_norm": 0.3125,
126
  "learning_rate": 1.3635079705638298e-05,
127
+ "loss": 0.8415,
128
+ "mean_token_accuracy": 0.7544626406760742,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 0.47337278106508873,
133
  "grad_norm": 0.322265625,
134
  "learning_rate": 1.2654675551080724e-05,
135
+ "loss": 0.8448,
136
+ "mean_token_accuracy": 0.7532455186182244,
137
  "step": 80
138
  },
139
  {
140
  "epoch": 0.5029585798816568,
141
  "grad_norm": 0.330078125,
142
  "learning_rate": 1.164594590280734e-05,
143
+ "loss": 0.845,
144
+ "mean_token_accuracy": 0.7534193771926723,
145
  "step": 85
146
  },
147
  {
148
  "epoch": 0.5325443786982249,
149
  "grad_norm": 0.333984375,
150
  "learning_rate": 1.0619653946285948e-05,
151
+ "loss": 0.8391,
152
+ "mean_token_accuracy": 0.7546228916518883,
153
  "step": 90
154
  },
155
  {
156
  "epoch": 0.5621301775147929,
157
  "grad_norm": 0.353515625,
158
  "learning_rate": 9.586750257511868e-06,
159
+ "loss": 0.8349,
160
+ "mean_token_accuracy": 0.7557750389015123,
161
  "step": 95
162
  },
163
  {
164
  "epoch": 0.591715976331361,
165
+ "grad_norm": 0.322265625,
166
  "learning_rate": 8.558255959926533e-06,
167
+ "loss": 0.826,
168
+ "mean_token_accuracy": 0.7576405338899699,
169
  "step": 100
170
  },
171
  {
172
  "epoch": 0.591715976331361,
173
+ "eval_loss": 0.858128011226654,
174
+ "eval_mean_token_accuracy": 0.7344105051298153,
175
+ "eval_runtime": 2.3145,
176
+ "eval_samples_per_second": 55.735,
177
+ "eval_steps_per_second": 2.16,
178
  "step": 100
179
  },
180
  {
181
  "epoch": 0.621301775147929,
182
  "grad_norm": 0.3203125,
183
  "learning_rate": 7.545145128592009e-06,
184
+ "loss": 0.8279,
185
+ "mean_token_accuracy": 0.7577147828861608,
186
  "step": 105
187
  },
188
  {
189
  "epoch": 0.650887573964497,
190
  "grad_norm": 0.314453125,
191
  "learning_rate": 6.558227696373617e-06,
192
+ "loss": 0.8491,
193
+ "mean_token_accuracy": 0.7520351508799501,
194
  "step": 110
195
  },
196
  {
197
  "epoch": 0.6804733727810651,
198
  "grad_norm": 0.322265625,
199
  "learning_rate": 5.608034111526298e-06,
200
+ "loss": 0.8351,
201
+ "mean_token_accuracy": 0.7557561284875786,
202
  "step": 115
203
  },
204
  {
205
  "epoch": 0.7100591715976331,
206
  "grad_norm": 0.328125,
207
  "learning_rate": 4.704702977392914e-06,
208
+ "loss": 0.8224,
209
+ "mean_token_accuracy": 0.7594720709034851,
210
  "step": 120
211
  },
212
  {
213
  "epoch": 0.7396449704142012,
214
  "grad_norm": 0.314453125,
215
  "learning_rate": 3.857872873103322e-06,
216
+ "loss": 0.8318,
217
+ "mean_token_accuracy": 0.7568701053574788,
218
  "step": 125
219
  },
220
  {
221
  "epoch": 0.7692307692307693,
222
  "grad_norm": 0.326171875,
223
  "learning_rate": 3.0765795095517026e-06,
224
+ "loss": 0.8362,
225
+ "mean_token_accuracy": 0.755340183100174,
226
  "step": 130
227
  },
228
  {
229
  "epoch": 0.7988165680473372,
230
  "grad_norm": 0.3125,
231
  "learning_rate": 2.369159318001937e-06,
232
+ "loss": 0.8427,
233
+ "mean_token_accuracy": 0.7536929231517968,
234
  "step": 135
235
  },
236
  {
237
  "epoch": 0.8284023668639053,
238
  "grad_norm": 0.318359375,
239
  "learning_rate": 1.743160500034443e-06,
240
+ "loss": 0.834,
241
+ "mean_token_accuracy": 0.756549677941094,
242
  "step": 140
243
  },
244
  {
245
  "epoch": 0.8579881656804734,
246
+ "grad_norm": 0.328125,
247
  "learning_rate": 1.2052624879351105e-06,
248
+ "loss": 0.8341,
249
+ "mean_token_accuracy": 0.7563524633707277,
250
  "step": 145
251
  },
252
  {
253
  "epoch": 0.8875739644970414,
254
  "grad_norm": 0.30859375,
255
  "learning_rate": 7.612046748871327e-07,
256
+ "loss": 0.8244,
257
+ "mean_token_accuracy": 0.7588491746092811,
258
  "step": 150
259
  },
260
  {
261
  "epoch": 0.9171597633136095,
262
  "grad_norm": 0.31640625,
263
  "learning_rate": 4.1572517541747294e-07,
264
+ "loss": 0.8148,
265
+ "mean_token_accuracy": 0.7614155323349776,
266
  "step": 155
267
  },
268
  {
269
  "epoch": 0.9467455621301775,
270
  "grad_norm": 0.322265625,
271
  "learning_rate": 1.7251026952640583e-07,
272
+ "loss": 0.8279,
273
+ "mean_token_accuracy": 0.7578688734669882,
274
  "step": 160
275
  },
276
  {
277
  "epoch": 0.9763313609467456,
278
  "grad_norm": 0.3125,
279
  "learning_rate": 3.4155069933301535e-08,
280
+ "loss": 0.8206,
281
+ "mean_token_accuracy": 0.759708155713754,
282
  "step": 165
283
  },
284
  {
285
  "epoch": 1.0,
286
+ "mean_token_accuracy": 0.754803195434105,
287
  "step": 169,
288
  "total_flos": 6.966137809639834e+17,
289
+ "train_loss": 0.8669900795411781,
290
+ "train_runtime": 827.3439,
291
+ "train_samples_per_second": 26.135,
292
+ "train_steps_per_second": 0.204
293
  }
294
  ],
295
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbd31b7f95fae03b8da98d830088e494be82c7e417e3e2998543438d39a57832
3
  size 6008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca94459db6ee92707f6688f16c2d4fc52d7a5c65800f1fce13fd25909817625b
3
  size 6008