superbigtree committed · verified
Commit 1fd8dd7 · 1 Parent(s): 18f7404

Model save

README.md CHANGED
@@ -1,11 +1,9 @@
  ---
  base_model: Qwen/Qwen2.5-1.5B-Instruct
- datasets: HuggingFaceH4/Bespoke-Stratos-17k
  library_name: transformers
  model_name: Qwen2.5-1.5B-Open-R1-Distill
  tags:
  - generated_from_trainer
- - open-r1
  - trl
  - sft
  licence: license
@@ -13,7 +11,7 @@ licence: license

  # Model Card for Qwen2.5-1.5B-Open-R1-Distill

- This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the [HuggingFaceH4/Bespoke-Stratos-17k](https://huggingface.co/datasets/HuggingFaceH4/Bespoke-Stratos-17k) dataset.
+ This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
  It has been trained using [TRL](https://github.com/huggingface/trl).

  ## Quick start
@@ -29,14 +27,14 @@ print(output["generated_text"])

  ## Training procedure

- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/xiaojia-song12/huggingface/runs/hc5miacf)
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/xiaojia-song12/huggingface/runs/977ijn5q)


  This model was trained with SFT.

  ### Framework versions

- - TRL: 0.15.0.dev0
+ - TRL: 0.16.0.dev0
  - Transformers: 4.49.0.dev0
  - Pytorch: 2.7.0a0+git3a58512
  - Datasets: 3.2.0
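The `Quick start` section referenced by the hunk header (`print(output["generated_text"])`) is the usual TRL model-card snippet. A minimal sketch of that kind of usage, assuming the checkpoint is published under a repo id such as `superbigtree/Qwen2.5-1.5B-Open-R1-Distill` (hypothetical):

```python
from transformers import pipeline

# Hypothetical repo id; substitute the actual location of this checkpoint.
generator = pipeline(
    "text-generation",
    model="superbigtree/Qwen2.5-1.5B-Open-R1-Distill",
    device_map="auto",
)

question = "Explain why the sky is blue in one sentence."
output = generator(
    [{"role": "user", "content": question}],
    max_new_tokens=128,
    return_full_text=False,
)[0]
print(output["generated_text"])
```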
all_results.json CHANGED
@@ -1,13 +1,8 @@
  {
-     "eval_loss": 0.8572873473167419,
-     "eval_runtime": 2.3373,
-     "eval_samples": 100,
-     "eval_samples_per_second": 55.192,
-     "eval_steps_per_second": 2.139,
      "total_flos": 6.966137809639834e+17,
-     "train_loss": 0.8674715298872727,
-     "train_runtime": 815.3952,
+     "train_loss": 0.8673680914929632,
+     "train_runtime": 795.9713,
      "train_samples": 16610,
-     "train_samples_per_second": 26.518,
-     "train_steps_per_second": 0.207
+     "train_samples_per_second": 27.166,
+     "train_steps_per_second": 0.212
  }
config.json CHANGED
@@ -23,7 +23,7 @@
    "tie_word_embeddings": true,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.49.0.dev0",
-   "use_cache": true,
+   "use_cache": false,
    "use_sliding_window": false,
    "vocab_size": 151936
  }
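`use_cache` is typically switched off while fine-tuning (for example when gradient checkpointing is enabled) and re-enabled for generation. A minimal sketch of flipping it back at inference time, with a placeholder checkpoint path:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "Qwen2.5-1.5B-Open-R1-Distill"  # placeholder: local checkpoint dir or hub id
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)

# The checkpoint was saved with "use_cache": false; re-enable the KV cache for decoding.
model.config.use_cache = True

inputs = tokenizer("Hello", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```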
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d69e13d1e0a1431e99e002eb37cd9f5b7404337bcbd336a59d7f02b2e4bbadf7
+ oid sha256:43f380d7be9586da69712b78466b8c4247dc743323c393ed4e447e8943bead8b
  size 3087467144
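The weights themselves live in Git LFS; the diff only updates the pointer file (sha256 and byte size). A short sketch for checking a downloaded `model.safetensors` against the pointer recorded above:

```python
import hashlib
import os

path = "model.safetensors"
expected_oid = "43f380d7be9586da69712b78466b8c4247dc743323c393ed4e447e8943bead8b"
expected_size = 3087467144

# Hash the file in 1 MiB chunks to avoid loading ~3 GB into memory at once.
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert h.hexdigest() == expected_oid, "sha256 mismatch"
print("model.safetensors matches the LFS pointer")
```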
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
      "total_flos": 6.966137809639834e+17,
-     "train_loss": 0.8674715298872727,
-     "train_runtime": 815.3952,
+     "train_loss": 0.8673680914929632,
+     "train_runtime": 795.9713,
      "train_samples": 16610,
-     "train_samples_per_second": 26.518,
-     "train_steps_per_second": 0.207
+     "train_samples_per_second": 27.166,
+     "train_steps_per_second": 0.212
  }
trainer_state.json CHANGED
@@ -10,286 +10,286 @@
  "log_history": [
    {
      "epoch": 0.029585798816568046,
-     "grad_norm": 5.03125,
+     "grad_norm": 2.40625,
      "learning_rate": 5.882352941176471e-06,
      "loss": 1.1011,
-     "mean_token_accuracy": 0.7099399748962237,
+     "mean_token_accuracy": 0.7099346258467769,
      "step": 5
    },
    {
      "epoch": 0.05917159763313609,
-     "grad_norm": 1.484375,
+     "grad_norm": 1.46875,
      "learning_rate": 1.1764705882352942e-05,
-     "loss": 1.0741,
-     "mean_token_accuracy": 0.7142575641587607,
+     "loss": 1.0735,
+     "mean_token_accuracy": 0.7143598902988348,
      "step": 10
    },
    {
      "epoch": 0.08875739644970414,
-     "grad_norm": 0.9296875,
+     "grad_norm": 0.93359375,
      "learning_rate": 1.7647058823529414e-05,
-     "loss": 1.025,
-     "mean_token_accuracy": 0.7212255669151615,
+     "loss": 1.0246,
+     "mean_token_accuracy": 0.7213492548758127,
      "step": 15
    },
    {
      "epoch": 0.11834319526627218,
      "grad_norm": 0.61328125,
      "learning_rate": 1.9980782984658682e-05,
-     "loss": 0.9455,
-     "mean_token_accuracy": 0.7365584053166825,
+     "loss": 0.9456,
+     "mean_token_accuracy": 0.7365624101438417,
      "step": 20
    },
    {
      "epoch": 0.14792899408284024,
-     "grad_norm": 0.515625,
+     "grad_norm": 0.51171875,
      "learning_rate": 1.9863613034027224e-05,
-     "loss": 0.9097,
-     "mean_token_accuracy": 0.7419796471746476,
+     "loss": 0.9101,
+     "mean_token_accuracy": 0.7418235078195066,
      "step": 25
    },
    {
      "epoch": 0.17751479289940827,
-     "grad_norm": 0.42578125,
+     "grad_norm": 0.423828125,
      "learning_rate": 1.9641197940012136e-05,
-     "loss": 0.8953,
-     "mean_token_accuracy": 0.744632902913685,
+     "loss": 0.8958,
+     "mean_token_accuracy": 0.7445377548880543,
      "step": 30
    },
    {
      "epoch": 0.20710059171597633,
      "grad_norm": 0.408203125,
      "learning_rate": 1.9315910880512792e-05,
-     "loss": 0.8769,
-     "mean_token_accuracy": 0.7474371048323014,
+     "loss": 0.8774,
+     "mean_token_accuracy": 0.7473939577517185,
      "step": 35
    },
    {
      "epoch": 0.23668639053254437,
-     "grad_norm": 0.388671875,
+     "grad_norm": 0.390625,
      "learning_rate": 1.8891222681391853e-05,
-     "loss": 0.8623,
-     "mean_token_accuracy": 0.7511180018113048,
+     "loss": 0.8628,
+     "mean_token_accuracy": 0.7510309574773866,
      "step": 40
    },
    {
      "epoch": 0.26627218934911245,
-     "grad_norm": 0.357421875,
+     "grad_norm": 4.75,
      "learning_rate": 1.8371664782625287e-05,
-     "loss": 0.8557,
-     "mean_token_accuracy": 0.7520890468920156,
+     "loss": 0.8562,
+     "mean_token_accuracy": 0.7519657291179588,
      "step": 45
    },
    {
      "epoch": 0.2958579881656805,
-     "grad_norm": 0.326171875,
+     "grad_norm": 0.322265625,
      "learning_rate": 1.7762780887657576e-05,
-     "loss": 0.8519,
-     "mean_token_accuracy": 0.7527647439023715,
+     "loss": 0.8526,
+     "mean_token_accuracy": 0.7526042764494737,
      "step": 50
    },
    {
      "epoch": 0.3254437869822485,
-     "grad_norm": 0.34375,
+     "grad_norm": 0.33984375,
      "learning_rate": 1.7071067811865477e-05,
-     "loss": 0.8499,
-     "mean_token_accuracy": 0.7530196620772068,
+     "loss": 0.8506,
+     "mean_token_accuracy": 0.7529147428713211,
      "step": 55
    },
    {
      "epoch": 0.35502958579881655,
-     "grad_norm": 0.345703125,
+     "grad_norm": 0.349609375,
      "learning_rate": 1.6303906161279554e-05,
-     "loss": 0.8623,
-     "mean_token_accuracy": 0.7496151012147616,
+     "loss": 0.8627,
+     "mean_token_accuracy": 0.7494820701814584,
      "step": 60
    },
    {
      "epoch": 0.38461538461538464,
-     "grad_norm": 0.322265625,
+     "grad_norm": 0.32421875,
      "learning_rate": 1.5469481581224274e-05,
-     "loss": 0.8491,
-     "mean_token_accuracy": 0.7524191922540802,
+     "loss": 0.8496,
+     "mean_token_accuracy": 0.7523080532405824,
      "step": 65
    },
    {
      "epoch": 0.41420118343195267,
-     "grad_norm": 0.33984375,
+     "grad_norm": 0.3359375,
      "learning_rate": 1.4576697415156818e-05,
-     "loss": 0.8418,
-     "mean_token_accuracy": 0.7547066041239219,
+     "loss": 0.8422,
+     "mean_token_accuracy": 0.7546321496781302,
      "step": 70
    },
    {
      "epoch": 0.4437869822485207,
      "grad_norm": 0.3125,
      "learning_rate": 1.3635079705638298e-05,
-     "loss": 0.8416,
-     "mean_token_accuracy": 0.7544158237788706,
+     "loss": 0.842,
+     "mean_token_accuracy": 0.7544247004239197,
      "step": 75
    },
    {
      "epoch": 0.47337278106508873,
-     "grad_norm": 9.5625,
+     "grad_norm": 0.322265625,
      "learning_rate": 1.2654675551080724e-05,
-     "loss": 0.8451,
-     "mean_token_accuracy": 0.7531590392914497,
+     "loss": 0.8453,
+     "mean_token_accuracy": 0.7530717828212506,
      "step": 80
    },
    {
      "epoch": 0.5029585798816568,
-     "grad_norm": 0.328125,
+     "grad_norm": 0.330078125,
      "learning_rate": 1.164594590280734e-05,
      "loss": 0.8455,
-     "mean_token_accuracy": 0.7532937761410722,
+     "mean_token_accuracy": 0.7532910964218957,
      "step": 85
    },
    {
      "epoch": 0.5325443786982249,
-     "grad_norm": 0.337890625,
+     "grad_norm": 0.333984375,
      "learning_rate": 1.0619653946285948e-05,
-     "loss": 0.8399,
-     "mean_token_accuracy": 0.7544911735347861,
+     "loss": 0.8396,
+     "mean_token_accuracy": 0.7545698270600318,
      "step": 90
    },
    {
      "epoch": 0.5621301775147929,
-     "grad_norm": 0.35546875,
+     "grad_norm": 0.353515625,
      "learning_rate": 9.586750257511868e-06,
-     "loss": 0.8358,
-     "mean_token_accuracy": 0.7556337403583416,
+     "loss": 0.8353,
+     "mean_token_accuracy": 0.7558067680588229,
      "step": 95
    },
    {
      "epoch": 0.591715976331361,
-     "grad_norm": 13.75,
+     "grad_norm": 0.32421875,
      "learning_rate": 8.558255959926533e-06,
-     "loss": 0.8269,
-     "mean_token_accuracy": 0.7575153177712692,
+     "loss": 0.8265,
+     "mean_token_accuracy": 0.7575928127928315,
      "step": 100
    },
    {
      "epoch": 0.591715976331361,
-     "eval_loss": 0.8590129017829895,
-     "eval_mean_token_accuracy": 0.7343020822802343,
-     "eval_runtime": 2.301,
-     "eval_samples_per_second": 56.063,
-     "eval_steps_per_second": 2.173,
+     "eval_loss": 0.858591616153717,
+     "eval_mean_token_accuracy": 0.7338160377061327,
+     "eval_runtime": 2.2821,
+     "eval_samples_per_second": 56.526,
+     "eval_steps_per_second": 2.191,
      "step": 100
    },
    {
      "epoch": 0.621301775147929,
-     "grad_norm": 0.326171875,
+     "grad_norm": 0.3203125,
      "learning_rate": 7.545145128592009e-06,
-     "loss": 0.8289,
-     "mean_token_accuracy": 0.7575234407641943,
+     "loss": 0.8284,
+     "mean_token_accuracy": 0.7576165643011782,
      "step": 105
    },
    {
      "epoch": 0.650887573964497,
-     "grad_norm": 21.25,
+     "grad_norm": 0.314453125,
      "learning_rate": 6.558227696373617e-06,
-     "loss": 0.8501,
-     "mean_token_accuracy": 0.7518221200713989,
+     "loss": 0.8497,
+     "mean_token_accuracy": 0.7519416205438242,
      "step": 110
    },
    {
      "epoch": 0.6804733727810651,
      "grad_norm": 0.322265625,
      "learning_rate": 5.608034111526298e-06,
-     "loss": 0.8361,
-     "mean_token_accuracy": 0.7556435102369359,
+     "loss": 0.8356,
+     "mean_token_accuracy": 0.7557316956442416,
      "step": 115
    },
    {
      "epoch": 0.7100591715976331,
      "grad_norm": 0.328125,
      "learning_rate": 4.704702977392914e-06,
-     "loss": 0.8234,
-     "mean_token_accuracy": 0.7592704800654426,
+     "loss": 0.8229,
+     "mean_token_accuracy": 0.7593895998895455,
      "step": 120
    },
    {
      "epoch": 0.7396449704142012,
      "grad_norm": 0.314453125,
      "learning_rate": 3.857872873103322e-06,
-     "loss": 0.8328,
-     "mean_token_accuracy": 0.7565933236472202,
+     "loss": 0.8322,
+     "mean_token_accuracy": 0.756736495558991,
      "step": 125
    },
    {
      "epoch": 0.7692307692307693,
-     "grad_norm": 0.328125,
+     "grad_norm": 0.326171875,
      "learning_rate": 3.0765795095517026e-06,
-     "loss": 0.8372,
-     "mean_token_accuracy": 0.7552126694771518,
+     "loss": 0.8367,
+     "mean_token_accuracy": 0.7552458819622366,
      "step": 130
    },
    {
      "epoch": 0.7988165680473372,
-     "grad_norm": 0.314453125,
+     "grad_norm": 0.3125,
      "learning_rate": 2.369159318001937e-06,
-     "loss": 0.8437,
-     "mean_token_accuracy": 0.7534241447974153,
+     "loss": 0.8431,
+     "mean_token_accuracy": 0.753608536029534,
      "step": 135
    },
    {
      "epoch": 0.8284023668639053,
      "grad_norm": 0.318359375,
      "learning_rate": 1.743160500034443e-06,
-     "loss": 0.835,
-     "mean_token_accuracy": 0.7563463400384239,
+     "loss": 0.8345,
+     "mean_token_accuracy": 0.7563751731445558,
      "step": 140
    },
    {
      "epoch": 0.8579881656804734,
      "grad_norm": 0.330078125,
      "learning_rate": 1.2052624879351105e-06,
-     "loss": 0.8351,
-     "mean_token_accuracy": 0.7560251329699866,
+     "loss": 0.8346,
+     "mean_token_accuracy": 0.7562260551940483,
      "step": 145
    },
    {
      "epoch": 0.8875739644970414,
      "grad_norm": 0.30859375,
      "learning_rate": 7.612046748871327e-07,
-     "loss": 0.8254,
-     "mean_token_accuracy": 0.7586063543068307,
+     "loss": 0.8249,
+     "mean_token_accuracy": 0.7586945500381791,
      "step": 150
    },
    {
      "epoch": 0.9171597633136095,
      "grad_norm": 0.31640625,
      "learning_rate": 4.1572517541747294e-07,
-     "loss": 0.8158,
-     "mean_token_accuracy": 0.7611673621269803,
+     "loss": 0.8153,
+     "mean_token_accuracy": 0.7613143505011888,
      "step": 155
    },
    {
      "epoch": 0.9467455621301775,
-     "grad_norm": 0.32421875,
+     "grad_norm": 0.322265625,
      "learning_rate": 1.7251026952640583e-07,
-     "loss": 0.8289,
-     "mean_token_accuracy": 0.7576611812663976,
+     "loss": 0.8284,
+     "mean_token_accuracy": 0.7578062563797692,
      "step": 160
    },
    {
      "epoch": 0.9763313609467456,
      "grad_norm": 0.3125,
      "learning_rate": 3.4155069933301535e-08,
-     "loss": 0.8216,
-     "mean_token_accuracy": 0.7594309628290168,
+     "loss": 0.8211,
+     "mean_token_accuracy": 0.759621495589895,
      "step": 165
    },
    {
      "epoch": 1.0,
-     "mean_token_accuracy": 0.7546930193447599,
+     "mean_token_accuracy": 0.7546903746827489,
      "step": 169,
      "total_flos": 6.966137809639834e+17,
-     "train_loss": 0.8674715298872727,
-     "train_runtime": 815.3952,
-     "train_samples_per_second": 26.518,
-     "train_steps_per_second": 0.207
+     "train_loss": 0.8673680914929632,
+     "train_runtime": 795.9713,
+     "train_samples_per_second": 27.166,
+     "train_steps_per_second": 0.212
    }
  ],
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1fd4518eeebb19a79bf77cb8690174a32d07f567e7bd1ae9d9a31d0e6e6e1074
+ oid sha256:fbd31b7f95fae03b8da98d830088e494be82c7e417e3e2998543438d39a57832
  size 6008