Mingsmilet committed (verified)
Commit 0daedf7 · Parent(s): 8dbf7b6

Model save

README.md CHANGED
@@ -7,8 +7,6 @@ tags:
  - trl
  - sft
  licence: license
- datasets:
- - bespokelabs/Bespoke-Stratos-17k
  ---

  # Model Card for Qwen2.5-1.5B-Open-R1-Distill
@@ -20,15 +18,16 @@ It has been trained using [TRL](https://github.com/huggingface/trl).

  ```python
  from transformers import pipeline
+
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
  generator = pipeline("text-generation", model="Mingsmilet/Qwen2.5-1.5B-Open-R1-Distill", device="cuda")
- question = "what is min value of x^2+3?"
- output = generator([{"role": "system", "content":"Your role as an assistant involves thoroughly exploring questions through a systematic long thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution. In the Thought section, detail your reasoning process using the specified format: <|begin_of_thought|> {thought with steps separated with '\n\n'} <|end_of_thought|> Each step should include detailed considerations such as analisying questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The solution should remain a logical, accurate, concise expression style and detail necessary step needed to reach the conclusion, formatted as follows: <|begin_of_solution|> {final formatted, precise, and clear solution} <|end_of_solution|> Now, try to solve the following question through the above guidelines:"},{"role": "user", "content": question}], max_new_tokens=2048, return_full_text=False)[0]
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
  print(output["generated_text"])
  ```

  ## Training procedure

- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/1653401183-mingmingai/huggingface/runs/batphcqf)
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/1653401183-mingmingai/huggingface/runs/2nzizgw4)


  This model was trained with SFT.
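For reference, the updated README example assembled as a single runnable script; it only restates what the `+` lines above introduce (a plain user turn instead of the long system prompt, and `max_new_tokens=128`):

```python
from transformers import pipeline

question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"

# Load the fine-tuned checkpoint as a chat-style text-generation pipeline.
generator = pipeline("text-generation", model="Mingsmilet/Qwen2.5-1.5B-Open-R1-Distill", device="cuda")

# The new example sends only a user turn and caps generation at 128 new tokens.
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
print(output["generated_text"])
```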
all_results.json CHANGED
@@ -1,8 +1,8 @@
  {
  "total_flos": 76973799899136.0,
- "train_loss": 0.8025032683942445,
- "train_runtime": 710.0203,
+ "train_loss": 0.802504962717993,
+ "train_runtime": 695.5303,
  "train_samples": 16610,
- "train_samples_per_second": 30.454,
- "train_steps_per_second": 0.238
+ "train_samples_per_second": 31.089,
+ "train_steps_per_second": 0.243
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:846816e48b86e2be4cecdd5778e99f02cb73c4f53705b0b48f09130fb97960a4
+ oid sha256:cdde57df3dbf087870df00e8e1ec680c6c90cd70e09da0bfc89de0dc63069fa2
  size 3087467144
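The weight entry here is a Git LFS pointer, so the commit only swaps the `oid sha256:` digest while the file size stays at 3087467144 bytes. A minimal sketch (the local path is hypothetical) for checking a downloaded `model.safetensors` against the new pointer:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file and return its hex SHA-256, the value stored in the LFS pointer's oid field."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Expected digest is the new oid from the diff; "model.safetensors" is a hypothetical local copy.
expected = "cdde57df3dbf087870df00e8e1ec680c6c90cd70e09da0bfc89de0dc63069fa2"
print(sha256_of("model.safetensors") == expected)
```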
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
  "total_flos": 76973799899136.0,
- "train_loss": 0.8025032683942445,
- "train_runtime": 710.0203,
+ "train_loss": 0.802504962717993,
+ "train_runtime": 695.5303,
  "train_samples": 16610,
- "train_samples_per_second": 30.454,
- "train_steps_per_second": 0.238
+ "train_samples_per_second": 31.089,
+ "train_steps_per_second": 0.243
  }
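`train_results.json` carries the same figures as `all_results.json` above. As a quick consistency check, the reported rates multiplied by the runtime should recover the 169 optimizer steps recorded in `trainer_state.json` below; a small sketch of that arithmetic:

```python
# Values copied from the new train_results.json in this commit.
train_runtime = 695.5303            # seconds
train_steps_per_second = 0.243
train_samples_per_second = 31.089

# The step rate times the runtime recovers the 169 optimizer steps logged in trainer_state.json.
print(round(train_steps_per_second * train_runtime))    # -> 169
# The sample rate implies ~21623 samples counted by the Trainer, more than the 16610 raw
# train_samples, so the field is not simply train_samples / train_runtime.
print(round(train_samples_per_second * train_runtime))  # -> 21623
```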
trainer_state.json CHANGED
@@ -10,286 +10,286 @@
  "log_history": [
  {
  "epoch": 0.029585798816568046,
- "grad_norm": 2.2988141077652386,
+ "grad_norm": 2.297573765324108,
  "learning_rate": 5.882352941176471e-06,
  "loss": 1.1002,
- "mean_token_accuracy": 0.7101159904452009,
+ "mean_token_accuracy": 0.7100953695790506,
  "step": 5
  },
  {
  "epoch": 0.05917159763313609,
- "grad_norm": 1.596218927081988,
+ "grad_norm": 1.5969597101948139,
  "learning_rate": 1.1764705882352942e-05,
  "loss": 1.0327,
- "mean_token_accuracy": 0.720176724225474,
+ "mean_token_accuracy": 0.7202142927855761,
  "step": 10
  },
  {
  "epoch": 0.08875739644970414,
- "grad_norm": 0.8479104246833407,
+ "grad_norm": 0.8485756643128731,
  "learning_rate": 1.7647058823529414e-05,
  "loss": 0.9517,
- "mean_token_accuracy": 0.7325662989711975,
+ "mean_token_accuracy": 0.7326373095769856,
  "step": 15
  },
  {
  "epoch": 0.11834319526627218,
- "grad_norm": 0.6465483847858254,
+ "grad_norm": 0.6438282504171126,
  "learning_rate": 1.9980782984658682e-05,
  "loss": 0.8804,
- "mean_token_accuracy": 0.7473196725120462,
+ "mean_token_accuracy": 0.7473626526463326,
  "step": 20
  },
  {
  "epoch": 0.14792899408284024,
- "grad_norm": 0.5525325403311692,
+ "grad_norm": 0.5518035399856269,
  "learning_rate": 1.9863613034027224e-05,
  "loss": 0.8536,
- "mean_token_accuracy": 0.7518004036197122,
+ "mean_token_accuracy": 0.7518618707778764,
  "step": 25
  },
  {
  "epoch": 0.17751479289940827,
- "grad_norm": 0.44179450773629486,
+ "grad_norm": 0.4402012573426451,
  "learning_rate": 1.9641197940012136e-05,
  "loss": 0.8404,
- "mean_token_accuracy": 0.7545674682549227,
+ "mean_token_accuracy": 0.7545927971969407,
  "step": 30
  },
  {
  "epoch": 0.20710059171597633,
- "grad_norm": 0.39496081405164835,
+ "grad_norm": 0.3943698562073155,
  "learning_rate": 1.9315910880512792e-05,
  "loss": 0.8212,
- "mean_token_accuracy": 0.7580853602544403,
+ "mean_token_accuracy": 0.7580720047648368,
  "step": 35
  },
  {
  "epoch": 0.23668639053254437,
- "grad_norm": 0.38859302520022476,
+ "grad_norm": 0.38684017157532,
  "learning_rate": 1.8891222681391853e-05,
  "loss": 0.805,
- "mean_token_accuracy": 0.7621072520755232,
+ "mean_token_accuracy": 0.7620874011199529,
  "step": 40
  },
  {
  "epoch": 0.26627218934911245,
- "grad_norm": 0.4281961569580366,
+ "grad_norm": 0.4262882071831819,
  "learning_rate": 1.8371664782625287e-05,
  "loss": 0.7984,
- "mean_token_accuracy": 0.7635130587409846,
+ "mean_token_accuracy": 0.7635187814772423,
  "step": 45
  },
  {
  "epoch": 0.2958579881656805,
- "grad_norm": 0.393042490001915,
+ "grad_norm": 0.3931013124021272,
  "learning_rate": 1.7762780887657576e-05,
- "loss": 0.7936,
- "mean_token_accuracy": 0.7642980542104124,
+ "loss": 0.7937,
+ "mean_token_accuracy": 0.7642257038779341,
  "step": 50
  },
  {
  "epoch": 0.3254437869822485,
- "grad_norm": 0.3719832470365729,
+ "grad_norm": 0.3739445931259617,
  "learning_rate": 1.7071067811865477e-05,
  "loss": 0.7894,
- "mean_token_accuracy": 0.7649172633057038,
+ "mean_token_accuracy": 0.7648935589559052,
  "step": 55
  },
  {
  "epoch": 0.35502958579881655,
- "grad_norm": 0.36967289202890546,
+ "grad_norm": 0.3711114588864771,
  "learning_rate": 1.6303906161279554e-05,
  "loss": 0.7999,
- "mean_token_accuracy": 0.7619633834951797,
+ "mean_token_accuracy": 0.7619695483759025,
  "step": 60
  },
  {
  "epoch": 0.38461538461538464,
- "grad_norm": 0.3965455034810465,
+ "grad_norm": 0.3949481992103986,
  "learning_rate": 1.5469481581224274e-05,
- "loss": 0.7867,
- "mean_token_accuracy": 0.7647511953755232,
+ "loss": 0.7868,
+ "mean_token_accuracy": 0.7646586436181606,
  "step": 65
  },
  {
  "epoch": 0.41420118343195267,
- "grad_norm": 0.368068249126072,
+ "grad_norm": 0.3710782910651557,
  "learning_rate": 1.4576697415156818e-05,
  "loss": 0.7782,
- "mean_token_accuracy": 0.7674480153107033,
+ "mean_token_accuracy": 0.7674472564108428,
  "step": 70
  },
  {
  "epoch": 0.4437869822485207,
- "grad_norm": 0.3388950128403286,
+ "grad_norm": 0.34005674173322326,
  "learning_rate": 1.3635079705638298e-05,
  "loss": 0.777,
- "mean_token_accuracy": 0.7675313738107395,
+ "mean_token_accuracy": 0.7675319726128873,
  "step": 75
  },
  {
  "epoch": 0.47337278106508873,
- "grad_norm": 0.3430098886707366,
+ "grad_norm": 0.3468353725329877,
  "learning_rate": 1.2654675551080724e-05,
  "loss": 0.7783,
- "mean_token_accuracy": 0.7666054602362825,
+ "mean_token_accuracy": 0.7666067092453227,
  "step": 80
  },
  {
  "epoch": 0.5029585798816568,
- "grad_norm": 0.33478411969703326,
+ "grad_norm": 0.33465976050856655,
  "learning_rate": 1.164594590280734e-05,
  "loss": 0.7785,
- "mean_token_accuracy": 0.7669639865984688,
+ "mean_token_accuracy": 0.7669678075017483,
  "step": 85
  },
  {
  "epoch": 0.5325443786982249,
- "grad_norm": 0.34768462687825924,
+ "grad_norm": 0.3498974122332,
  "learning_rate": 1.0619653946285948e-05,
  "loss": 0.7723,
- "mean_token_accuracy": 0.7682154932141926,
+ "mean_token_accuracy": 0.7681925898394718,
  "step": 90
  },
  {
  "epoch": 0.5621301775147929,
- "grad_norm": 0.3445315216411646,
+ "grad_norm": 0.34393653080794817,
  "learning_rate": 9.586750257511868e-06,
  "loss": 0.7668,
- "mean_token_accuracy": 0.7698424375606829,
+ "mean_token_accuracy": 0.7697901380963259,
  "step": 95
  },
  {
  "epoch": 0.591715976331361,
- "grad_norm": 0.3412337472975462,
+ "grad_norm": 0.33899820007825987,
  "learning_rate": 8.558255959926533e-06,
  "loss": 0.7582,
- "mean_token_accuracy": 0.7714304347299908,
+ "mean_token_accuracy": 0.7714266195965762,
  "step": 100
  },
  {
  "epoch": 0.591715976331361,
- "eval_loss": 0.7885846495628357,
- "eval_mean_token_accuracy": 0.7482494053174686,
- "eval_runtime": 2.2356,
- "eval_samples_per_second": 57.703,
- "eval_steps_per_second": 2.237,
+ "eval_loss": 0.788552463054657,
+ "eval_mean_token_accuracy": 0.7482402433575445,
+ "eval_runtime": 2.2045,
+ "eval_samples_per_second": 58.517,
+ "eval_steps_per_second": 2.268,
  "step": 100
  },
  {
  "epoch": 0.621301775147929,
- "grad_norm": 0.35299455458895956,
+ "grad_norm": 0.353900421817561,
  "learning_rate": 7.545145128592009e-06,
  "loss": 0.759,
- "mean_token_accuracy": 0.7714738320259433,
+ "mean_token_accuracy": 0.7714586673704755,
  "step": 105
  },
  {
  "epoch": 0.650887573964497,
- "grad_norm": 0.31616289960261323,
+ "grad_norm": 0.3166511156569925,
  "learning_rate": 6.558227696373617e-06,
- "loss": 0.7775,
- "mean_token_accuracy": 0.7664820794613972,
+ "loss": 0.7776,
+ "mean_token_accuracy": 0.7664817000327632,
  "step": 110
  },
  {
  "epoch": 0.6804733727810651,
- "grad_norm": 0.307888028457379,
+ "grad_norm": 0.3074350676983976,
  "learning_rate": 5.608034111526298e-06,
  "loss": 0.7636,
- "mean_token_accuracy": 0.7704197310032757,
+ "mean_token_accuracy": 0.7703227561153221,
  "step": 115
  },
  {
  "epoch": 0.7100591715976331,
- "grad_norm": 0.2995694223287612,
+ "grad_norm": 0.299274177260631,
  "learning_rate": 4.704702977392914e-06,
  "loss": 0.7511,
- "mean_token_accuracy": 0.7742426406160382,
+ "mean_token_accuracy": 0.7742873168311635,
  "step": 120
  },
  {
  "epoch": 0.7396449704142012,
- "grad_norm": 0.31585565817231076,
+ "grad_norm": 0.31617489086098977,
  "learning_rate": 3.857872873103322e-06,
- "loss": 0.7595,
- "mean_token_accuracy": 0.7717561342325581,
+ "loss": 0.7594,
+ "mean_token_accuracy": 0.7717538547987106,
  "step": 125
  },
  {
  "epoch": 0.7692307692307693,
- "grad_norm": 0.32069400376048846,
+ "grad_norm": 0.32025277899891924,
  "learning_rate": 3.0765795095517026e-06,
- "loss": 0.7624,
- "mean_token_accuracy": 0.7705784584578896,
+ "loss": 0.7623,
+ "mean_token_accuracy": 0.7706147324391592,
  "step": 130
  },
  {
  "epoch": 0.7988165680473372,
- "grad_norm": 0.30337427328877,
+ "grad_norm": 0.30317891871785263,
  "learning_rate": 2.369159318001937e-06,
  "loss": 0.7684,
- "mean_token_accuracy": 0.7688614612364428,
+ "mean_token_accuracy": 0.7688305448428647,
  "step": 135
  },
  {
  "epoch": 0.8284023668639053,
- "grad_norm": 0.2931589273225087,
+ "grad_norm": 0.29377898935847374,
  "learning_rate": 1.743160500034443e-06,
  "loss": 0.7597,
- "mean_token_accuracy": 0.7717145410630937,
+ "mean_token_accuracy": 0.771730188624508,
  "step": 140
  },
  {
  "epoch": 0.8579881656804734,
- "grad_norm": 0.31239776071011455,
+ "grad_norm": 0.3122803287793864,
  "learning_rate": 1.2052624879351105e-06,
- "loss": 0.7595,
- "mean_token_accuracy": 0.7714656640175818,
+ "loss": 0.7594,
+ "mean_token_accuracy": 0.7715123408941931,
  "step": 145
  },
  {
  "epoch": 0.8875739644970414,
- "grad_norm": 0.2909480842066185,
+ "grad_norm": 0.289656138067616,
  "learning_rate": 7.612046748871327e-07,
  "loss": 0.7505,
- "mean_token_accuracy": 0.7739669991508189,
+ "mean_token_accuracy": 0.7739841847801008,
  "step": 150
  },
  {
  "epoch": 0.9171597633136095,
- "grad_norm": 0.279458821658001,
+ "grad_norm": 0.27970392255541077,
  "learning_rate": 4.1572517541747294e-07,
  "loss": 0.741,
- "mean_token_accuracy": 0.7767323569336231,
+ "mean_token_accuracy": 0.7767522035827779,
  "step": 155
  },
  {
  "epoch": 0.9467455621301775,
- "grad_norm": 0.29648687488650516,
+ "grad_norm": 0.2966574688397028,
  "learning_rate": 1.7251026952640583e-07,
  "loss": 0.754,
- "mean_token_accuracy": 0.7732043043845805,
+ "mean_token_accuracy": 0.7732214833563628,
  "step": 160
  },
  {
  "epoch": 0.9763313609467456,
- "grad_norm": 0.2773995248389471,
+ "grad_norm": 0.276771819219271,
  "learning_rate": 3.4155069933301535e-08,
- "loss": 0.7467,
- "mean_token_accuracy": 0.7749679952859235,
+ "loss": 0.7468,
+ "mean_token_accuracy": 0.774979807844946,
  "step": 165
  },
  {
  "epoch": 1.0,
- "mean_token_accuracy": 0.7705127247007673,
+ "mean_token_accuracy": 0.7703761237331678,
  "step": 169,
  "total_flos": 76973799899136.0,
- "train_loss": 0.8025032683942445,
- "train_runtime": 710.0203,
- "train_samples_per_second": 30.454,
- "train_steps_per_second": 0.238
+ "train_loss": 0.802504962717993,
+ "train_runtime": 695.5303,
+ "train_samples_per_second": 31.089,
+ "train_steps_per_second": 0.243
  }
  ],
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:5e7043d0dcec036604186a342c05b4b1b9b359afc64da2f56ce61db35f0a4ae1
+ oid sha256:c5e6be9345e2f96cd68e61867c69873588bdbf3fae2f77193ddc524b45a7ad6a
  size 7416