Commit 3d1df92 (verified) · Mingsmilet committed · 1 Parent(s): a94041c

Model save

README.md CHANGED
@@ -1,11 +1,9 @@
 ---
 base_model: Qwen/Qwen2.5-1.5B-Instruct
-datasets: HuggingFaceH4/Bespoke-Stratos-17k
 library_name: transformers
 model_name: Qwen2.5-1.5B-Open-R1-Distill
 tags:
 - generated_from_trainer
-- open-r1
 - trl
 - sft
 licence: license
@@ -13,7 +11,7 @@ licence: license
 
 # Model Card for Qwen2.5-1.5B-Open-R1-Distill
 
-This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the [HuggingFaceH4/Bespoke-Stratos-17k](https://huggingface.co/datasets/HuggingFaceH4/Bespoke-Stratos-17k) dataset.
+This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 
 ## Quick start
@@ -29,14 +27,14 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/1653401183-mingmingai/huggingface/runs/0i0p6fk9)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/1653401183-mingmingai/huggingface/runs/batphcqf)
 
 
 This model was trained with SFT.
 
 ### Framework versions
 
-- TRL: 0.15.0.dev0
+- TRL: 0.16.0.dev0
 - Transformers: 4.49.0.dev0
 - Pytorch: 2.5.1
 - Datasets: 3.2.0
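
Note: the model card's Quick start section is untouched by this commit; only its closing line, `print(output["generated_text"])`, is visible as hunk context above. A minimal sketch of that kind of usage with the transformers text-generation pipeline is shown below; the repo id `Mingsmilet/Qwen2.5-1.5B-Open-R1-Distill`, the prompt, and the generation parameters are assumptions for illustration, not taken from the card.

```python
# Hedged sketch of the Quick start usage implied by the visible context line
# `print(output["generated_text"])`; the exact snippet in the card is not shown
# in this diff, and the repo id below is an assumption.
from transformers import pipeline

question = "Which number is larger, 9.11 or 9.8?"
generator = pipeline(
    "text-generation",
    model="Mingsmilet/Qwen2.5-1.5B-Open-R1-Distill",  # hypothetical repo id
)
# Recent transformers pipelines accept chat-style messages for instruct models.
output = generator(
    [{"role": "user", "content": question}],
    max_new_tokens=256,
    return_full_text=False,
)[0]
print(output["generated_text"])
```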
all_results.json CHANGED
@@ -1,14 +1,8 @@
 {
-    "epoch": 1.0,
-    "eval_loss": 0.7803335785865784,
-    "eval_runtime": 0.8401,
-    "eval_samples": 100,
-    "eval_samples_per_second": 152.364,
-    "eval_steps_per_second": 4.761,
-    "total_flos": 76916824473600.0,
-    "train_loss": 0.8026207946461333,
-    "train_runtime": 498.3301,
+    "total_flos": 76973799899136.0,
+    "train_loss": 0.8025032683942445,
+    "train_runtime": 710.0203,
     "train_samples": 16610,
-    "train_samples_per_second": 43.365,
-    "train_steps_per_second": 0.339
+    "train_samples_per_second": 30.454,
+    "train_steps_per_second": 0.238
 }
config.json CHANGED
@@ -19,11 +19,11 @@
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 1000000.0,
-  "sliding_window": null,
+  "sliding_window": 32768,
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.49.0.dev0",
-  "use_cache": true,
+  "use_cache": false,
   "use_sliding_window": false,
   "vocab_size": 151936
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:21e70655225165df770f132bba556d7d5e6ebfcf98d004ed7779070c3da965c9
+oid sha256:846816e48b86e2be4cecdd5778e99f02cb73c4f53705b0b48f09130fb97960a4
 size 3087467144
train_results.json CHANGED
@@ -1,9 +1,8 @@
 {
-    "epoch": 1.0,
-    "total_flos": 76916824473600.0,
-    "train_loss": 0.8026207946461333,
-    "train_runtime": 498.3301,
+    "total_flos": 76973799899136.0,
+    "train_loss": 0.8025032683942445,
+    "train_runtime": 710.0203,
     "train_samples": 16610,
-    "train_samples_per_second": 43.365,
-    "train_steps_per_second": 0.339
+    "train_samples_per_second": 30.454,
+    "train_steps_per_second": 0.238
 }
trainer_state.json CHANGED
@@ -10,251 +10,286 @@
   "log_history": [
     {
       "epoch": 0.029585798816568046,
-      "grad_norm": 2.345319996716153,
+      "grad_norm": 2.2988141077652386,
       "learning_rate": 5.882352941176471e-06,
-      "loss": 1.0993,
+      "loss": 1.1002,
+      "mean_token_accuracy": 0.7101159904452009,
       "step": 5
     },
     {
       "epoch": 0.05917159763313609,
-      "grad_norm": 1.6336099031082492,
+      "grad_norm": 1.596218927081988,
       "learning_rate": 1.1764705882352942e-05,
-      "loss": 1.0402,
+      "loss": 1.0327,
+      "mean_token_accuracy": 0.720176724225474,
       "step": 10
     },
     {
       "epoch": 0.08875739644970414,
-      "grad_norm": 0.8859408831541707,
+      "grad_norm": 0.8479104246833407,
       "learning_rate": 1.7647058823529414e-05,
-      "loss": 0.9533,
+      "loss": 0.9517,
+      "mean_token_accuracy": 0.7325662989711975,
       "step": 15
     },
     {
       "epoch": 0.11834319526627218,
-      "grad_norm": 0.6064983041288236,
+      "grad_norm": 0.6465483847858254,
       "learning_rate": 1.9980782984658682e-05,
-      "loss": 0.8922,
+      "loss": 0.8804,
+      "mean_token_accuracy": 0.7473196725120462,
       "step": 20
     },
     {
       "epoch": 0.14792899408284024,
-      "grad_norm": 0.538397725140923,
+      "grad_norm": 0.5525325403311692,
       "learning_rate": 1.9863613034027224e-05,
-      "loss": 0.8552,
+      "loss": 0.8536,
+      "mean_token_accuracy": 0.7518004036197122,
       "step": 25
     },
     {
       "epoch": 0.17751479289940827,
-      "grad_norm": 0.4261073425530229,
+      "grad_norm": 0.44179450773629486,
       "learning_rate": 1.9641197940012136e-05,
-      "loss": 0.8283,
+      "loss": 0.8404,
+      "mean_token_accuracy": 0.7545674682549227,
       "step": 30
     },
     {
       "epoch": 0.20710059171597633,
-      "grad_norm": 0.37594749202019234,
+      "grad_norm": 0.39496081405164835,
       "learning_rate": 1.9315910880512792e-05,
-      "loss": 0.823,
+      "loss": 0.8212,
+      "mean_token_accuracy": 0.7580853602544403,
       "step": 35
     },
     {
       "epoch": 0.23668639053254437,
-      "grad_norm": 0.39908541118991764,
+      "grad_norm": 0.38859302520022476,
       "learning_rate": 1.8891222681391853e-05,
-      "loss": 0.8225,
+      "loss": 0.805,
+      "mean_token_accuracy": 0.7621072520755232,
       "step": 40
     },
     {
       "epoch": 0.26627218934911245,
-      "grad_norm": 0.3446073773136696,
+      "grad_norm": 0.4281961569580366,
       "learning_rate": 1.8371664782625287e-05,
-      "loss": 0.8073,
+      "loss": 0.7984,
+      "mean_token_accuracy": 0.7635130587409846,
       "step": 45
     },
     {
       "epoch": 0.2958579881656805,
-      "grad_norm": 0.33521545405571684,
+      "grad_norm": 0.393042490001915,
       "learning_rate": 1.7762780887657576e-05,
-      "loss": 0.7977,
+      "loss": 0.7936,
+      "mean_token_accuracy": 0.7642980542104124,
       "step": 50
     },
     {
       "epoch": 0.3254437869822485,
-      "grad_norm": 0.36639448217064385,
+      "grad_norm": 0.3719832470365729,
       "learning_rate": 1.7071067811865477e-05,
-      "loss": 0.7877,
+      "loss": 0.7894,
+      "mean_token_accuracy": 0.7649172633057038,
       "step": 55
     },
     {
       "epoch": 0.35502958579881655,
-      "grad_norm": 0.3696573153537768,
+      "grad_norm": 0.36967289202890546,
       "learning_rate": 1.6303906161279554e-05,
-      "loss": 0.7981,
+      "loss": 0.7999,
+      "mean_token_accuracy": 0.7619633834951797,
       "step": 60
     },
     {
       "epoch": 0.38461538461538464,
-      "grad_norm": 0.34317642014061395,
+      "grad_norm": 0.3965455034810465,
       "learning_rate": 1.5469481581224274e-05,
-      "loss": 0.7722,
+      "loss": 0.7867,
+      "mean_token_accuracy": 0.7647511953755232,
       "step": 65
     },
     {
       "epoch": 0.41420118343195267,
-      "grad_norm": 0.34403963076039407,
+      "grad_norm": 0.368068249126072,
       "learning_rate": 1.4576697415156818e-05,
-      "loss": 0.7741,
+      "loss": 0.7782,
+      "mean_token_accuracy": 0.7674480153107033,
       "step": 70
     },
     {
       "epoch": 0.4437869822485207,
-      "grad_norm": 0.33733303000440357,
+      "grad_norm": 0.3388950128403286,
       "learning_rate": 1.3635079705638298e-05,
-      "loss": 0.7852,
+      "loss": 0.777,
+      "mean_token_accuracy": 0.7675313738107395,
       "step": 75
     },
     {
       "epoch": 0.47337278106508873,
-      "grad_norm": 0.3519387066839729,
+      "grad_norm": 0.3430098886707366,
       "learning_rate": 1.2654675551080724e-05,
-      "loss": 0.7666,
+      "loss": 0.7783,
+      "mean_token_accuracy": 0.7666054602362825,
       "step": 80
     },
     {
       "epoch": 0.5029585798816568,
-      "grad_norm": 0.35934245892932387,
+      "grad_norm": 0.33478411969703326,
       "learning_rate": 1.164594590280734e-05,
-      "loss": 0.7706,
+      "loss": 0.7785,
+      "mean_token_accuracy": 0.7669639865984688,
       "step": 85
     },
     {
       "epoch": 0.5325443786982249,
-      "grad_norm": 0.3600712658909069,
+      "grad_norm": 0.34768462687825924,
       "learning_rate": 1.0619653946285948e-05,
-      "loss": 0.7699,
+      "loss": 0.7723,
+      "mean_token_accuracy": 0.7682154932141926,
       "step": 90
     },
     {
       "epoch": 0.5621301775147929,
-      "grad_norm": 0.3623401906293079,
+      "grad_norm": 0.3445315216411646,
       "learning_rate": 9.586750257511868e-06,
-      "loss": 0.7771,
+      "loss": 0.7668,
+      "mean_token_accuracy": 0.7698424375606829,
       "step": 95
     },
     {
       "epoch": 0.591715976331361,
-      "grad_norm": 0.3321501796878117,
+      "grad_norm": 0.3412337472975462,
       "learning_rate": 8.558255959926533e-06,
-      "loss": 0.753,
+      "loss": 0.7582,
+      "mean_token_accuracy": 0.7714304347299908,
       "step": 100
     },
     {
       "epoch": 0.591715976331361,
-      "eval_loss": 0.7878655195236206,
-      "eval_runtime": 0.782,
-      "eval_samples_per_second": 163.676,
-      "eval_steps_per_second": 5.115,
+      "eval_loss": 0.7885846495628357,
+      "eval_mean_token_accuracy": 0.7482494053174686,
+      "eval_runtime": 2.2356,
+      "eval_samples_per_second": 57.703,
+      "eval_steps_per_second": 2.237,
       "step": 100
     },
     {
       "epoch": 0.621301775147929,
-      "grad_norm": 0.3343545042291716,
+      "grad_norm": 0.35299455458895956,
       "learning_rate": 7.545145128592009e-06,
-      "loss": 0.7569,
+      "loss": 0.759,
+      "mean_token_accuracy": 0.7714738320259433,
       "step": 105
     },
     {
       "epoch": 0.650887573964497,
-      "grad_norm": 0.3444014813426581,
+      "grad_norm": 0.31616289960261323,
       "learning_rate": 6.558227696373617e-06,
-      "loss": 0.768,
+      "loss": 0.7775,
+      "mean_token_accuracy": 0.7664820794613972,
       "step": 110
     },
     {
       "epoch": 0.6804733727810651,
-      "grad_norm": 0.3079534620667556,
+      "grad_norm": 0.307888028457379,
       "learning_rate": 5.608034111526298e-06,
-      "loss": 0.7623,
+      "loss": 0.7636,
+      "mean_token_accuracy": 0.7704197310032757,
       "step": 115
     },
     {
       "epoch": 0.7100591715976331,
-      "grad_norm": 0.2980052178831321,
+      "grad_norm": 0.2995694223287612,
       "learning_rate": 4.704702977392914e-06,
-      "loss": 0.7513,
+      "loss": 0.7511,
+      "mean_token_accuracy": 0.7742426406160382,
       "step": 120
     },
     {
       "epoch": 0.7396449704142012,
-      "grad_norm": 0.3422464501794574,
+      "grad_norm": 0.31585565817231076,
       "learning_rate": 3.857872873103322e-06,
-      "loss": 0.7537,
+      "loss": 0.7595,
+      "mean_token_accuracy": 0.7717561342325581,
       "step": 125
     },
     {
       "epoch": 0.7692307692307693,
-      "grad_norm": 0.31159482318154197,
+      "grad_norm": 0.32069400376048846,
       "learning_rate": 3.0765795095517026e-06,
-      "loss": 0.7555,
+      "loss": 0.7624,
+      "mean_token_accuracy": 0.7705784584578896,
       "step": 130
     },
     {
       "epoch": 0.7988165680473372,
-      "grad_norm": 0.3216694034093574,
+      "grad_norm": 0.30337427328877,
       "learning_rate": 2.369159318001937e-06,
-      "loss": 0.7583,
+      "loss": 0.7684,
+      "mean_token_accuracy": 0.7688614612364428,
       "step": 135
     },
     {
       "epoch": 0.8284023668639053,
-      "grad_norm": 0.29895838755514975,
+      "grad_norm": 0.2931589273225087,
       "learning_rate": 1.743160500034443e-06,
-      "loss": 0.7498,
+      "loss": 0.7597,
+      "mean_token_accuracy": 0.7717145410630937,
       "step": 140
     },
     {
       "epoch": 0.8579881656804734,
-      "grad_norm": 0.3121733796733359,
+      "grad_norm": 0.31239776071011455,
       "learning_rate": 1.2052624879351105e-06,
-      "loss": 0.7565,
+      "loss": 0.7595,
+      "mean_token_accuracy": 0.7714656640175818,
       "step": 145
     },
     {
       "epoch": 0.8875739644970414,
-      "grad_norm": 0.29445775165969484,
+      "grad_norm": 0.2909480842066185,
       "learning_rate": 7.612046748871327e-07,
-      "loss": 0.7665,
+      "loss": 0.7505,
+      "mean_token_accuracy": 0.7739669991508189,
       "step": 150
     },
     {
       "epoch": 0.9171597633136095,
-      "grad_norm": 0.2963830850309388,
+      "grad_norm": 0.279458821658001,
       "learning_rate": 4.1572517541747294e-07,
-      "loss": 0.7613,
+      "loss": 0.741,
+      "mean_token_accuracy": 0.7767323569336231,
       "step": 155
     },
     {
       "epoch": 0.9467455621301775,
-      "grad_norm": 0.2863988168086613,
+      "grad_norm": 0.29648687488650516,
       "learning_rate": 1.7251026952640583e-07,
-      "loss": 0.7606,
+      "loss": 0.754,
+      "mean_token_accuracy": 0.7732043043845805,
       "step": 160
     },
     {
       "epoch": 0.9763313609467456,
-      "grad_norm": 0.2760424898927774,
+      "grad_norm": 0.2773995248389471,
       "learning_rate": 3.4155069933301535e-08,
-      "loss": 0.7445,
+      "loss": 0.7467,
+      "mean_token_accuracy": 0.7749679952859235,
       "step": 165
     },
     {
       "epoch": 1.0,
+      "mean_token_accuracy": 0.7705127247007673,
       "step": 169,
-      "total_flos": 76916824473600.0,
-      "train_loss": 0.8026207946461333,
-      "train_runtime": 498.3301,
-      "train_samples_per_second": 43.365,
-      "train_steps_per_second": 0.339
+      "total_flos": 76973799899136.0,
+      "train_loss": 0.8025032683942445,
+      "train_runtime": 710.0203,
+      "train_samples_per_second": 30.454,
+      "train_steps_per_second": 0.238
     }
   ],
   "logging_steps": 5,
@@ -274,8 +309,8 @@
       "attributes": {}
     }
   },
-  "total_flos": 76916824473600.0,
-  "train_batch_size": 4,
+  "total_flos": 76973799899136.0,
+  "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
 }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:237a15eca0faf2c43d81e67abeea627888a117b938f040d9bf15c055f4cd8fe2
-size 7352
+oid sha256:5e7043d0dcec036604186a342c05b4b1b9b359afc64da2f56ce61db35f0a4ae1
+size 7416