Model save
Browse files- README.md +1 -1
- all_results.json +4 -4
- model.safetensors +1 -1
- runs/Jan31_21-54-01_ip-26-0-160-192/events.out.tfevents.1738360500.ip-26-0-160-192.374216.0 +3 -0
- runs/Jan31_22-01-36_ip-26-0-160-192/events.out.tfevents.1738360955.ip-26-0-160-192.377629.0 +3 -0
- runs/Jan31_22-07-53_ip-26-0-161-78/events.out.tfevents.1738361338.ip-26-0-161-78.2097139.0 +3 -0
- train_results.json +4 -4
- trainer_state.json +651 -509
- training_args.bin +2 -2
README.md
CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
|
|
27 |
|
28 |
## Training procedure
|
29 |
|
30 |
-
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/huggingface/huggingface/runs/
|
31 |
|
32 |
|
33 |
This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
|
|
|
27 |
|
28 |
## Training procedure
|
29 |
|
30 |
+
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/huggingface/huggingface/runs/0f5fvgp8)
|
31 |
|
32 |
|
33 |
This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
|
all_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
"total_flos": 0.0,
|
3 |
-
"train_loss": 0.
|
4 |
-
"train_runtime":
|
5 |
"train_samples": 72441,
|
6 |
-
"train_samples_per_second":
|
7 |
-
"train_steps_per_second": 0.
|
8 |
}
|
|
|
1 |
{
|
2 |
"total_flos": 0.0,
|
3 |
+
"train_loss": 0.007009532302370239,
|
4 |
+
"train_runtime": 74639.7368,
|
5 |
"train_samples": 72441,
|
6 |
+
"train_samples_per_second": 0.971,
|
7 |
+
"train_steps_per_second": 0.009
|
8 |
}
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 3554214752
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:120de27679fadf851221dcd9419b615665e01c651c2dbeb646d254688d67fe3e
|
3 |
size 3554214752
|
runs/Jan31_21-54-01_ip-26-0-160-192/events.out.tfevents.1738360500.ip-26-0-160-192.374216.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0199d7a43ae9d04a22f332fe814b32a948d16d775301e28eb6103cd52baca08a
|
3 |
+
size 5622
|
runs/Jan31_22-01-36_ip-26-0-160-192/events.out.tfevents.1738360955.ip-26-0-160-192.377629.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1cb2239cee3f4c66347b42e07f91165a59938dbac39eca3e2df5c725d0777656
|
3 |
+
size 16622
|
runs/Jan31_22-07-53_ip-26-0-161-78/events.out.tfevents.1738361338.ip-26-0-161-78.2097139.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5f1c7e18d5bf594fedc1219c195bfb0304b20df5f799da8b1abec366ace1898a
|
3 |
+
size 16621
|
train_results.json
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
{
|
2 |
"total_flos": 0.0,
|
3 |
-
"train_loss": 0.
|
4 |
-
"train_runtime":
|
5 |
"train_samples": 72441,
|
6 |
-
"train_samples_per_second":
|
7 |
-
"train_steps_per_second": 0.
|
8 |
}
|
|
|
1 |
{
|
2 |
"total_flos": 0.0,
|
3 |
+
"train_loss": 0.007009532302370239,
|
4 |
+
"train_runtime": 74639.7368,
|
5 |
"train_samples": 72441,
|
6 |
+
"train_samples_per_second": 0.971,
|
7 |
+
"train_steps_per_second": 0.009
|
8 |
}
|
trainer_state.json
CHANGED
@@ -9,935 +9,1077 @@
|
|
9 |
"is_world_process_zero": true,
|
10 |
"log_history": [
|
11 |
{
|
12 |
-
"completion_length":
|
13 |
"epoch": 0.015460430959512996,
|
14 |
-
"grad_norm": 0.
|
15 |
-
"kl": 0.
|
16 |
"learning_rate": 3.0769230769230774e-06,
|
17 |
"loss": 0.0,
|
18 |
-
"reward": 0.
|
19 |
-
"reward_std": 0.
|
20 |
-
"rewards/accuracy_reward": 0.
|
|
|
21 |
"rewards/format_reward": 0.0,
|
|
|
22 |
"step": 10
|
23 |
},
|
24 |
{
|
25 |
-
"completion_length":
|
26 |
"epoch": 0.03092086191902599,
|
27 |
-
"grad_norm": 0.
|
28 |
-
"kl": 0.
|
29 |
"learning_rate": 6.153846153846155e-06,
|
30 |
"loss": 0.0002,
|
31 |
-
"reward":
|
32 |
-
"reward_std": 0.
|
33 |
-
"rewards/accuracy_reward": 0.
|
|
|
34 |
"rewards/format_reward": 0.0,
|
|
|
35 |
"step": 20
|
36 |
},
|
37 |
{
|
38 |
-
"completion_length":
|
39 |
"epoch": 0.04638129287853899,
|
40 |
-
"grad_norm": 0.
|
41 |
-
"kl": 0.
|
42 |
"learning_rate": 9.230769230769232e-06,
|
43 |
-
"loss": 0.
|
44 |
-
"reward":
|
45 |
-
"reward_std": 0.
|
46 |
-
"rewards/accuracy_reward": 0.
|
|
|
47 |
"rewards/format_reward": 0.0,
|
|
|
48 |
"step": 30
|
49 |
},
|
50 |
{
|
51 |
-
"completion_length":
|
52 |
"epoch": 0.06184172383805198,
|
53 |
-
"grad_norm": 0.
|
54 |
-
"kl": 0.
|
55 |
"learning_rate": 1.230769230769231e-05,
|
56 |
-
"loss": 0.
|
57 |
-
"reward":
|
58 |
-
"reward_std": 0.
|
59 |
-
"rewards/accuracy_reward": 0.
|
|
|
60 |
"rewards/format_reward": 0.0,
|
|
|
61 |
"step": 40
|
62 |
},
|
63 |
{
|
64 |
-
"completion_length":
|
65 |
"epoch": 0.07730215479756498,
|
66 |
-
"grad_norm": 0.
|
67 |
-
"kl": 0.
|
68 |
"learning_rate": 1.5384615384615387e-05,
|
69 |
"loss": 0.0009,
|
70 |
-
"reward":
|
71 |
-
"reward_std": 0.
|
72 |
-
"rewards/accuracy_reward": 0.
|
|
|
73 |
"rewards/format_reward": 0.0,
|
|
|
74 |
"step": 50
|
75 |
},
|
76 |
{
|
77 |
-
"completion_length":
|
78 |
"epoch": 0.09276258575707798,
|
79 |
-
"grad_norm": 0.
|
80 |
-
"kl": 0.
|
81 |
"learning_rate": 1.8461538461538465e-05,
|
82 |
-
"loss": 0.
|
83 |
-
"reward":
|
84 |
-
"reward_std": 0.
|
85 |
-
"rewards/accuracy_reward": 0.
|
|
|
86 |
"rewards/format_reward": 0.0,
|
|
|
87 |
"step": 60
|
88 |
},
|
89 |
{
|
90 |
-
"completion_length":
|
91 |
"epoch": 0.10822301671659097,
|
92 |
-
"grad_norm": 0.
|
93 |
-
"kl": 0.
|
94 |
"learning_rate": 1.999634547413886e-05,
|
95 |
-
"loss": 0.
|
96 |
-
"reward":
|
97 |
-
"reward_std": 0.
|
98 |
-
"rewards/accuracy_reward": 0.
|
|
|
99 |
"rewards/format_reward": 0.0,
|
|
|
100 |
"step": 70
|
101 |
},
|
102 |
{
|
103 |
-
"completion_length":
|
104 |
"epoch": 0.12368344767610397,
|
105 |
-
"grad_norm": 0.
|
106 |
-
"kl": 0.
|
107 |
"learning_rate": 1.9967125291968495e-05,
|
108 |
-
"loss": 0.
|
109 |
-
"reward":
|
110 |
-
"reward_std": 0.
|
111 |
-
"rewards/accuracy_reward": 0.
|
|
|
112 |
"rewards/format_reward": 0.0,
|
|
|
113 |
"step": 80
|
114 |
},
|
115 |
{
|
116 |
-
"completion_length":
|
117 |
"epoch": 0.13914387863561697,
|
118 |
-
"grad_norm": 0.
|
119 |
-
"kl": 0.
|
120 |
"learning_rate": 1.990877034074683e-05,
|
121 |
-
"loss": 0.
|
122 |
-
"reward":
|
123 |
-
"reward_std": 0.
|
124 |
-
"rewards/accuracy_reward": 0.
|
|
|
125 |
"rewards/format_reward": 0.0,
|
|
|
126 |
"step": 90
|
127 |
},
|
128 |
{
|
129 |
-
"completion_length":
|
130 |
"epoch": 0.15460430959512997,
|
131 |
-
"grad_norm": 0.
|
132 |
-
"kl": 0.
|
133 |
"learning_rate": 1.9821451197042028e-05,
|
134 |
-
"loss": 0.
|
135 |
-
"reward":
|
136 |
-
"reward_std": 0.
|
137 |
-
"rewards/accuracy_reward": 0.
|
|
|
138 |
"rewards/format_reward": 0.0,
|
|
|
139 |
"step": 100
|
140 |
},
|
141 |
{
|
142 |
"epoch": 0.15460430959512997,
|
143 |
-
"eval_completion_length":
|
144 |
-
"eval_kl": 0.
|
145 |
-
"eval_loss": 0.
|
146 |
-
"eval_reward":
|
147 |
-
"eval_reward_std": 0.
|
148 |
-
"eval_rewards/accuracy_reward": 0.
|
|
|
149 |
"eval_rewards/format_reward": 0.0,
|
150 |
-
"
|
151 |
-
"
|
152 |
-
"
|
|
|
153 |
"step": 100
|
154 |
},
|
155 |
{
|
156 |
-
"completion_length":
|
157 |
"epoch": 0.17006474055464296,
|
158 |
-
"grad_norm": 0.
|
159 |
-
"kl":
|
160 |
"learning_rate": 1.9705423102261324e-05,
|
161 |
-
"loss": 0.
|
162 |
-
"reward":
|
163 |
-
"reward_std": 0.
|
164 |
-
"rewards/accuracy_reward": 0.
|
|
|
165 |
"rewards/format_reward": 0.0,
|
|
|
166 |
"step": 110
|
167 |
},
|
168 |
{
|
169 |
-
"completion_length":
|
170 |
"epoch": 0.18552517151415596,
|
171 |
-
"grad_norm": 0.
|
172 |
-
"kl": 0.
|
173 |
"learning_rate": 1.956102521655831e-05,
|
174 |
-
"loss": 0.
|
175 |
-
"reward":
|
176 |
-
"reward_std": 0.
|
177 |
-
"rewards/accuracy_reward": 0.
|
|
|
178 |
"rewards/format_reward": 0.0,
|
|
|
179 |
"step": 120
|
180 |
},
|
181 |
{
|
182 |
-
"completion_length":
|
183 |
"epoch": 0.20098560247366895,
|
184 |
-
"grad_norm": 0.
|
185 |
-
"kl": 0.
|
186 |
"learning_rate": 1.9388679627438486e-05,
|
187 |
-
"loss": 0.
|
188 |
-
"reward":
|
189 |
-
"reward_std": 0.
|
190 |
-
"rewards/accuracy_reward": 0.
|
|
|
191 |
"rewards/format_reward": 0.0,
|
|
|
192 |
"step": 130
|
193 |
},
|
194 |
{
|
195 |
-
"completion_length":
|
196 |
"epoch": 0.21644603343318194,
|
197 |
-
"grad_norm": 0.
|
198 |
-
"kl":
|
199 |
"learning_rate": 1.9188890115960967e-05,
|
200 |
-
"loss": 0.
|
201 |
-
"reward":
|
202 |
-
"reward_std": 0.
|
203 |
-
"rewards/accuracy_reward": 0.
|
|
|
204 |
"rewards/format_reward": 0.0,
|
|
|
205 |
"step": 140
|
206 |
},
|
207 |
{
|
208 |
-
"completion_length":
|
209 |
"epoch": 0.23190646439269494,
|
210 |
-
"grad_norm": 0.
|
211 |
-
"kl": 0.
|
212 |
"learning_rate": 1.8962240684142923e-05,
|
213 |
-
"loss": 0.
|
214 |
-
"reward":
|
215 |
-
"reward_std": 0.
|
216 |
-
"rewards/accuracy_reward": 0.
|
|
|
217 |
"rewards/format_reward": 0.0,
|
|
|
218 |
"step": 150
|
219 |
},
|
220 |
{
|
221 |
-
"completion_length":
|
222 |
"epoch": 0.24736689535220793,
|
223 |
-
"grad_norm": 0.
|
224 |
-
"kl": 0.
|
225 |
"learning_rate": 1.8709393847871146e-05,
|
226 |
-
"loss": 0.
|
227 |
-
"reward":
|
228 |
-
"reward_std": 0.
|
229 |
-
"rewards/accuracy_reward": 0.
|
|
|
230 |
"rewards/format_reward": 0.0,
|
|
|
231 |
"step": 160
|
232 |
},
|
233 |
{
|
234 |
-
"completion_length":
|
235 |
"epoch": 0.26282732631172095,
|
236 |
-
"grad_norm": 0.
|
237 |
-
"kl": 0.
|
238 |
"learning_rate": 1.8431088700310846e-05,
|
239 |
-
"loss": 0.
|
240 |
-
"reward":
|
241 |
-
"reward_std": 0.
|
242 |
-
"rewards/accuracy_reward": 0.
|
|
|
243 |
"rewards/format_reward": 0.0,
|
|
|
244 |
"step": 170
|
245 |
},
|
246 |
{
|
247 |
-
"completion_length":
|
248 |
"epoch": 0.27828775727123395,
|
249 |
-
"grad_norm": 0.
|
250 |
-
"kl": 0.
|
251 |
"learning_rate": 1.8128138751472432e-05,
|
252 |
-
"loss": 0.
|
253 |
-
"reward":
|
254 |
-
"reward_std": 0.
|
255 |
-
"rewards/accuracy_reward": 0.
|
|
|
256 |
"rewards/format_reward": 0.0,
|
|
|
257 |
"step": 180
|
258 |
},
|
259 |
{
|
260 |
-
"completion_length":
|
261 |
"epoch": 0.29374818823074694,
|
262 |
-
"grad_norm": 0.
|
263 |
-
"kl": 0.
|
264 |
"learning_rate": 1.780142955025139e-05,
|
265 |
-
"loss": 0.
|
266 |
-
"reward":
|
267 |
-
"reward_std": 0.
|
268 |
-
"rewards/accuracy_reward": 0.
|
|
|
269 |
"rewards/format_reward": 0.0,
|
|
|
270 |
"step": 190
|
271 |
},
|
272 |
{
|
273 |
-
"completion_length":
|
274 |
"epoch": 0.30920861919025994,
|
275 |
-
"grad_norm": 0.
|
276 |
-
"kl": 0.
|
277 |
"learning_rate": 1.745191609589231e-05,
|
278 |
-
"loss": 0.
|
279 |
-
"reward":
|
280 |
-
"reward_std": 0.
|
281 |
-
"rewards/accuracy_reward": 0.
|
|
|
282 |
"rewards/format_reward": 0.0,
|
|
|
283 |
"step": 200
|
284 |
},
|
285 |
{
|
286 |
"epoch": 0.30920861919025994,
|
287 |
-
"eval_completion_length":
|
288 |
-
"eval_kl": 0.
|
289 |
-
"eval_loss": 0.
|
290 |
-
"eval_reward":
|
291 |
-
"eval_reward_std": 0.
|
292 |
-
"eval_rewards/accuracy_reward": 0.
|
|
|
293 |
"eval_rewards/format_reward": 0.0,
|
294 |
-
"
|
295 |
-
"
|
296 |
-
"
|
|
|
297 |
"step": 200
|
298 |
},
|
299 |
{
|
300 |
-
"completion_length":
|
301 |
"epoch": 0.32466905014977293,
|
302 |
-
"grad_norm": 0.
|
303 |
-
"kl": 0.
|
304 |
"learning_rate": 1.7080620046443503e-05,
|
305 |
-
"loss": 0.
|
306 |
-
"reward":
|
307 |
-
"reward_std": 0.
|
308 |
-
"rewards/accuracy_reward": 0.
|
|
|
309 |
"rewards/format_reward": 0.0,
|
|
|
310 |
"step": 210
|
311 |
},
|
312 |
{
|
313 |
-
"completion_length":
|
314 |
"epoch": 0.3401294811092859,
|
315 |
-
"grad_norm": 0.
|
316 |
-
"kl": 0.
|
317 |
"learning_rate": 1.6688626732362192e-05,
|
318 |
-
"loss": 0.
|
319 |
-
"reward":
|
320 |
-
"reward_std": 0.
|
321 |
-
"rewards/accuracy_reward": 0.
|
|
|
322 |
"rewards/format_reward": 0.0,
|
|
|
323 |
"step": 220
|
324 |
},
|
325 |
{
|
326 |
-
"completion_length":
|
327 |
"epoch": 0.3555899120687989,
|
328 |
-
"grad_norm": 0.
|
329 |
-
"kl": 0.
|
330 |
"learning_rate": 1.6277081983999742e-05,
|
331 |
-
"loss": 0.
|
332 |
-
"reward":
|
333 |
-
"reward_std": 0.
|
334 |
-
"rewards/accuracy_reward": 0.
|
|
|
335 |
"rewards/format_reward": 0.0,
|
|
|
336 |
"step": 230
|
337 |
},
|
338 |
{
|
339 |
-
"completion_length":
|
340 |
"epoch": 0.3710503430283119,
|
341 |
-
"grad_norm": 0.
|
342 |
-
"kl": 0.
|
343 |
"learning_rate": 1.5847188782240473e-05,
|
344 |
-
"loss": 0.
|
345 |
-
"reward":
|
346 |
-
"reward_std": 0.
|
347 |
-
"rewards/accuracy_reward": 0.
|
|
|
348 |
"rewards/format_reward": 0.0,
|
|
|
349 |
"step": 240
|
350 |
},
|
351 |
{
|
352 |
-
"completion_length":
|
353 |
"epoch": 0.3865107739878249,
|
354 |
-
"grad_norm": 0.
|
355 |
-
"kl": 0.
|
356 |
"learning_rate": 1.5400203742084508e-05,
|
357 |
-
"loss": 0.
|
358 |
-
"reward":
|
359 |
-
"reward_std": 0.
|
360 |
-
"rewards/accuracy_reward": 0.
|
|
|
361 |
"rewards/format_reward": 0.0,
|
|
|
362 |
"step": 250
|
363 |
},
|
364 |
{
|
365 |
-
"completion_length":
|
366 |
"epoch": 0.4019712049473379,
|
367 |
-
"grad_norm": 0.
|
368 |
-
"kl": 0.
|
369 |
"learning_rate": 1.4937433439453465e-05,
|
370 |
-
"loss": 0.
|
371 |
-
"reward":
|
372 |
-
"reward_std": 0.
|
373 |
-
"rewards/accuracy_reward": 0.
|
|
|
374 |
"rewards/format_reward": 0.0,
|
|
|
375 |
"step": 260
|
376 |
},
|
377 |
{
|
378 |
-
"completion_length":
|
379 |
"epoch": 0.4174316359068509,
|
380 |
-
"grad_norm": 0.
|
381 |
-
"kl": 0.
|
382 |
"learning_rate": 1.4460230591956097e-05,
|
383 |
-
"loss": 0.
|
384 |
-
"reward":
|
385 |
-
"reward_std": 0.
|
386 |
-
"rewards/accuracy_reward": 0.
|
|
|
387 |
"rewards/format_reward": 0.0,
|
|
|
388 |
"step": 270
|
389 |
},
|
390 |
{
|
391 |
-
"completion_length":
|
392 |
"epoch": 0.4328920668663639,
|
393 |
-
"grad_norm": 0.
|
394 |
-
"kl": 0.
|
395 |
"learning_rate": 1.3969990104777712e-05,
|
396 |
-
"loss": 0.
|
397 |
-
"reward":
|
398 |
-
"reward_std": 0.
|
399 |
-
"rewards/accuracy_reward": 0.
|
|
|
400 |
"rewards/format_reward": 0.0,
|
|
|
401 |
"step": 280
|
402 |
},
|
403 |
{
|
404 |
-
"completion_length":
|
405 |
"epoch": 0.4483524978258769,
|
406 |
-
"grad_norm": 0.
|
407 |
-
"kl": 0.
|
408 |
"learning_rate": 1.3468144993251735e-05,
|
409 |
-
"loss": 0.
|
410 |
-
"reward":
|
411 |
-
"reward_std": 0.
|
412 |
-
"rewards/accuracy_reward": 0.
|
|
|
413 |
"rewards/format_reward": 0.0,
|
|
|
414 |
"step": 290
|
415 |
},
|
416 |
{
|
417 |
-
"completion_length":
|
418 |
"epoch": 0.4638129287853899,
|
419 |
-
"grad_norm": 0.
|
420 |
-
"kl":
|
421 |
"learning_rate": 1.295616219403197e-05,
|
422 |
-
"loss": 0.
|
423 |
-
"reward":
|
424 |
-
"reward_std": 0.
|
425 |
-
"rewards/accuracy_reward": 0.
|
|
|
426 |
"rewards/format_reward": 0.0,
|
|
|
427 |
"step": 300
|
428 |
},
|
429 |
{
|
430 |
"epoch": 0.4638129287853899,
|
431 |
-
"eval_completion_length":
|
432 |
-
"eval_kl":
|
433 |
-
"eval_loss": 0.
|
434 |
-
"eval_reward":
|
435 |
-
"eval_reward_std": 0.
|
436 |
-
"eval_rewards/accuracy_reward": 0.
|
|
|
437 |
"eval_rewards/format_reward": 0.0,
|
438 |
-
"
|
439 |
-
"
|
440 |
-
"
|
|
|
441 |
"step": 300
|
442 |
},
|
443 |
{
|
444 |
-
"completion_length":
|
445 |
"epoch": 0.47927335974490287,
|
446 |
-
"grad_norm": 0.
|
447 |
-
"kl": 0.
|
448 |
"learning_rate": 1.2435538277109919e-05,
|
449 |
-
"loss": 0.
|
450 |
-
"reward":
|
451 |
-
"reward_std": 0.
|
452 |
-
"rewards/accuracy_reward": 0.
|
|
|
453 |
"rewards/format_reward": 0.0,
|
|
|
454 |
"step": 310
|
455 |
},
|
456 |
{
|
457 |
-
"completion_length":
|
458 |
"epoch": 0.49473379070441587,
|
459 |
-
"grad_norm": 0.
|
460 |
-
"kl": 0.
|
461 |
"learning_rate": 1.19077950712113e-05,
|
462 |
-
"loss": 0.
|
463 |
-
"reward":
|
464 |
-
"reward_std": 0.
|
465 |
-
"rewards/accuracy_reward": 0.
|
|
|
466 |
"rewards/format_reward": 0.0,
|
|
|
467 |
"step": 320
|
468 |
},
|
469 |
{
|
470 |
-
"completion_length":
|
471 |
"epoch": 0.5101942216639289,
|
472 |
-
"grad_norm": 0.
|
473 |
-
"kl": 0.
|
474 |
"learning_rate": 1.137447521535908e-05,
|
475 |
-
"loss": 0.
|
476 |
-
"reward":
|
477 |
-
"reward_std": 0.
|
478 |
-
"rewards/accuracy_reward": 0.
|
|
|
479 |
"rewards/format_reward": 0.0,
|
|
|
480 |
"step": 330
|
481 |
},
|
482 |
{
|
483 |
-
"completion_length":
|
484 |
"epoch": 0.5256546526234419,
|
485 |
-
"grad_norm": 0.
|
486 |
-
"kl": 0.
|
487 |
"learning_rate": 1.0837137649606241e-05,
|
488 |
-
"loss": 0.
|
489 |
-
"reward":
|
490 |
-
"reward_std": 0.
|
491 |
-
"rewards/accuracy_reward": 0.
|
|
|
492 |
"rewards/format_reward": 0.0,
|
|
|
493 |
"step": 340
|
494 |
},
|
495 |
{
|
496 |
-
"completion_length":
|
497 |
"epoch": 0.5411150835829549,
|
498 |
-
"grad_norm": 0.
|
499 |
-
"kl": 0.
|
500 |
"learning_rate": 1.0297353058119209e-05,
|
501 |
-
"loss": 0.
|
502 |
-
"reward":
|
503 |
-
"reward_std": 0.
|
504 |
-
"rewards/accuracy_reward": 0.
|
|
|
505 |
"rewards/format_reward": 0.0,
|
|
|
506 |
"step": 350
|
507 |
},
|
508 |
{
|
509 |
-
"completion_length":
|
510 |
"epoch": 0.5565755145424679,
|
511 |
-
"grad_norm": 0.
|
512 |
-
"kl": 0.
|
513 |
"learning_rate": 9.756699277932196e-06,
|
514 |
-
"loss": 0.
|
515 |
-
"reward":
|
516 |
-
"reward_std": 0.
|
517 |
-
"rewards/accuracy_reward": 0.
|
|
|
518 |
"rewards/format_reward": 0.0,
|
|
|
519 |
"step": 360
|
520 |
},
|
521 |
{
|
522 |
-
"completion_length":
|
523 |
"epoch": 0.5720359455019809,
|
524 |
-
"grad_norm": 0.
|
525 |
-
"kl": 0.
|
526 |
"learning_rate": 9.216756686793163e-06,
|
527 |
-
"loss": 0.
|
528 |
-
"reward":
|
529 |
-
"reward_std": 0.
|
530 |
-
"rewards/accuracy_reward": 0.
|
|
|
531 |
"rewards/format_reward": 0.0,
|
|
|
532 |
"step": 370
|
533 |
},
|
534 |
{
|
535 |
-
"completion_length":
|
536 |
"epoch": 0.5874963764614939,
|
537 |
-
"grad_norm": 0.
|
538 |
-
"kl": 0.
|
539 |
"learning_rate": 8.67910358358298e-06,
|
540 |
-
"loss": 0.
|
541 |
-
"reward":
|
542 |
-
"reward_std": 0.
|
543 |
-
"rewards/accuracy_reward": 0.
|
|
|
544 |
"rewards/format_reward": 0.0,
|
|
|
545 |
"step": 380
|
546 |
},
|
547 |
{
|
548 |
-
"completion_length":
|
549 |
"epoch": 0.6029568074210069,
|
550 |
-
"grad_norm": 0.
|
551 |
-
"kl": 0.
|
552 |
"learning_rate": 8.145311574811325e-06,
|
553 |
-
"loss": 0.
|
554 |
-
"reward":
|
555 |
-
"reward_std": 0.
|
556 |
-
"rewards/accuracy_reward": 0.
|
|
|
557 |
"rewards/format_reward": 0.0,
|
|
|
558 |
"step": 390
|
559 |
},
|
560 |
{
|
561 |
-
"completion_length":
|
562 |
"epoch": 0.6184172383805199,
|
563 |
-
"grad_norm": 0.
|
564 |
-
"kl": 0.
|
565 |
"learning_rate": 7.616940980675004e-06,
|
566 |
-
"loss": 0.
|
567 |
-
"reward":
|
568 |
-
"reward_std": 0.
|
569 |
-
"rewards/accuracy_reward": 0.
|
|
|
570 |
"rewards/format_reward": 0.0,
|
|
|
571 |
"step": 400
|
572 |
},
|
573 |
{
|
574 |
"epoch": 0.6184172383805199,
|
575 |
-
"eval_completion_length":
|
576 |
-
"eval_kl": 0.
|
577 |
-
"eval_loss": 0.
|
578 |
-
"eval_reward":
|
579 |
-
"eval_reward_std": 0.
|
580 |
-
"eval_rewards/accuracy_reward": 0.
|
|
|
581 |
"eval_rewards/format_reward": 0.0,
|
582 |
-
"
|
583 |
-
"
|
584 |
-
"
|
|
|
585 |
"step": 400
|
586 |
},
|
587 |
{
|
588 |
-
"completion_length":
|
589 |
"epoch": 0.6338776693400329,
|
590 |
-
"grad_norm": 0.
|
591 |
-
"kl": 0.
|
592 |
"learning_rate": 7.095536274107046e-06,
|
593 |
-
"loss": 0.
|
594 |
-
"reward":
|
595 |
-
"reward_std": 0.
|
596 |
-
"rewards/accuracy_reward": 0.
|
|
|
597 |
"rewards/format_reward": 0.0,
|
|
|
598 |
"step": 410
|
599 |
},
|
600 |
{
|
601 |
-
"completion_length":
|
602 |
"epoch": 0.6493381002995459,
|
603 |
-
"grad_norm": 0.
|
604 |
-
"kl": 0.
|
605 |
"learning_rate": 6.58262156614881e-06,
|
606 |
-
"loss": 0.
|
607 |
-
"reward":
|
608 |
-
"reward_std": 0.
|
609 |
-
"rewards/accuracy_reward": 0.
|
|
|
610 |
"rewards/format_reward": 0.0,
|
|
|
611 |
"step": 420
|
612 |
},
|
613 |
{
|
614 |
-
"completion_length":
|
615 |
"epoch": 0.6647985312590589,
|
616 |
-
"grad_norm": 0.
|
617 |
-
"kl": 0.
|
618 |
"learning_rate": 6.079696150841634e-06,
|
619 |
-
"loss": 0.
|
620 |
-
"reward":
|
621 |
-
"reward_std": 0.
|
622 |
-
"rewards/accuracy_reward": 0.
|
|
|
623 |
"rewards/format_reward": 0.0,
|
|
|
624 |
"step": 430
|
625 |
},
|
626 |
{
|
627 |
-
"completion_length":
|
628 |
"epoch": 0.6802589622185718,
|
629 |
-
"grad_norm": 0.
|
630 |
-
"kl": 0.
|
631 |
"learning_rate": 5.588230122660672e-06,
|
632 |
-
"loss": 0.
|
633 |
-
"reward":
|
634 |
-
"reward_std": 0.
|
635 |
-
"rewards/accuracy_reward": 0.
|
|
|
636 |
"rewards/format_reward": 0.0,
|
|
|
637 |
"step": 440
|
638 |
},
|
639 |
{
|
640 |
-
"completion_length":
|
641 |
"epoch": 0.6957193931780848,
|
642 |
-
"grad_norm": 0.
|
643 |
-
"kl": 0.
|
644 |
"learning_rate": 5.109660079301668e-06,
|
645 |
-
"loss": 0.
|
646 |
-
"reward":
|
647 |
-
"reward_std": 0.
|
648 |
-
"rewards/accuracy_reward": 0.
|
|
|
649 |
"rewards/format_reward": 0.0,
|
|
|
650 |
"step": 450
|
651 |
},
|
652 |
{
|
653 |
-
"completion_length":
|
654 |
"epoch": 0.7111798241375978,
|
655 |
-
"grad_norm": 0.
|
656 |
-
"kl": 0.
|
657 |
"learning_rate": 4.64538492238166e-06,
|
658 |
-
"loss": 0.
|
659 |
-
"reward":
|
660 |
-
"reward_std": 0.
|
661 |
-
"rewards/accuracy_reward": 0.
|
|
|
662 |
"rewards/format_reward": 0.0,
|
|
|
663 |
"step": 460
|
664 |
},
|
665 |
{
|
666 |
-
"completion_length":
|
667 |
"epoch": 0.7266402550971108,
|
668 |
-
"grad_norm": 0.
|
669 |
-
"kl": 0.
|
670 |
"learning_rate": 4.196761768328599e-06,
|
671 |
-
"loss": 0.
|
672 |
-
"reward":
|
673 |
-
"reward_std": 0.
|
674 |
-
"rewards/accuracy_reward": 0.
|
|
|
675 |
"rewards/format_reward": 0.0,
|
|
|
676 |
"step": 470
|
677 |
},
|
678 |
{
|
679 |
-
"completion_length":
|
680 |
"epoch": 0.7421006860566238,
|
681 |
-
"grad_norm": 0.
|
682 |
-
"kl": 0.
|
683 |
"learning_rate": 3.7651019814126656e-06,
|
684 |
-
"loss": 0.
|
685 |
-
"reward":
|
686 |
-
"reward_std": 0.
|
687 |
-
"rewards/accuracy_reward": 0.
|
|
|
688 |
"rewards/format_reward": 0.0,
|
|
|
689 |
"step": 480
|
690 |
},
|
691 |
{
|
692 |
-
"completion_length":
|
693 |
"epoch": 0.7575611170161368,
|
694 |
-
"grad_norm": 0.
|
695 |
-
"kl":
|
696 |
"learning_rate": 3.3516673405151546e-06,
|
697 |
-
"loss": 0.
|
698 |
-
"reward":
|
699 |
-
"reward_std": 0.
|
700 |
-
"rewards/accuracy_reward": 0.
|
|
|
701 |
"rewards/format_reward": 0.0,
|
|
|
702 |
"step": 490
|
703 |
},
|
704 |
{
|
705 |
-
"completion_length":
|
706 |
"epoch": 0.7730215479756498,
|
707 |
-
"grad_norm": 0.
|
708 |
-
"kl": 0.
|
709 |
"learning_rate": 2.957666350839663e-06,
|
710 |
-
"loss": 0.
|
711 |
-
"reward":
|
712 |
-
"reward_std": 0.
|
713 |
-
"rewards/accuracy_reward": 0.
|
|
|
714 |
"rewards/format_reward": 0.0,
|
|
|
715 |
"step": 500
|
716 |
},
|
717 |
{
|
718 |
"epoch": 0.7730215479756498,
|
719 |
-
"eval_completion_length":
|
720 |
-
"eval_kl": 0.
|
721 |
-
"eval_loss": 0.
|
722 |
-
"eval_reward":
|
723 |
-
"eval_reward_std": 0.
|
724 |
-
"eval_rewards/accuracy_reward": 0.
|
|
|
725 |
"eval_rewards/format_reward": 0.0,
|
726 |
-
"
|
727 |
-
"
|
728 |
-
"
|
|
|
729 |
"step": 500
|
730 |
},
|
731 |
{
|
732 |
-
"completion_length":
|
733 |
"epoch": 0.7884819789351628,
|
734 |
-
"grad_norm": 0.
|
735 |
-
"kl": 0.
|
736 |
"learning_rate": 2.5842507113469307e-06,
|
737 |
-
"loss": 0.
|
738 |
-
"reward":
|
739 |
-
"reward_std": 0.
|
740 |
-
"rewards/accuracy_reward": 0.
|
|
|
741 |
"rewards/format_reward": 0.0,
|
|
|
742 |
"step": 510
|
743 |
},
|
744 |
{
|
745 |
-
"completion_length":
|
746 |
"epoch": 0.8039424098946758,
|
747 |
-
"grad_norm": 0.
|
748 |
-
"kl": 0.
|
749 |
"learning_rate": 2.2325119482391466e-06,
|
750 |
-
"loss": 0.
|
751 |
-
"reward":
|
752 |
-
"reward_std": 0.
|
753 |
-
"rewards/accuracy_reward": 0.
|
|
|
754 |
"rewards/format_reward": 0.0,
|
|
|
755 |
"step": 520
|
756 |
},
|
757 |
{
|
758 |
-
"completion_length":
|
759 |
"epoch": 0.8194028408541888,
|
760 |
-
"grad_norm": 0.
|
761 |
-
"kl": 0.
|
762 |
"learning_rate": 1.9034782243345074e-06,
|
763 |
-
"loss": 0.
|
764 |
-
"reward":
|
765 |
-
"reward_std": 0.
|
766 |
-
"rewards/accuracy_reward": 0.
|
|
|
767 |
"rewards/format_reward": 0.0,
|
|
|
768 |
"step": 530
|
769 |
},
|
770 |
{
|
771 |
-
"completion_length":
|
772 |
"epoch": 0.8348632718137018,
|
773 |
-
"grad_norm": 0.
|
774 |
-
"kl": 0.
|
775 |
"learning_rate": 1.5981113336584041e-06,
|
776 |
-
"loss": 0.
|
777 |
-
"reward":
|
778 |
-
"reward_std": 0.
|
779 |
-
"rewards/accuracy_reward": 0.
|
|
|
780 |
"rewards/format_reward": 0.0,
|
|
|
781 |
"step": 540
|
782 |
},
|
783 |
{
|
784 |
-
"completion_length":
|
785 |
"epoch": 0.8503237027732148,
|
786 |
-
"grad_norm": 0.
|
787 |
-
"kl": 0.
|
788 |
"learning_rate": 1.3173038900362977e-06,
|
789 |
-
"loss": 0.
|
790 |
-
"reward":
|
791 |
-
"reward_std": 0.
|
792 |
-
"rewards/accuracy_reward": 0.
|
|
|
793 |
"rewards/format_reward": 0.0,
|
|
|
794 |
"step": 550
|
795 |
},
|
796 |
{
|
797 |
-
"completion_length":
|
798 |
"epoch": 0.8657841337327278,
|
799 |
-
"grad_norm": 0.
|
800 |
-
"kl": 0.
|
801 |
"learning_rate": 1.0618767179063416e-06,
|
802 |
-
"loss": 0.
|
803 |
-
"reward":
|
804 |
-
"reward_std": 0.
|
805 |
-
"rewards/accuracy_reward": 0.
|
|
|
806 |
"rewards/format_reward": 0.0,
|
|
|
807 |
"step": 560
|
808 |
},
|
809 |
{
|
810 |
-
"completion_length":
|
811 |
"epoch": 0.8812445646922408,
|
812 |
-
"grad_norm": 0.
|
813 |
-
"kl": 0.
|
814 |
"learning_rate": 8.325764529785851e-07,
|
815 |
-
"loss": 0.
|
816 |
-
"reward":
|
817 |
-
"reward_std": 0.
|
818 |
-
"rewards/accuracy_reward": 0.
|
|
|
819 |
"rewards/format_reward": 0.0,
|
|
|
820 |
"step": 570
|
821 |
},
|
822 |
{
|
823 |
-
"completion_length":
|
824 |
"epoch": 0.8967049956517538,
|
825 |
-
"grad_norm": 0.
|
826 |
-
"kl": 0.
|
827 |
"learning_rate": 6.300733597542086e-07,
|
828 |
-
"loss": 0.
|
829 |
-
"reward":
|
830 |
-
"reward_std": 0.
|
831 |
-
"rewards/accuracy_reward": 0.
|
|
|
832 |
"rewards/format_reward": 0.0,
|
|
|
833 |
"step": 580
|
834 |
},
|
835 |
{
|
836 |
-
"completion_length":
|
837 |
"epoch": 0.9121654266112668,
|
838 |
-
"grad_norm": 0.
|
839 |
-
"kl": 0.
|
840 |
"learning_rate": 4.549593722844492e-07,
|
841 |
-
"loss": 0.
|
842 |
-
"reward":
|
843 |
-
"reward_std": 0.
|
844 |
-
"rewards/accuracy_reward": 0.
|
|
|
845 |
"rewards/format_reward": 0.0,
|
|
|
846 |
"step": 590
|
847 |
},
|
848 |
{
|
849 |
-
"completion_length":
|
850 |
"epoch": 0.9276258575707798,
|
851 |
-
"grad_norm": 0.
|
852 |
-
"kl": 0.
|
853 |
"learning_rate": 3.0774636389618196e-07,
|
854 |
-
"loss": 0.
|
855 |
-
"reward":
|
856 |
-
"reward_std": 0.
|
857 |
-
"rewards/accuracy_reward": 0.
|
|
|
858 |
"rewards/format_reward": 0.0,
|
|
|
859 |
"step": 600
|
860 |
},
|
861 |
{
|
862 |
"epoch": 0.9276258575707798,
|
863 |
-
"eval_completion_length":
|
864 |
-
"eval_kl": 0.
|
865 |
-
"eval_loss": 0.
|
866 |
-
"eval_reward":
|
867 |
-
"eval_reward_std": 0.
|
868 |
-
"eval_rewards/accuracy_reward": 0.
|
|
|
869 |
"eval_rewards/format_reward": 0.0,
|
870 |
-
"
|
871 |
-
"
|
872 |
-
"
|
|
|
873 |
"step": 600
|
874 |
},
|
875 |
{
|
876 |
-
"completion_length":
|
877 |
"epoch": 0.9430862885302927,
|
878 |
-
"grad_norm": 0.
|
879 |
-
"kl": 0.
|
880 |
"learning_rate": 1.8886465094192895e-07,
|
881 |
-
"loss": 0.
|
882 |
-
"reward":
|
883 |
-
"reward_std": 0.
|
884 |
-
"rewards/accuracy_reward": 0.
|
|
|
885 |
"rewards/format_reward": 0.0,
|
|
|
886 |
"step": 610
|
887 |
},
|
888 |
{
|
889 |
-
"completion_length":
|
890 |
"epoch": 0.9585467194898057,
|
891 |
-
"grad_norm": 0.
|
892 |
-
"kl": 0.
|
893 |
"learning_rate": 9.866173494794462e-08,
|
894 |
-
"loss": 0.
|
895 |
-
"reward":
|
896 |
-
"reward_std": 0.
|
897 |
-
"rewards/accuracy_reward": 0.
|
|
|
898 |
"rewards/format_reward": 0.0,
|
|
|
899 |
"step": 620
|
900 |
},
|
901 |
{
|
902 |
-
"completion_length":
|
903 |
"epoch": 0.9740071504493187,
|
904 |
-
"grad_norm": 0.
|
905 |
-
"kl": 0.
|
906 |
"learning_rate": 3.7401286837214224e-08,
|
907 |
-
"loss": 0.
|
908 |
-
"reward":
|
909 |
-
"reward_std": 0.
|
910 |
-
"rewards/accuracy_reward": 0.
|
|
|
911 |
"rewards/format_reward": 0.0,
|
|
|
912 |
"step": 630
|
913 |
},
|
914 |
{
|
915 |
-
"completion_length":
|
916 |
"epoch": 0.9894675814088317,
|
917 |
-
"grad_norm": 0.
|
918 |
-
"kl": 0.
|
919 |
"learning_rate": 5.262376196544239e-09,
|
920 |
-
"loss": 0.
|
921 |
-
"reward":
|
922 |
-
"reward_std": 0.
|
923 |
-
"rewards/accuracy_reward": 0.
|
|
|
924 |
"rewards/format_reward": 0.0,
|
|
|
925 |
"step": 640
|
926 |
},
|
927 |
{
|
928 |
-
"completion_length":
|
929 |
"epoch": 0.9987438399845395,
|
930 |
-
"kl": 0.
|
931 |
-
"reward":
|
932 |
-
"reward_std": 0.
|
933 |
-
"rewards/accuracy_reward": 0.
|
|
|
934 |
"rewards/format_reward": 0.0,
|
|
|
935 |
"step": 646,
|
936 |
"total_flos": 0.0,
|
937 |
-
"train_loss": 0.
|
938 |
-
"train_runtime":
|
939 |
-
"train_samples_per_second":
|
940 |
-
"train_steps_per_second": 0.
|
941 |
}
|
942 |
],
|
943 |
"logging_steps": 10,
|
@@ -958,7 +1100,7 @@
|
|
958 |
}
|
959 |
},
|
960 |
"total_flos": 0.0,
|
961 |
-
"train_batch_size":
|
962 |
"trial_name": null,
|
963 |
"trial_params": null
|
964 |
}
|
|
|
9 |
"is_world_process_zero": true,
|
10 |
"log_history": [
|
11 |
{
|
12 |
+
"completion_length": 928.2598609924316,
|
13 |
"epoch": 0.015460430959512996,
|
14 |
+
"grad_norm": 0.005825439665555334,
|
15 |
+
"kl": 0.0004873394966125488,
|
16 |
"learning_rate": 3.0769230769230774e-06,
|
17 |
"loss": 0.0,
|
18 |
+
"reward": 0.6422783114481717,
|
19 |
+
"reward_std": 0.6085492318496108,
|
20 |
+
"rewards/accuracy_reward": 0.16450893601868302,
|
21 |
+
"rewards/cosine_scaled_reward": -0.15619643366662786,
|
22 |
"rewards/format_reward": 0.0,
|
23 |
+
"rewards/reasoning_steps_reward": 0.6339658062905074,
|
24 |
"step": 10
|
25 |
},
|
26 |
{
|
27 |
+
"completion_length": 834.5876502990723,
|
28 |
"epoch": 0.03092086191902599,
|
29 |
+
"grad_norm": 0.005061101750122049,
|
30 |
+
"kl": 0.005776357650756836,
|
31 |
"learning_rate": 6.153846153846155e-06,
|
32 |
"loss": 0.0002,
|
33 |
+
"reward": 1.1410260727629065,
|
34 |
+
"reward_std": 0.6005165675655008,
|
35 |
+
"rewards/accuracy_reward": 0.3330357288941741,
|
36 |
+
"rewards/cosine_scaled_reward": 0.021941200397304784,
|
37 |
"rewards/format_reward": 0.0,
|
38 |
+
"rewards/reasoning_steps_reward": 0.786049148067832,
|
39 |
"step": 20
|
40 |
},
|
41 |
{
|
42 |
+
"completion_length": 794.6511512756348,
|
43 |
"epoch": 0.04638129287853899,
|
44 |
+
"grad_norm": 0.004186908324406587,
|
45 |
+
"kl": 0.012411689758300782,
|
46 |
"learning_rate": 9.230769230769232e-06,
|
47 |
+
"loss": 0.0005,
|
48 |
+
"reward": 1.5154079463332892,
|
49 |
+
"reward_std": 0.5513344288803637,
|
50 |
+
"rewards/accuracy_reward": 0.4193080538418144,
|
51 |
+
"rewards/cosine_scaled_reward": 0.14040787946141792,
|
52 |
"rewards/format_reward": 0.0,
|
53 |
+
"rewards/reasoning_steps_reward": 0.9556920122355222,
|
54 |
"step": 30
|
55 |
},
|
56 |
{
|
57 |
+
"completion_length": 823.3263763427734,
|
58 |
"epoch": 0.06184172383805198,
|
59 |
+
"grad_norm": 0.004675533239892946,
|
60 |
+
"kl": 0.0160797119140625,
|
61 |
"learning_rate": 1.230769230769231e-05,
|
62 |
+
"loss": 0.0006,
|
63 |
+
"reward": 1.6104407742619515,
|
64 |
+
"reward_std": 0.5197742725256831,
|
65 |
+
"rewards/accuracy_reward": 0.45156252263113855,
|
66 |
+
"rewards/cosine_scaled_reward": 0.18000916420933208,
|
67 |
"rewards/format_reward": 0.0,
|
68 |
+
"rewards/reasoning_steps_reward": 0.9788690943270921,
|
69 |
"step": 40
|
70 |
},
|
71 |
{
|
72 |
+
"completion_length": 824.9076248168946,
|
73 |
"epoch": 0.07730215479756498,
|
74 |
+
"grad_norm": 0.008209128879452155,
|
75 |
+
"kl": 0.021560287475585936,
|
76 |
"learning_rate": 1.5384615384615387e-05,
|
77 |
"loss": 0.0009,
|
78 |
+
"reward": 1.7187657799571752,
|
79 |
+
"reward_std": 0.539798857551068,
|
80 |
+
"rewards/accuracy_reward": 0.48928573532029984,
|
81 |
+
"rewards/cosine_scaled_reward": 0.2462582775799092,
|
82 |
"rewards/format_reward": 0.0,
|
83 |
+
"rewards/reasoning_steps_reward": 0.9832217764109373,
|
84 |
"step": 50
|
85 |
},
|
86 |
{
|
87 |
+
"completion_length": 776.4894325256348,
|
88 |
"epoch": 0.09276258575707798,
|
89 |
+
"grad_norm": 0.005032104484078714,
|
90 |
+
"kl": 0.03041839599609375,
|
91 |
"learning_rate": 1.8461538461538465e-05,
|
92 |
+
"loss": 0.0012,
|
93 |
+
"reward": 1.7944712869822979,
|
94 |
+
"reward_std": 0.5402565439231694,
|
95 |
+
"rewards/accuracy_reward": 0.5172991305589676,
|
96 |
+
"rewards/cosine_scaled_reward": 0.2977078223892022,
|
97 |
"rewards/format_reward": 0.0,
|
98 |
+
"rewards/reasoning_steps_reward": 0.979464340955019,
|
99 |
"step": 60
|
100 |
},
|
101 |
{
|
102 |
+
"completion_length": 783.9421092987061,
|
103 |
"epoch": 0.10822301671659097,
|
104 |
+
"grad_norm": 0.046440537921451495,
|
105 |
+
"kl": 0.18165512084960939,
|
106 |
"learning_rate": 1.999634547413886e-05,
|
107 |
+
"loss": 0.0073,
|
108 |
+
"reward": 1.5861421424895525,
|
109 |
+
"reward_std": 0.7133004866540432,
|
110 |
+
"rewards/accuracy_reward": 0.46037948597222567,
|
111 |
+
"rewards/cosine_scaled_reward": 0.2454426669143686,
|
112 |
"rewards/format_reward": 0.0,
|
113 |
+
"rewards/reasoning_steps_reward": 0.8803199872374534,
|
114 |
"step": 70
|
115 |
},
|
116 |
{
|
117 |
+
"completion_length": 704.7571739196777,
|
118 |
"epoch": 0.12368344767610397,
|
119 |
+
"grad_norm": 0.005638881154684646,
|
120 |
+
"kl": 0.14808197021484376,
|
121 |
"learning_rate": 1.9967125291968495e-05,
|
122 |
+
"loss": 0.0059,
|
123 |
+
"reward": 1.770343079417944,
|
124 |
+
"reward_std": 0.6161688735242933,
|
125 |
+
"rewards/accuracy_reward": 0.5064732374623417,
|
126 |
+
"rewards/cosine_scaled_reward": 0.31033555960966624,
|
127 |
"rewards/format_reward": 0.0,
|
128 |
+
"rewards/reasoning_steps_reward": 0.9535342697054148,
|
129 |
"step": 80
|
130 |
},
|
131 |
{
|
132 |
+
"completion_length": 811.8497016906738,
|
133 |
"epoch": 0.13914387863561697,
|
134 |
+
"grad_norm": 0.006277685603677439,
|
135 |
+
"kl": 0.1677825927734375,
|
136 |
"learning_rate": 1.990877034074683e-05,
|
137 |
+
"loss": 0.0067,
|
138 |
+
"reward": 1.7127626728266478,
|
139 |
+
"reward_std": 0.5350183860398829,
|
140 |
+
"rewards/accuracy_reward": 0.4822544841095805,
|
141 |
+
"rewards/cosine_scaled_reward": 0.24494265783869196,
|
142 |
"rewards/format_reward": 0.0,
|
143 |
+
"rewards/reasoning_steps_reward": 0.9855655211955309,
|
144 |
"step": 90
|
145 |
},
|
146 |
{
|
147 |
+
"completion_length": 786.0161056518555,
|
148 |
"epoch": 0.15460430959512997,
|
149 |
+
"grad_norm": 0.004638658867653336,
|
150 |
+
"kl": 0.20247802734375,
|
151 |
"learning_rate": 1.9821451197042028e-05,
|
152 |
+
"loss": 0.0081,
|
153 |
+
"reward": 1.7241055637598037,
|
154 |
+
"reward_std": 0.6285460269078612,
|
155 |
+
"rewards/accuracy_reward": 0.4876116293948144,
|
156 |
+
"rewards/cosine_scaled_reward": 0.27760252499065247,
|
157 |
"rewards/format_reward": 0.0,
|
158 |
+
"rewards/reasoning_steps_reward": 0.9588914208114148,
|
159 |
"step": 100
|
160 |
},
|
161 |
{
|
162 |
"epoch": 0.15460430959512997,
|
163 |
+
"eval_completion_length": 785.1983642578125,
|
164 |
+
"eval_kl": 0.1015625,
|
165 |
+
"eval_loss": 0.004128854256123304,
|
166 |
+
"eval_reward": 1.8587820827960968,
|
167 |
+
"eval_reward_std": 0.4679965078830719,
|
168 |
+
"eval_rewards/accuracy_reward": 0.5345982536673546,
|
169 |
+
"eval_rewards/cosine_scaled_reward": 0.33646056056022644,
|
170 |
"eval_rewards/format_reward": 0.0,
|
171 |
+
"eval_rewards/reasoning_steps_reward": 0.9877232909202576,
|
172 |
+
"eval_runtime": 65.77,
|
173 |
+
"eval_samples_per_second": 1.505,
|
174 |
+
"eval_steps_per_second": 0.015,
|
175 |
"step": 100
|
176 |
},
|
177 |
{
|
178 |
+
"completion_length": 795.4053916931152,
|
179 |
"epoch": 0.17006474055464296,
|
180 |
+
"grad_norm": 0.0050662218091816385,
|
181 |
+
"kl": 0.20187530517578126,
|
182 |
"learning_rate": 1.9705423102261324e-05,
|
183 |
+
"loss": 0.0081,
|
184 |
+
"reward": 1.7285974282771348,
|
185 |
+
"reward_std": 0.6808601895347237,
|
186 |
+
"rewards/accuracy_reward": 0.49296877197921274,
|
187 |
+
"rewards/cosine_scaled_reward": 0.296454501109838,
|
188 |
"rewards/format_reward": 0.0,
|
189 |
+
"rewards/reasoning_steps_reward": 0.9391741570085287,
|
190 |
"step": 110
|
191 |
},
|
192 |
{
|
193 |
+
"completion_length": 791.2351921081543,
|
194 |
"epoch": 0.18552517151415596,
|
195 |
+
"grad_norm": 0.005966417830813838,
|
196 |
+
"kl": 0.222845458984375,
|
197 |
"learning_rate": 1.956102521655831e-05,
|
198 |
+
"loss": 0.0089,
|
199 |
+
"reward": 1.7569476522505283,
|
200 |
+
"reward_std": 0.6371290137991309,
|
201 |
+
"rewards/accuracy_reward": 0.4906250220956281,
|
202 |
+
"rewards/cosine_scaled_reward": 0.30925412904762195,
|
203 |
"rewards/format_reward": 0.0,
|
204 |
+
"rewards/reasoning_steps_reward": 0.9570684999227523,
|
205 |
"step": 120
|
206 |
},
|
207 |
{
|
208 |
+
"completion_length": 778.9137634277344,
|
209 |
"epoch": 0.20098560247366895,
|
210 |
+
"grad_norm": 0.11425551975396886,
|
211 |
+
"kl": 0.455560302734375,
|
212 |
"learning_rate": 1.9388679627438486e-05,
|
213 |
+
"loss": 0.0182,
|
214 |
+
"reward": 1.6175578892230988,
|
215 |
+
"reward_std": 0.6627502014860511,
|
216 |
+
"rewards/accuracy_reward": 0.43537948445882646,
|
217 |
+
"rewards/cosine_scaled_reward": 0.24683610293641323,
|
218 |
"rewards/format_reward": 0.0,
|
219 |
+
"rewards/reasoning_steps_reward": 0.9353423073887825,
|
220 |
"step": 130
|
221 |
},
|
222 |
{
|
223 |
+
"completion_length": 671.9368595123291,
|
224 |
"epoch": 0.21644603343318194,
|
225 |
+
"grad_norm": 0.012687310552666826,
|
226 |
+
"kl": 2.4836822509765626,
|
227 |
"learning_rate": 1.9188890115960967e-05,
|
228 |
+
"loss": 0.0994,
|
229 |
+
"reward": 1.4270036322064699,
|
230 |
+
"reward_std": 0.739827654324472,
|
231 |
+
"rewards/accuracy_reward": 0.39709823183948173,
|
232 |
+
"rewards/cosine_scaled_reward": 0.2161776867986191,
|
233 |
"rewards/format_reward": 0.0,
|
234 |
+
"rewards/reasoning_steps_reward": 0.8137277197092772,
|
235 |
"step": 140
|
236 |
},
|
237 |
{
|
238 |
+
"completion_length": 703.5272651672364,
|
239 |
"epoch": 0.23190646439269494,
|
240 |
+
"grad_norm": 0.023072548858575986,
|
241 |
+
"kl": 0.175665283203125,
|
242 |
"learning_rate": 1.8962240684142923e-05,
|
243 |
+
"loss": 0.007,
|
244 |
+
"reward": 1.8304371915757656,
|
245 |
+
"reward_std": 0.5595470611006021,
|
246 |
+
"rewards/accuracy_reward": 0.5032366321422159,
|
247 |
+
"rewards/cosine_scaled_reward": 0.3409654124639928,
|
248 |
"rewards/format_reward": 0.0,
|
249 |
+
"rewards/reasoning_steps_reward": 0.986235162243247,
|
250 |
"step": 150
|
251 |
},
|
252 |
{
|
253 |
+
"completion_length": 755.7974658966065,
|
254 |
"epoch": 0.24736689535220793,
|
255 |
+
"grad_norm": 0.0048724703817881404,
|
256 |
+
"kl": 0.1621673583984375,
|
257 |
"learning_rate": 1.8709393847871146e-05,
|
258 |
+
"loss": 0.0065,
|
259 |
+
"reward": 1.8066862165927886,
|
260 |
+
"reward_std": 0.541509800683707,
|
261 |
+
"rewards/accuracy_reward": 0.4974330588709563,
|
262 |
+
"rewards/cosine_scaled_reward": 0.32353881540329893,
|
263 |
"rewards/format_reward": 0.0,
|
264 |
+
"rewards/reasoning_steps_reward": 0.9857143286615611,
|
265 |
"step": 160
|
266 |
},
|
267 |
{
|
268 |
+
"completion_length": 767.8857475280762,
|
269 |
"epoch": 0.26282732631172095,
|
270 |
+
"grad_norm": 0.0058295760602797165,
|
271 |
+
"kl": 0.1024566650390625,
|
272 |
"learning_rate": 1.8431088700310846e-05,
|
273 |
+
"loss": 0.0041,
|
274 |
+
"reward": 1.8246684893965721,
|
275 |
+
"reward_std": 0.6182428574189544,
|
276 |
+
"rewards/accuracy_reward": 0.5167410940863192,
|
277 |
+
"rewards/cosine_scaled_reward": 0.33218330084491754,
|
278 |
"rewards/format_reward": 0.0,
|
279 |
+
"rewards/reasoning_steps_reward": 0.9757441036403179,
|
280 |
"step": 170
|
281 |
},
|
282 |
{
|
283 |
+
"completion_length": 782.8031581878662,
|
284 |
"epoch": 0.27828775727123395,
|
285 |
+
"grad_norm": 0.007260482392875092,
|
286 |
+
"kl": 0.133380126953125,
|
287 |
"learning_rate": 1.8128138751472432e-05,
|
288 |
+
"loss": 0.0053,
|
289 |
+
"reward": 1.6873359650373458,
|
290 |
+
"reward_std": 0.7230455877259374,
|
291 |
+
"rewards/accuracy_reward": 0.46573662804439664,
|
292 |
+
"rewards/cosine_scaled_reward": 0.2658701150892739,
|
293 |
"rewards/format_reward": 0.0,
|
294 |
+
"rewards/reasoning_steps_reward": 0.955729215592146,
|
295 |
"step": 180
|
296 |
},
|
297 |
{
|
298 |
+
"completion_length": 770.7354141235352,
|
299 |
"epoch": 0.29374818823074694,
|
300 |
+
"grad_norm": 0.0038766706896374765,
|
301 |
+
"kl": 0.084027099609375,
|
302 |
"learning_rate": 1.780142955025139e-05,
|
303 |
+
"loss": 0.0034,
|
304 |
+
"reward": 1.8208528086543083,
|
305 |
+
"reward_std": 0.6158834310248494,
|
306 |
+
"rewards/accuracy_reward": 0.5102678800933063,
|
307 |
+
"rewards/cosine_scaled_reward": 0.3412768360443579,
|
308 |
"rewards/format_reward": 0.0,
|
309 |
+
"rewards/reasoning_steps_reward": 0.969308077916503,
|
310 |
"step": 190
|
311 |
},
|
312 |
{
|
313 |
+
"completion_length": 777.9120876312256,
|
314 |
"epoch": 0.30920861919025994,
|
315 |
+
"grad_norm": 0.004081871056913627,
|
316 |
+
"kl": 0.079278564453125,
|
317 |
"learning_rate": 1.745191609589231e-05,
|
318 |
+
"loss": 0.0032,
|
319 |
+
"reward": 1.8799906723201274,
|
320 |
+
"reward_std": 0.6350350034423172,
|
321 |
+
"rewards/accuracy_reward": 0.5420759165659547,
|
322 |
+
"rewards/cosine_scaled_reward": 0.36834625932970083,
|
323 |
"rewards/format_reward": 0.0,
|
324 |
+
"rewards/reasoning_steps_reward": 0.9695684995502234,
|
325 |
"step": 200
|
326 |
},
|
327 |
{
|
328 |
"epoch": 0.30920861919025994,
|
329 |
+
"eval_completion_length": 786.5066223144531,
|
330 |
+
"eval_kl": 0.080078125,
|
331 |
+
"eval_loss": 0.0031854985281825066,
|
332 |
+
"eval_reward": 1.7613219320774078,
|
333 |
+
"eval_reward_std": 0.6763340681791306,
|
334 |
+
"eval_rewards/accuracy_reward": 0.4933036044239998,
|
335 |
+
"eval_rewards/cosine_scaled_reward": 0.3011283501982689,
|
336 |
"eval_rewards/format_reward": 0.0,
|
337 |
+
"eval_rewards/reasoning_steps_reward": 0.9668899178504944,
|
338 |
+
"eval_runtime": 67.4733,
|
339 |
+
"eval_samples_per_second": 1.467,
|
340 |
+
"eval_steps_per_second": 0.015,
|
341 |
"step": 200
|
342 |
},
|
343 |
{
|
344 |
+
"completion_length": 762.9672210693359,
|
345 |
"epoch": 0.32466905014977293,
|
346 |
+
"grad_norm": 0.0045327243820408964,
|
347 |
+
"kl": 0.0857818603515625,
|
348 |
"learning_rate": 1.7080620046443503e-05,
|
349 |
+
"loss": 0.0034,
|
350 |
+
"reward": 1.8360209584236145,
|
351 |
+
"reward_std": 0.6304899661801755,
|
352 |
+
"rewards/accuracy_reward": 0.5189732388593257,
|
353 |
+
"rewards/cosine_scaled_reward": 0.3513110678992234,
|
354 |
"rewards/format_reward": 0.0,
|
355 |
+
"rewards/reasoning_steps_reward": 0.9657366566359997,
|
356 |
"step": 210
|
357 |
},
|
358 |
{
|
359 |
+
"completion_length": 740.6268199920654,
|
360 |
"epoch": 0.3401294811092859,
|
361 |
+
"grad_norm": 0.40798247676236865,
|
362 |
+
"kl": 0.09603729248046874,
|
363 |
"learning_rate": 1.6688626732362192e-05,
|
364 |
+
"loss": 0.0038,
|
365 |
+
"reward": 1.8989367991685868,
|
366 |
+
"reward_std": 0.6170632224529982,
|
367 |
+
"rewards/accuracy_reward": 0.541183059476316,
|
368 |
+
"rewards/cosine_scaled_reward": 0.3866971510913572,
|
369 |
"rewards/format_reward": 0.0,
|
370 |
+
"rewards/reasoning_steps_reward": 0.9710565954446793,
|
371 |
"step": 220
|
372 |
},
|
373 |
{
|
374 |
+
"completion_length": 745.6220226287842,
|
375 |
"epoch": 0.3555899120687989,
|
376 |
+
"grad_norm": 0.009310955589968223,
|
377 |
+
"kl": 0.17754974365234374,
|
378 |
"learning_rate": 1.6277081983999742e-05,
|
379 |
+
"loss": 0.0071,
|
380 |
+
"reward": 1.9535415962338447,
|
381 |
+
"reward_std": 0.5657559703569859,
|
382 |
+
"rewards/accuracy_reward": 0.5494419884867966,
|
383 |
+
"rewards/cosine_scaled_reward": 0.4263093855464831,
|
384 |
"rewards/format_reward": 0.0,
|
385 |
+
"rewards/reasoning_steps_reward": 0.9777902279049158,
|
386 |
"step": 230
|
387 |
},
|
388 |
{
|
389 |
+
"completion_length": 754.8473545074463,
|
390 |
"epoch": 0.3710503430283119,
|
391 |
+
"grad_norm": 0.009032184745149096,
|
392 |
+
"kl": 0.1623504638671875,
|
393 |
"learning_rate": 1.5847188782240473e-05,
|
394 |
+
"loss": 0.0065,
|
395 |
+
"reward": 1.8752706520259381,
|
396 |
+
"reward_std": 0.6476909777149558,
|
397 |
+
"rewards/accuracy_reward": 0.5162946661002934,
|
398 |
+
"rewards/cosine_scaled_reward": 0.3971455840044655,
|
399 |
"rewards/format_reward": 0.0,
|
400 |
+
"rewards/reasoning_steps_reward": 0.9618303928524256,
|
401 |
"step": 240
|
402 |
},
|
403 |
{
|
404 |
+
"completion_length": 767.3316184997559,
|
405 |
"epoch": 0.3865107739878249,
|
406 |
+
"grad_norm": 0.006074054783900294,
|
407 |
+
"kl": 0.1158416748046875,
|
408 |
"learning_rate": 1.5400203742084508e-05,
|
409 |
+
"loss": 0.0046,
|
410 |
+
"reward": 1.8485381975769997,
|
411 |
+
"reward_std": 0.6796474339440465,
|
412 |
+
"rewards/accuracy_reward": 0.5156250222586095,
|
413 |
+
"rewards/cosine_scaled_reward": 0.3913580739754252,
|
414 |
"rewards/format_reward": 0.0,
|
415 |
+
"rewards/reasoning_steps_reward": 0.9415550928562879,
|
416 |
"step": 250
|
417 |
},
|
418 |
{
|
419 |
+
"completion_length": 740.9466835021973,
|
420 |
"epoch": 0.4019712049473379,
|
421 |
+
"grad_norm": 0.004612552363152663,
|
422 |
+
"kl": 0.10526580810546875,
|
423 |
"learning_rate": 1.4937433439453465e-05,
|
424 |
+
"loss": 0.0042,
|
425 |
+
"reward": 1.834777297079563,
|
426 |
+
"reward_std": 0.694879194535315,
|
427 |
+
"rewards/accuracy_reward": 0.5040178820490837,
|
428 |
+
"rewards/cosine_scaled_reward": 0.38555847499519585,
|
429 |
"rewards/format_reward": 0.0,
|
430 |
+
"rewards/reasoning_steps_reward": 0.9452009297907352,
|
431 |
"step": 260
|
432 |
},
|
433 |
{
|
434 |
+
"completion_length": 769.4490287780761,
|
435 |
"epoch": 0.4174316359068509,
|
436 |
+
"grad_norm": 0.005166739754892184,
|
437 |
+
"kl": 0.122613525390625,
|
438 |
"learning_rate": 1.4460230591956097e-05,
|
439 |
+
"loss": 0.0049,
|
440 |
+
"reward": 1.8051817450672387,
|
441 |
+
"reward_std": 0.7667457018047571,
|
442 |
+
"rewards/accuracy_reward": 0.5031250216066837,
|
443 |
+
"rewards/cosine_scaled_reward": 0.3666772120282985,
|
444 |
"rewards/format_reward": 0.0,
|
445 |
+
"rewards/reasoning_steps_reward": 0.9353795044124127,
|
446 |
"step": 270
|
447 |
},
|
448 |
{
|
449 |
+
"completion_length": 756.1934505462647,
|
450 |
"epoch": 0.4328920668663639,
|
451 |
+
"grad_norm": 0.004779328317174938,
|
452 |
+
"kl": 0.118280029296875,
|
453 |
"learning_rate": 1.3969990104777712e-05,
|
454 |
+
"loss": 0.0047,
|
455 |
+
"reward": 1.835938386246562,
|
456 |
+
"reward_std": 0.6989197930321097,
|
457 |
+
"rewards/accuracy_reward": 0.5044643082190305,
|
458 |
+
"rewards/cosine_scaled_reward": 0.3808415879495442,
|
459 |
"rewards/format_reward": 0.0,
|
460 |
+
"rewards/reasoning_steps_reward": 0.9506324753165245,
|
461 |
"step": 280
|
462 |
},
|
463 |
{
|
464 |
+
"completion_length": 753.9099658966064,
|
465 |
"epoch": 0.4483524978258769,
|
466 |
+
"grad_norm": 0.006205432336241612,
|
467 |
+
"kl": 0.12601318359375,
|
468 |
"learning_rate": 1.3468144993251735e-05,
|
469 |
+
"loss": 0.005,
|
470 |
+
"reward": 1.8052862711250781,
|
471 |
+
"reward_std": 0.6413127107545733,
|
472 |
+
"rewards/accuracy_reward": 0.47890627244487405,
|
473 |
+
"rewards/cosine_scaled_reward": 0.3571091307036113,
|
474 |
"rewards/format_reward": 0.0,
|
475 |
+
"rewards/reasoning_steps_reward": 0.9692708767950535,
|
476 |
"step": 290
|
477 |
},
|
478 |
{
|
479 |
+
"completion_length": 766.8500350952148,
|
480 |
"epoch": 0.4638129287853899,
|
481 |
+
"grad_norm": 0.005053729003460748,
|
482 |
+
"kl": 0.1371002197265625,
|
483 |
"learning_rate": 1.295616219403197e-05,
|
484 |
+
"loss": 0.0055,
|
485 |
+
"reward": 1.7713046602904796,
|
486 |
+
"reward_std": 0.6539058156311512,
|
487 |
+
"rewards/accuracy_reward": 0.4574776992201805,
|
488 |
+
"rewards/cosine_scaled_reward": 0.34656501180725174,
|
489 |
"rewards/format_reward": 0.0,
|
490 |
+
"rewards/reasoning_steps_reward": 0.967261953279376,
|
491 |
"step": 300
|
492 |
},
|
493 |
{
|
494 |
"epoch": 0.4638129287853899,
|
495 |
+
"eval_completion_length": 725.3372497558594,
|
496 |
+
"eval_kl": 0.125732421875,
|
497 |
+
"eval_loss": 0.005167535971850157,
|
498 |
+
"eval_reward": 1.8545046150684357,
|
499 |
+
"eval_reward_std": 0.5993074476718903,
|
500 |
+
"eval_rewards/accuracy_reward": 0.4888393133878708,
|
501 |
+
"eval_rewards/cosine_scaled_reward": 0.3980313614010811,
|
502 |
"eval_rewards/format_reward": 0.0,
|
503 |
+
"eval_rewards/reasoning_steps_reward": 0.9676340073347092,
|
504 |
+
"eval_runtime": 63.1453,
|
505 |
+
"eval_samples_per_second": 1.568,
|
506 |
+
"eval_steps_per_second": 0.016,
|
507 |
"step": 300
|
508 |
},
|
509 |
{
|
510 |
+
"completion_length": 738.0375347137451,
|
511 |
"epoch": 0.47927335974490287,
|
512 |
+
"grad_norm": 0.004708932814585316,
|
513 |
+
"kl": 0.128253173828125,
|
514 |
"learning_rate": 1.2435538277109919e-05,
|
515 |
+
"loss": 0.0051,
|
516 |
+
"reward": 1.776976404339075,
|
517 |
+
"reward_std": 0.6543458372354507,
|
518 |
+
"rewards/accuracy_reward": 0.4662946649361402,
|
519 |
+
"rewards/cosine_scaled_reward": 0.35774270847914524,
|
520 |
"rewards/format_reward": 0.0,
|
521 |
+
"rewards/reasoning_steps_reward": 0.9529390316456556,
|
522 |
"step": 310
|
523 |
},
|
524 |
{
|
525 |
+
"completion_length": 730.0644290924072,
|
526 |
"epoch": 0.49473379070441587,
|
527 |
+
"grad_norm": 0.006404744910772637,
|
528 |
+
"kl": 0.12236328125,
|
529 |
"learning_rate": 1.19077950712113e-05,
|
530 |
+
"loss": 0.0049,
|
531 |
+
"reward": 1.8439508713781834,
|
532 |
+
"reward_std": 0.6846362385898829,
|
533 |
+
"rewards/accuracy_reward": 0.500669667404145,
|
534 |
+
"rewards/cosine_scaled_reward": 0.3922022982500494,
|
535 |
"rewards/format_reward": 0.0,
|
536 |
+
"rewards/reasoning_steps_reward": 0.9510789047926664,
|
537 |
"step": 320
|
538 |
},
|
539 |
{
|
540 |
+
"completion_length": 733.6488037109375,
|
541 |
"epoch": 0.5101942216639289,
|
542 |
+
"grad_norm": 0.005310241047036926,
|
543 |
+
"kl": 0.1285675048828125,
|
544 |
"learning_rate": 1.137447521535908e-05,
|
545 |
+
"loss": 0.0051,
|
546 |
+
"reward": 1.8017703101038933,
|
547 |
+
"reward_std": 0.670677787438035,
|
548 |
+
"rewards/accuracy_reward": 0.46941966488957404,
|
549 |
+
"rewards/cosine_scaled_reward": 0.3702226262510521,
|
550 |
"rewards/format_reward": 0.0,
|
551 |
+
"rewards/reasoning_steps_reward": 0.9621280215680599,
|
552 |
"step": 330
|
553 |
},
|
554 |
{
|
555 |
+
"completion_length": 740.5896522521973,
|
556 |
"epoch": 0.5256546526234419,
|
557 |
+
"grad_norm": 0.004911848589025536,
|
558 |
+
"kl": 0.125958251953125,
|
559 |
"learning_rate": 1.0837137649606241e-05,
|
560 |
+
"loss": 0.005,
|
561 |
+
"reward": 1.8196691133081913,
|
562 |
+
"reward_std": 0.6627934613265097,
|
563 |
+
"rewards/accuracy_reward": 0.4854910961352289,
|
564 |
+
"rewards/cosine_scaled_reward": 0.37692351534496993,
|
565 |
"rewards/format_reward": 0.0,
|
566 |
+
"rewards/reasoning_steps_reward": 0.957254507765174,
|
567 |
"step": 340
|
568 |
},
|
569 |
{
|
570 |
+
"completion_length": 733.2659954071045,
|
571 |
"epoch": 0.5411150835829549,
|
572 |
+
"grad_norm": 0.009426685166535624,
|
573 |
+
"kl": 0.1310546875,
|
574 |
"learning_rate": 1.0297353058119209e-05,
|
575 |
+
"loss": 0.0052,
|
576 |
+
"reward": 1.7875644348561763,
|
577 |
+
"reward_std": 0.6663354218006134,
|
578 |
+
"rewards/accuracy_reward": 0.46261162832379343,
|
579 |
+
"rewards/cosine_scaled_reward": 0.36353164007887245,
|
580 |
"rewards/format_reward": 0.0,
|
581 |
+
"rewards/reasoning_steps_reward": 0.961421174928546,
|
582 |
"step": 350
|
583 |
},
|
584 |
{
|
585 |
+
"completion_length": 755.5462394714356,
|
586 |
"epoch": 0.5565755145424679,
|
587 |
+
"grad_norm": 0.005204829040206616,
|
588 |
+
"kl": 0.14141845703125,
|
589 |
"learning_rate": 9.756699277932196e-06,
|
590 |
+
"loss": 0.0057,
|
591 |
+
"reward": 1.7464446134865284,
|
592 |
+
"reward_std": 0.6827127303928137,
|
593 |
+
"rewards/accuracy_reward": 0.43928573140874505,
|
594 |
+
"rewards/cosine_scaled_reward": 0.3423150799470022,
|
595 |
"rewards/format_reward": 0.0,
|
596 |
+
"rewards/reasoning_steps_reward": 0.9648437988013029,
|
597 |
"step": 360
|
598 |
},
|
599 |
{
|
600 |
+
"completion_length": 738.9675567626953,
|
601 |
"epoch": 0.5720359455019809,
|
602 |
+
"grad_norm": 0.0050950433186417,
|
603 |
+
"kl": 0.133477783203125,
|
604 |
"learning_rate": 9.216756686793163e-06,
|
605 |
+
"loss": 0.0053,
|
606 |
+
"reward": 1.7593348406255245,
|
607 |
+
"reward_std": 0.7046971999108791,
|
608 |
+
"rewards/accuracy_reward": 0.4560268087312579,
|
609 |
+
"rewards/cosine_scaled_reward": 0.35353119419887663,
|
610 |
"rewards/format_reward": 0.0,
|
611 |
+
"rewards/reasoning_steps_reward": 0.9497768227010965,
|
612 |
"step": 370
|
613 |
},
|
614 |
{
|
615 |
+
"completion_length": 715.9590724945068,
|
616 |
"epoch": 0.5874963764614939,
|
617 |
+
"grad_norm": 0.005868130396446593,
|
618 |
+
"kl": 0.1201171875,
|
619 |
"learning_rate": 8.67910358358298e-06,
|
620 |
+
"loss": 0.0048,
|
621 |
+
"reward": 1.8290306769311429,
|
622 |
+
"reward_std": 0.7089241919107735,
|
623 |
+
"rewards/accuracy_reward": 0.4906250239349902,
|
624 |
+
"rewards/cosine_scaled_reward": 0.3883312027202919,
|
625 |
"rewards/format_reward": 0.0,
|
626 |
+
"rewards/reasoning_steps_reward": 0.9500744428485632,
|
627 |
"step": 380
|
628 |
},
|
629 |
{
|
630 |
+
"completion_length": 758.1067291259766,
|
631 |
"epoch": 0.6029568074210069,
|
632 |
+
"grad_norm": 0.005528799006616127,
|
633 |
+
"kl": 0.1315093994140625,
|
634 |
"learning_rate": 8.145311574811325e-06,
|
635 |
+
"loss": 0.0053,
|
636 |
+
"reward": 1.6966661393642426,
|
637 |
+
"reward_std": 0.7609130211174489,
|
638 |
+
"rewards/accuracy_reward": 0.45424109399318696,
|
639 |
+
"rewards/cosine_scaled_reward": 0.32028957750299014,
|
640 |
"rewards/format_reward": 0.0,
|
641 |
+
"rewards/reasoning_steps_reward": 0.9221354588866234,
|
642 |
"step": 390
|
643 |
},
|
644 |
{
|
645 |
+
"completion_length": 731.6211277008057,
|
646 |
"epoch": 0.6184172383805199,
|
647 |
+
"grad_norm": 0.006163761009715641,
|
648 |
+
"kl": 0.130072021484375,
|
649 |
"learning_rate": 7.616940980675004e-06,
|
650 |
+
"loss": 0.0052,
|
651 |
+
"reward": 1.7418419629335404,
|
652 |
+
"reward_std": 0.7564100152812898,
|
653 |
+
"rewards/accuracy_reward": 0.46339287841692567,
|
654 |
+
"rewards/cosine_scaled_reward": 0.34221391292085174,
|
655 |
"rewards/format_reward": 0.0,
|
656 |
+
"rewards/reasoning_steps_reward": 0.9362351588904858,
|
657 |
"step": 400
|
658 |
},
|
659 |
{
|
660 |
"epoch": 0.6184172383805199,
|
661 |
+
"eval_completion_length": 721.2921142578125,
|
662 |
+
"eval_kl": 0.14404296875,
|
663 |
+
"eval_loss": 0.005833905190229416,
|
664 |
+
"eval_reward": 1.8010995388031006,
|
665 |
+
"eval_reward_std": 0.79125015437603,
|
666 |
+
"eval_rewards/accuracy_reward": 0.4899553880095482,
|
667 |
+
"eval_rewards/cosine_scaled_reward": 0.3773643299937248,
|
668 |
"eval_rewards/format_reward": 0.0,
|
669 |
+
"eval_rewards/reasoning_steps_reward": 0.9337798058986664,
|
670 |
+
"eval_runtime": 64.0844,
|
671 |
+
"eval_samples_per_second": 1.545,
|
672 |
+
"eval_steps_per_second": 0.016,
|
673 |
"step": 400
|
674 |
},
|
675 |
{
|
676 |
+
"completion_length": 730.9440063476562,
|
677 |
"epoch": 0.6338776693400329,
|
678 |
+
"grad_norm": 0.007004458345967938,
|
679 |
+
"kl": 0.1326690673828125,
|
680 |
"learning_rate": 7.095536274107046e-06,
|
681 |
+
"loss": 0.0053,
|
682 |
+
"reward": 1.7348041359335185,
|
683 |
+
"reward_std": 0.7573289098218083,
|
684 |
+
"rewards/accuracy_reward": 0.46227680711308494,
|
685 |
+
"rewards/cosine_scaled_reward": 0.3395287758205086,
|
686 |
"rewards/format_reward": 0.0,
|
687 |
+
"rewards/reasoning_steps_reward": 0.93299855068326,
|
688 |
"step": 410
|
689 |
},
|
690 |
{
|
691 |
+
"completion_length": 737.4034927368164,
|
692 |
"epoch": 0.6493381002995459,
|
693 |
+
"grad_norm": 0.006302023763263314,
|
694 |
+
"kl": 0.1422760009765625,
|
695 |
"learning_rate": 6.58262156614881e-06,
|
696 |
+
"loss": 0.0057,
|
697 |
+
"reward": 1.7033680249005556,
|
698 |
+
"reward_std": 0.7371486462652683,
|
699 |
+
"rewards/accuracy_reward": 0.43750002002343535,
|
700 |
+
"rewards/cosine_scaled_reward": 0.32375487285316923,
|
701 |
"rewards/format_reward": 0.0,
|
702 |
+
"rewards/reasoning_steps_reward": 0.942113135010004,
|
703 |
"step": 420
|
704 |
},
|
705 |
{
|
706 |
+
"completion_length": 756.3276016235352,
|
707 |
"epoch": 0.6647985312590589,
|
708 |
+
"grad_norm": 0.008166583966853302,
|
709 |
+
"kl": 0.149725341796875,
|
710 |
"learning_rate": 6.079696150841634e-06,
|
711 |
+
"loss": 0.006,
|
712 |
+
"reward": 1.6697823703289032,
|
713 |
+
"reward_std": 0.7648335263133049,
|
714 |
+
"rewards/accuracy_reward": 0.4290178781375289,
|
715 |
+
"rewards/cosine_scaled_reward": 0.30843557265470734,
|
716 |
"rewards/format_reward": 0.0,
|
717 |
+
"rewards/reasoning_steps_reward": 0.9323289088904858,
|
718 |
"step": 430
|
719 |
},
|
720 |
{
|
721 |
+
"completion_length": 711.6224662780762,
|
722 |
"epoch": 0.6802589622185718,
|
723 |
+
"grad_norm": 0.006101275201994206,
|
724 |
+
"kl": 0.149908447265625,
|
725 |
"learning_rate": 5.588230122660672e-06,
|
726 |
+
"loss": 0.006,
|
727 |
+
"reward": 1.710378536581993,
|
728 |
+
"reward_std": 0.7376122187823058,
|
729 |
+
"rewards/accuracy_reward": 0.43995537869632245,
|
730 |
+
"rewards/cosine_scaled_reward": 0.3315466307423776,
|
731 |
"rewards/format_reward": 0.0,
|
732 |
+
"rewards/reasoning_steps_reward": 0.9388765264302492,
|
733 |
"step": 440
|
734 |
},
|
735 |
{
|
736 |
+
"completion_length": 720.1637599945068,
|
737 |
"epoch": 0.6957193931780848,
|
738 |
+
"grad_norm": 0.00827738551813243,
|
739 |
+
"kl": 0.1536865234375,
|
740 |
"learning_rate": 5.109660079301668e-06,
|
741 |
+
"loss": 0.0061,
|
742 |
+
"reward": 1.7479658477008342,
|
743 |
+
"reward_std": 0.7545963631942868,
|
744 |
+
"rewards/accuracy_reward": 0.45546877263113855,
|
745 |
+
"rewards/cosine_scaled_reward": 0.3493794774003618,
|
746 |
"rewards/format_reward": 0.0,
|
747 |
+
"rewards/reasoning_steps_reward": 0.9431175928562879,
|
748 |
"step": 450
|
749 |
},
|
750 |
{
|
751 |
+
"completion_length": 719.8788265228271,
|
752 |
"epoch": 0.7111798241375978,
|
753 |
+
"grad_norm": 0.009020213190404006,
|
754 |
+
"kl": 0.146099853515625,
|
755 |
"learning_rate": 4.64538492238166e-06,
|
756 |
+
"loss": 0.0058,
|
757 |
+
"reward": 1.761041846126318,
|
758 |
+
"reward_std": 0.7622619468718768,
|
759 |
+
"rewards/accuracy_reward": 0.46506698690354825,
|
760 |
+
"rewards/cosine_scaled_reward": 0.3563170699868351,
|
761 |
"rewards/format_reward": 0.0,
|
762 |
+
"rewards/reasoning_steps_reward": 0.9396577756851912,
|
763 |
"step": 460
|
764 |
},
|
765 |
{
|
766 |
+
"completion_length": 715.616215133667,
|
767 |
"epoch": 0.7266402550971108,
|
768 |
+
"grad_norm": 0.009535640148387967,
|
769 |
+
"kl": 0.1488037109375,
|
770 |
"learning_rate": 4.196761768328599e-06,
|
771 |
+
"loss": 0.006,
|
772 |
+
"reward": 1.7519984051585198,
|
773 |
+
"reward_std": 0.7264958534389734,
|
774 |
+
"rewards/accuracy_reward": 0.45613841600716115,
|
775 |
+
"rewards/cosine_scaled_reward": 0.35062185342776503,
|
776 |
"rewards/format_reward": 0.0,
|
777 |
+
"rewards/reasoning_steps_reward": 0.9452381365001201,
|
778 |
"step": 470
|
779 |
},
|
780 |
{
|
781 |
+
"completion_length": 733.5050575256348,
|
782 |
"epoch": 0.7421006860566238,
|
783 |
+
"grad_norm": 0.009375726733768255,
|
784 |
+
"kl": 0.1457244873046875,
|
785 |
"learning_rate": 3.7651019814126656e-06,
|
786 |
+
"loss": 0.0058,
|
787 |
+
"reward": 1.7320308901369572,
|
788 |
+
"reward_std": 0.753149107657373,
|
789 |
+
"rewards/accuracy_reward": 0.45301341358572245,
|
790 |
+
"rewards/cosine_scaled_reward": 0.3402525488520041,
|
791 |
"rewards/format_reward": 0.0,
|
792 |
+
"rewards/reasoning_steps_reward": 0.9387649200856686,
|
793 |
"step": 480
|
794 |
},
|
795 |
{
|
796 |
+
"completion_length": 725.7989181518554,
|
797 |
"epoch": 0.7575611170161368,
|
798 |
+
"grad_norm": 0.006465310695991136,
|
799 |
+
"kl": 0.1441925048828125,
|
800 |
"learning_rate": 3.3516673405151546e-06,
|
801 |
+
"loss": 0.0058,
|
802 |
+
"reward": 1.7133542537689208,
|
803 |
+
"reward_std": 0.7624470146372915,
|
804 |
+
"rewards/accuracy_reward": 0.4443080571014434,
|
805 |
+
"rewards/cosine_scaled_reward": 0.3332202689955011,
|
806 |
"rewards/format_reward": 0.0,
|
807 |
+
"rewards/reasoning_steps_reward": 0.9358259223401546,
|
808 |
"step": 490
|
809 |
},
|
810 |
{
|
811 |
+
"completion_length": 732.0432247161865,
|
812 |
"epoch": 0.7730215479756498,
|
813 |
+
"grad_norm": 0.006663296056320388,
|
814 |
+
"kl": 0.1493438720703125,
|
815 |
"learning_rate": 2.957666350839663e-06,
|
816 |
+
"loss": 0.006,
|
817 |
+
"reward": 1.7120833061635494,
|
818 |
+
"reward_std": 0.7427917202934623,
|
819 |
+
"rewards/accuracy_reward": 0.44140627095475793,
|
820 |
+
"rewards/cosine_scaled_reward": 0.3323585350837675,
|
821 |
"rewards/format_reward": 0.0,
|
822 |
+
"rewards/reasoning_steps_reward": 0.9383184887468815,
|
823 |
"step": 500
|
824 |
},
|
825 |
{
|
826 |
"epoch": 0.7730215479756498,
|
827 |
+
"eval_completion_length": 724.1022491455078,
|
828 |
+
"eval_kl": 0.14892578125,
|
829 |
+
"eval_loss": 0.006081230938434601,
|
830 |
+
"eval_reward": 1.7919847667217255,
|
831 |
+
"eval_reward_std": 0.7341814786195755,
|
832 |
+
"eval_rewards/accuracy_reward": 0.474330373108387,
|
833 |
+
"eval_rewards/cosine_scaled_reward": 0.3756899982690811,
|
834 |
"eval_rewards/format_reward": 0.0,
|
835 |
+
"eval_rewards/reasoning_steps_reward": 0.9419643133878708,
|
836 |
+
"eval_runtime": 62.9452,
|
837 |
+
"eval_samples_per_second": 1.573,
|
838 |
+
"eval_steps_per_second": 0.016,
|
839 |
"step": 500
|
840 |
},
|
841 |
{
|
842 |
+
"completion_length": 719.1628688812256,
|
843 |
"epoch": 0.7884819789351628,
|
844 |
+
"grad_norm": 0.026910990374737,
|
845 |
+
"kl": 0.1684112548828125,
|
846 |
"learning_rate": 2.5842507113469307e-06,
|
847 |
+
"loss": 0.0067,
|
848 |
+
"reward": 1.6821819383651018,
|
849 |
+
"reward_std": 0.7549204783514142,
|
850 |
+
"rewards/accuracy_reward": 0.42008930565789343,
|
851 |
+
"rewards/cosine_scaled_reward": 0.3171893151884433,
|
852 |
"rewards/format_reward": 0.0,
|
853 |
+
"rewards/reasoning_steps_reward": 0.9449033126235008,
|
854 |
"step": 510
|
855 |
},
|
856 |
{
|
857 |
+
"completion_length": 703.1540473937988,
|
858 |
"epoch": 0.8039424098946758,
|
859 |
+
"grad_norm": 0.029497387730934427,
|
860 |
+
"kl": 0.1495452880859375,
|
861 |
"learning_rate": 2.2325119482391466e-06,
|
862 |
+
"loss": 0.006,
|
863 |
+
"reward": 1.7537529528141023,
|
864 |
+
"reward_std": 0.7176604120060801,
|
865 |
+
"rewards/accuracy_reward": 0.44877234250307085,
|
866 |
+
"rewards/cosine_scaled_reward": 0.3511859173071571,
|
867 |
"rewards/format_reward": 0.0,
|
868 |
+
"rewards/reasoning_steps_reward": 0.9537946797907353,
|
869 |
"step": 520
|
870 |
},
|
871 |
{
|
872 |
+
"completion_length": 715.8909927368164,
|
873 |
"epoch": 0.8194028408541888,
|
874 |
+
"grad_norm": 0.006911653084698067,
|
875 |
+
"kl": 0.1466156005859375,
|
876 |
"learning_rate": 1.9034782243345074e-06,
|
877 |
+
"loss": 0.0059,
|
878 |
+
"reward": 1.7353017818182708,
|
879 |
+
"reward_std": 0.7042613643221557,
|
880 |
+
"rewards/accuracy_reward": 0.4434151995461434,
|
881 |
+
"rewards/cosine_scaled_reward": 0.34125408774707466,
|
882 |
"rewards/format_reward": 0.0,
|
883 |
+
"rewards/reasoning_steps_reward": 0.9506324734538794,
|
884 |
"step": 530
|
885 |
},
|
886 |
{
|
887 |
+
"completion_length": 731.9873096466065,
|
888 |
"epoch": 0.8348632718137018,
|
889 |
+
"grad_norm": 0.10031774065756535,
|
890 |
+
"kl": 0.165179443359375,
|
891 |
"learning_rate": 1.5981113336584041e-06,
|
892 |
+
"loss": 0.0066,
|
893 |
+
"reward": 1.720738895609975,
|
894 |
+
"reward_std": 0.7829023336991667,
|
895 |
+
"rewards/accuracy_reward": 0.44453127147862687,
|
896 |
+
"rewards/cosine_scaled_reward": 0.33692186851403677,
|
897 |
"rewards/format_reward": 0.0,
|
898 |
+
"rewards/reasoning_steps_reward": 0.9392857551574707,
|
899 |
"step": 540
|
900 |
},
|
901 |
{
|
902 |
+
"completion_length": 726.0302787780762,
|
903 |
"epoch": 0.8503237027732148,
|
904 |
+
"grad_norm": 0.00915840041448343,
|
905 |
+
"kl": 0.1617706298828125,
|
906 |
"learning_rate": 1.3173038900362977e-06,
|
907 |
+
"loss": 0.0065,
|
908 |
+
"reward": 1.7284724555909634,
|
909 |
+
"reward_std": 0.7755123546347023,
|
910 |
+
"rewards/accuracy_reward": 0.4477678783237934,
|
911 |
+
"rewards/cosine_scaled_reward": 0.34402298720087854,
|
912 |
"rewards/format_reward": 0.0,
|
913 |
+
"rewards/reasoning_steps_reward": 0.9366815879940986,
|
914 |
"step": 550
|
915 |
},
|
916 |
{
|
917 |
+
"completion_length": 716.5763721466064,
|
918 |
"epoch": 0.8657841337327278,
|
919 |
+
"grad_norm": 0.0077065633985853085,
|
920 |
+
"kl": 0.151544189453125,
|
921 |
"learning_rate": 1.0618767179063416e-06,
|
922 |
+
"loss": 0.0061,
|
923 |
+
"reward": 1.7493106886744498,
|
924 |
+
"reward_std": 0.7468110140413046,
|
925 |
+
"rewards/accuracy_reward": 0.45625002135057,
|
926 |
+
"rewards/cosine_scaled_reward": 0.3529192515881732,
|
927 |
"rewards/format_reward": 0.0,
|
928 |
+
"rewards/reasoning_steps_reward": 0.9401414062827825,
|
929 |
"step": 560
|
930 |
},
|
931 |
{
|
932 |
+
"completion_length": 711.7772666931153,
|
933 |
"epoch": 0.8812445646922408,
|
934 |
+
"grad_norm": 0.011223015630773887,
|
935 |
+
"kl": 0.1598358154296875,
|
936 |
"learning_rate": 8.325764529785851e-07,
|
937 |
+
"loss": 0.0064,
|
938 |
+
"reward": 1.7419822074472904,
|
939 |
+
"reward_std": 0.7288113379850983,
|
940 |
+
"rewards/accuracy_reward": 0.45122770036105064,
|
941 |
+
"rewards/cosine_scaled_reward": 0.34804613249725663,
|
942 |
"rewards/format_reward": 0.0,
|
943 |
+
"rewards/reasoning_steps_reward": 0.9427083749324083,
|
944 |
"step": 570
|
945 |
},
|
946 |
{
|
947 |
+
"completion_length": 717.4036037445069,
|
948 |
"epoch": 0.8967049956517538,
|
949 |
+
"grad_norm": 0.01473136538282227,
|
950 |
+
"kl": 0.1699462890625,
|
951 |
"learning_rate": 6.300733597542086e-07,
|
952 |
+
"loss": 0.0068,
|
953 |
+
"reward": 1.7380871541798115,
|
954 |
+
"reward_std": 0.7284659473225474,
|
955 |
+
"rewards/accuracy_reward": 0.4454241285100579,
|
956 |
+
"rewards/cosine_scaled_reward": 0.3448207150679082,
|
957 |
"rewards/format_reward": 0.0,
|
958 |
+
"rewards/reasoning_steps_reward": 0.9478422913700342,
|
959 |
"step": 580
|
960 |
},
|
961 |
{
|
962 |
+
"completion_length": 722.4015926361084,
|
963 |
"epoch": 0.9121654266112668,
|
964 |
+
"grad_norm": 0.015247562461578802,
|
965 |
+
"kl": 0.1722503662109375,
|
966 |
"learning_rate": 4.549593722844492e-07,
|
967 |
+
"loss": 0.0069,
|
968 |
+
"reward": 1.7376306042075158,
|
969 |
+
"reward_std": 0.7329583563841879,
|
970 |
+
"rewards/accuracy_reward": 0.4400669841095805,
|
971 |
+
"rewards/cosine_scaled_reward": 0.34566624723374845,
|
972 |
"rewards/format_reward": 0.0,
|
973 |
+
"rewards/reasoning_steps_reward": 0.951897357404232,
|
974 |
"step": 590
|
975 |
},
|
976 |
{
|
977 |
+
"completion_length": 719.5832901000977,
|
978 |
"epoch": 0.9276258575707798,
|
979 |
+
"grad_norm": 0.008595325912121869,
|
980 |
+
"kl": 0.1673126220703125,
|
981 |
"learning_rate": 3.0774636389618196e-07,
|
982 |
+
"loss": 0.0067,
|
983 |
+
"reward": 1.7701299749314785,
|
984 |
+
"reward_std": 0.7306436906568706,
|
985 |
+
"rewards/accuracy_reward": 0.4577009153552353,
|
986 |
+
"rewards/cosine_scaled_reward": 0.360122480080463,
|
987 |
"rewards/format_reward": 0.0,
|
988 |
+
"rewards/reasoning_steps_reward": 0.9523065883666277,
|
989 |
"step": 600
|
990 |
},
|
991 |
{
|
992 |
"epoch": 0.9276258575707798,
|
993 |
+
"eval_completion_length": 705.7041168212891,
|
994 |
+
"eval_kl": 0.164794921875,
|
995 |
+
"eval_loss": 0.006647891830652952,
|
996 |
+
"eval_reward": 1.8423524498939514,
|
997 |
+
"eval_reward_std": 0.6980961859226227,
|
998 |
+
"eval_rewards/accuracy_reward": 0.4832589626312256,
|
999 |
+
"eval_rewards/cosine_scaled_reward": 0.39815596491098404,
|
1000 |
"eval_rewards/format_reward": 0.0,
|
1001 |
+
"eval_rewards/reasoning_steps_reward": 0.9609375596046448,
|
1002 |
+
"eval_runtime": 63.3214,
|
1003 |
+
"eval_samples_per_second": 1.563,
|
1004 |
+
"eval_steps_per_second": 0.016,
|
1005 |
"step": 600
|
1006 |
},
|
1007 |
{
|
1008 |
+
"completion_length": 719.9855236053467,
|
1009 |
"epoch": 0.9430862885302927,
|
1010 |
+
"grad_norm": 0.014629584328010486,
|
1011 |
+
"kl": 0.17073974609375,
|
1012 |
"learning_rate": 1.8886465094192895e-07,
|
1013 |
+
"loss": 0.0068,
|
1014 |
+
"reward": 1.7343647606670856,
|
1015 |
+
"reward_std": 0.7088968453928828,
|
1016 |
+
"rewards/accuracy_reward": 0.4390625214669853,
|
1017 |
+
"rewards/cosine_scaled_reward": 0.3427352339422214,
|
1018 |
"rewards/format_reward": 0.0,
|
1019 |
+
"rewards/reasoning_steps_reward": 0.9525669939815998,
|
1020 |
"step": 610
|
1021 |
},
|
1022 |
{
|
1023 |
+
"completion_length": 721.9812828063965,
|
1024 |
"epoch": 0.9585467194898057,
|
1025 |
+
"grad_norm": 0.020088112225356343,
|
1026 |
+
"kl": 0.1849456787109375,
|
1027 |
"learning_rate": 9.866173494794462e-08,
|
1028 |
+
"loss": 0.0074,
|
1029 |
+
"reward": 1.7370413817465304,
|
1030 |
+
"reward_std": 0.7334370331838727,
|
1031 |
+
"rewards/accuracy_reward": 0.44151787713635715,
|
1032 |
+
"rewards/cosine_scaled_reward": 0.34656511796929407,
|
1033 |
"rewards/format_reward": 0.0,
|
1034 |
+
"rewards/reasoning_steps_reward": 0.9489583697170019,
|
1035 |
"step": 620
|
1036 |
},
|
1037 |
{
|
1038 |
+
"completion_length": 724.9088500976562,
|
1039 |
"epoch": 0.9740071504493187,
|
1040 |
+
"grad_norm": 0.009230798738629389,
|
1041 |
+
"kl": 0.179193115234375,
|
1042 |
"learning_rate": 3.7401286837214224e-08,
|
1043 |
+
"loss": 0.0072,
|
1044 |
+
"reward": 1.7149522617459296,
|
1045 |
+
"reward_std": 0.740879999101162,
|
1046 |
+
"rewards/accuracy_reward": 0.43069198355078697,
|
1047 |
+
"rewards/cosine_scaled_reward": 0.3343346292153001,
|
1048 |
"rewards/format_reward": 0.0,
|
1049 |
+
"rewards/reasoning_steps_reward": 0.9499256368726492,
|
1050 |
"step": 630
|
1051 |
},
|
1052 |
{
|
1053 |
+
"completion_length": 733.0805023193359,
|
1054 |
"epoch": 0.9894675814088317,
|
1055 |
+
"grad_norm": 0.013971562093972711,
|
1056 |
+
"kl": 0.177264404296875,
|
1057 |
"learning_rate": 5.262376196544239e-09,
|
1058 |
+
"loss": 0.0071,
|
1059 |
+
"reward": 1.6887946531176568,
|
1060 |
+
"reward_std": 0.7455704480409622,
|
1061 |
+
"rewards/accuracy_reward": 0.4194196627475321,
|
1062 |
+
"rewards/cosine_scaled_reward": 0.3205654217163101,
|
1063 |
"rewards/format_reward": 0.0,
|
1064 |
+
"rewards/reasoning_steps_reward": 0.9488095600157976,
|
1065 |
"step": 640
|
1066 |
},
|
1067 |
{
|
1068 |
+
"completion_length": 726.9589246114095,
|
1069 |
"epoch": 0.9987438399845395,
|
1070 |
+
"kl": 0.1743927001953125,
|
1071 |
+
"reward": 1.7412781628469627,
|
1072 |
+
"reward_std": 0.7270878640313944,
|
1073 |
+
"rewards/accuracy_reward": 0.444568472293516,
|
1074 |
+
"rewards/cosine_scaled_reward": 0.34755290367562947,
|
1075 |
"rewards/format_reward": 0.0,
|
1076 |
+
"rewards/reasoning_steps_reward": 0.9491567853838205,
|
1077 |
"step": 646,
|
1078 |
"total_flos": 0.0,
|
1079 |
+
"train_loss": 0.007009532302370239,
|
1080 |
+
"train_runtime": 74639.7368,
|
1081 |
+
"train_samples_per_second": 0.971,
|
1082 |
+
"train_steps_per_second": 0.009
|
1083 |
}
|
1084 |
],
|
1085 |
"logging_steps": 10,
|
|
|
1100 |
}
|
1101 |
},
|
1102 |
"total_flos": 0.0,
|
1103 |
+
"train_batch_size": 8,
|
1104 |
"trial_name": null,
|
1105 |
"trial_params": null
|
1106 |
}
|
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e04bfc998f8e18fbbd2065db820a1e406e5420c727e971f5e44939e03c82128
|
3 |
+
size 7416
|