qgallouedec HF staff commited on
Commit
8a11b45
·
verified ·
1 Parent(s): 7229b2f

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/huggingface/huggingface/runs/t256wwde)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/huggingface/huggingface/runs/0f5fvgp8)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.01602011715643949,
4
- "train_runtime": 67938.2282,
5
  "train_samples": 72441,
6
- "train_samples_per_second": 1.066,
7
- "train_steps_per_second": 0.01
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.007009532302370239,
4
+ "train_runtime": 74639.7368,
5
  "train_samples": 72441,
6
+ "train_samples_per_second": 0.971,
7
+ "train_steps_per_second": 0.009
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bf1d4882b9a83e72271397bc79ecb7db9444234d1c6e30017d7791b27bb18aa
3
  size 3554214752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:120de27679fadf851221dcd9419b615665e01c651c2dbeb646d254688d67fe3e
3
  size 3554214752
runs/Jan31_21-54-01_ip-26-0-160-192/events.out.tfevents.1738360500.ip-26-0-160-192.374216.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0199d7a43ae9d04a22f332fe814b32a948d16d775301e28eb6103cd52baca08a
3
+ size 5622
runs/Jan31_22-01-36_ip-26-0-160-192/events.out.tfevents.1738360955.ip-26-0-160-192.377629.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cb2239cee3f4c66347b42e07f91165a59938dbac39eca3e2df5c725d0777656
3
+ size 16622
runs/Jan31_22-07-53_ip-26-0-161-78/events.out.tfevents.1738361338.ip-26-0-161-78.2097139.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f1c7e18d5bf594fedc1219c195bfb0304b20df5f799da8b1abec366ace1898a
3
+ size 16621
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.01602011715643949,
4
- "train_runtime": 67938.2282,
5
  "train_samples": 72441,
6
- "train_samples_per_second": 1.066,
7
- "train_steps_per_second": 0.01
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.007009532302370239,
4
+ "train_runtime": 74639.7368,
5
  "train_samples": 72441,
6
+ "train_samples_per_second": 0.971,
7
+ "train_steps_per_second": 0.009
8
  }
trainer_state.json CHANGED
@@ -9,935 +9,1077 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 890.5919452667237,
13
  "epoch": 0.015460430959512996,
14
- "grad_norm": 0.0048114829798690025,
15
- "kl": 0.0004779815673828125,
16
  "learning_rate": 3.0769230769230774e-06,
17
  "loss": 0.0,
18
- "reward": 0.2079081600648351,
19
- "reward_std": 0.18389849485829474,
20
- "rewards/accuracy_reward": 0.2079081600648351,
 
21
  "rewards/format_reward": 0.0,
 
22
  "step": 10
23
  },
24
  {
25
- "completion_length": 815.0035549163819,
26
  "epoch": 0.03092086191902599,
27
- "grad_norm": 0.0037700873930097754,
28
- "kl": 0.005376839637756347,
29
  "learning_rate": 6.153846153846155e-06,
30
  "loss": 0.0002,
31
- "reward": 0.3485969334375113,
32
- "reward_std": 0.20335620292462409,
33
- "rewards/accuracy_reward": 0.3485969334375113,
 
34
  "rewards/format_reward": 0.0,
 
35
  "step": 20
36
  },
37
  {
38
- "completion_length": 783.1976894378662,
39
  "epoch": 0.04638129287853899,
40
- "grad_norm": 0.003633377980625143,
41
- "kl": 0.008260059356689452,
42
  "learning_rate": 9.230769230769232e-06,
43
- "loss": 0.0003,
44
- "reward": 0.41823978765169156,
45
- "reward_std": 0.214983982546255,
46
- "rewards/accuracy_reward": 0.41823978765169156,
 
47
  "rewards/format_reward": 0.0,
 
48
  "step": 30
49
  },
50
  {
51
- "completion_length": 796.1503639221191,
52
  "epoch": 0.06184172383805198,
53
- "grad_norm": 0.004207792051316878,
54
- "kl": 0.013314247131347656,
55
  "learning_rate": 1.230769230769231e-05,
56
- "loss": 0.0005,
57
- "reward": 0.45267856353893876,
58
- "reward_std": 0.22209063512273133,
59
- "rewards/accuracy_reward": 0.45267856353893876,
 
60
  "rewards/format_reward": 0.0,
 
61
  "step": 40
62
  },
63
  {
64
- "completion_length": 707.4483289718628,
65
  "epoch": 0.07730215479756498,
66
- "grad_norm": 0.0039377851262065435,
67
- "kl": 0.023092269897460938,
68
  "learning_rate": 1.5384615384615387e-05,
69
  "loss": 0.0009,
70
- "reward": 0.5378826450556516,
71
- "reward_std": 0.2228247532621026,
72
- "rewards/accuracy_reward": 0.5378826450556516,
 
73
  "rewards/format_reward": 0.0,
 
74
  "step": 50
75
  },
76
  {
77
- "completion_length": 738.098327255249,
78
  "epoch": 0.09276258575707798,
79
- "grad_norm": 0.004922000110022058,
80
- "kl": 0.03480720520019531,
81
  "learning_rate": 1.8461538461538465e-05,
82
- "loss": 0.0014,
83
- "reward": 0.5255101950955577,
84
- "reward_std": 0.21384936766698956,
85
- "rewards/accuracy_reward": 0.5255101950955577,
 
86
  "rewards/format_reward": 0.0,
 
87
  "step": 60
88
  },
89
  {
90
- "completion_length": 714.9747295379639,
91
  "epoch": 0.10822301671659097,
92
- "grad_norm": 0.025813508862428863,
93
- "kl": 0.11458740234375,
94
  "learning_rate": 1.999634547413886e-05,
95
- "loss": 0.0046,
96
- "reward": 0.5369897866621614,
97
- "reward_std": 0.24942327085882426,
98
- "rewards/accuracy_reward": 0.5369897866621614,
 
99
  "rewards/format_reward": 0.0,
 
100
  "step": 70
101
  },
102
  {
103
- "completion_length": 729.6705226898193,
104
  "epoch": 0.12368344767610397,
105
- "grad_norm": 0.004934077059395142,
106
- "kl": 0.1241455078125,
107
  "learning_rate": 1.9967125291968495e-05,
108
- "loss": 0.005,
109
- "reward": 0.5182397849857807,
110
- "reward_std": 0.22350065293721855,
111
- "rewards/accuracy_reward": 0.5182397849857807,
 
112
  "rewards/format_reward": 0.0,
 
113
  "step": 80
114
  },
115
  {
116
- "completion_length": 752.231746673584,
117
  "epoch": 0.13914387863561697,
118
- "grad_norm": 0.004490175226607394,
119
- "kl": 0.1960784912109375,
120
  "learning_rate": 1.990877034074683e-05,
121
- "loss": 0.0078,
122
- "reward": 0.47576529716607185,
123
- "reward_std": 0.21953069823794066,
124
- "rewards/accuracy_reward": 0.47576529716607185,
 
125
  "rewards/format_reward": 0.0,
 
126
  "step": 90
127
  },
128
  {
129
- "completion_length": 735.3539382934571,
130
  "epoch": 0.15460430959512997,
131
- "grad_norm": 0.11798471666893984,
132
- "kl": 0.30410003662109375,
133
  "learning_rate": 1.9821451197042028e-05,
134
- "loss": 0.0122,
135
- "reward": 0.4843112153466791,
136
- "reward_std": 0.22822934831492603,
137
- "rewards/accuracy_reward": 0.4843112153466791,
 
138
  "rewards/format_reward": 0.0,
 
139
  "step": 100
140
  },
141
  {
142
  "epoch": 0.15460430959512997,
143
- "eval_completion_length": 649.0448852539063,
144
- "eval_kl": 0.27952473958333335,
145
- "eval_loss": 0.009933823719620705,
146
- "eval_reward": 0.45850338935852053,
147
- "eval_reward_std": 0.26114755471547446,
148
- "eval_rewards/accuracy_reward": 0.45850338935852053,
 
149
  "eval_rewards/format_reward": 0.0,
150
- "eval_runtime": 88.3141,
151
- "eval_samples_per_second": 1.121,
152
- "eval_steps_per_second": 0.17,
 
153
  "step": 100
154
  },
155
  {
156
- "completion_length": 732.8517738342285,
157
  "epoch": 0.17006474055464296,
158
- "grad_norm": 0.02615303400330413,
159
- "kl": 1.07412109375,
160
  "learning_rate": 1.9705423102261324e-05,
161
- "loss": 0.043,
162
- "reward": 0.3991071363911033,
163
- "reward_std": 0.2730441292747855,
164
- "rewards/accuracy_reward": 0.3991071363911033,
 
165
  "rewards/format_reward": 0.0,
 
166
  "step": 110
167
  },
168
  {
169
- "completion_length": 792.610315322876,
170
  "epoch": 0.18552517151415596,
171
- "grad_norm": 0.0287118341101585,
172
- "kl": 0.542120361328125,
173
  "learning_rate": 1.956102521655831e-05,
174
- "loss": 0.0217,
175
- "reward": 0.3727040741709061,
176
- "reward_std": 0.26482684616930785,
177
- "rewards/accuracy_reward": 0.3727040741709061,
 
178
  "rewards/format_reward": 0.0,
 
179
  "step": 120
180
  },
181
  {
182
- "completion_length": 742.854447555542,
183
  "epoch": 0.20098560247366895,
184
- "grad_norm": 0.026940670011693296,
185
- "kl": 0.4710235595703125,
186
  "learning_rate": 1.9388679627438486e-05,
187
- "loss": 0.0188,
188
- "reward": 0.46033162334933875,
189
- "reward_std": 0.24005382088944316,
190
- "rewards/accuracy_reward": 0.46033162334933875,
 
191
  "rewards/format_reward": 0.0,
 
192
  "step": 130
193
  },
194
  {
195
- "completion_length": 747.0299587249756,
196
  "epoch": 0.21644603343318194,
197
- "grad_norm": 0.044131709279297326,
198
- "kl": 0.3888519287109375,
199
  "learning_rate": 1.9188890115960967e-05,
200
- "loss": 0.0156,
201
- "reward": 0.4734693782404065,
202
- "reward_std": 0.2484902088996023,
203
- "rewards/accuracy_reward": 0.4734693782404065,
 
204
  "rewards/format_reward": 0.0,
 
205
  "step": 140
206
  },
207
  {
208
- "completion_length": 719.1231998443603,
209
  "epoch": 0.23190646439269494,
210
- "grad_norm": 0.04685629239096149,
211
- "kl": 0.6114959716796875,
212
  "learning_rate": 1.8962240684142923e-05,
213
- "loss": 0.0245,
214
- "reward": 0.4772959094494581,
215
- "reward_std": 0.26747574456967416,
216
- "rewards/accuracy_reward": 0.4772959094494581,
 
217
  "rewards/format_reward": 0.0,
 
218
  "step": 150
219
  },
220
  {
221
- "completion_length": 752.0598068237305,
222
  "epoch": 0.24736689535220793,
223
- "grad_norm": 0.00786304561850755,
224
- "kl": 0.27758941650390623,
225
  "learning_rate": 1.8709393847871146e-05,
226
- "loss": 0.0111,
227
- "reward": 0.4918367238715291,
228
- "reward_std": 0.24101508525200188,
229
- "rewards/accuracy_reward": 0.4918367238715291,
 
230
  "rewards/format_reward": 0.0,
 
231
  "step": 160
232
  },
233
  {
234
- "completion_length": 713.9837879180908,
235
  "epoch": 0.26282732631172095,
236
- "grad_norm": 0.005284057529020878,
237
- "kl": 0.22295989990234374,
238
  "learning_rate": 1.8431088700310846e-05,
239
- "loss": 0.0089,
240
- "reward": 0.5107142773456872,
241
- "reward_std": 0.2484118543099612,
242
- "rewards/accuracy_reward": 0.5107142773456872,
 
243
  "rewards/format_reward": 0.0,
 
244
  "step": 170
245
  },
246
  {
247
- "completion_length": 721.1976871490479,
248
  "epoch": 0.27828775727123395,
249
- "grad_norm": 0.0059488370043247725,
250
- "kl": 0.27528076171875,
251
  "learning_rate": 1.8128138751472432e-05,
252
- "loss": 0.011,
253
- "reward": 0.4794642778113484,
254
- "reward_std": 0.28188897627405823,
255
- "rewards/accuracy_reward": 0.4794642778113484,
 
256
  "rewards/format_reward": 0.0,
 
257
  "step": 180
258
  },
259
  {
260
- "completion_length": 685.0909267425537,
261
  "epoch": 0.29374818823074694,
262
- "grad_norm": 0.024776439473654245,
263
- "kl": 0.3252960205078125,
264
  "learning_rate": 1.780142955025139e-05,
265
- "loss": 0.013,
266
- "reward": 0.49030611482448877,
267
- "reward_std": 0.29441971494816244,
268
- "rewards/accuracy_reward": 0.49030611482448877,
 
269
  "rewards/format_reward": 0.0,
 
270
  "step": 190
271
  },
272
  {
273
- "completion_length": 688.9476896286011,
274
  "epoch": 0.30920861919025994,
275
- "grad_norm": 0.018511152998280257,
276
- "kl": 0.407830810546875,
277
  "learning_rate": 1.745191609589231e-05,
278
- "loss": 0.0163,
279
- "reward": 0.4463010121136904,
280
- "reward_std": 0.29204082568176093,
281
- "rewards/accuracy_reward": 0.4463010121136904,
 
282
  "rewards/format_reward": 0.0,
 
283
  "step": 200
284
  },
285
  {
286
  "epoch": 0.30920861919025994,
287
- "eval_completion_length": 684.5197184244792,
288
- "eval_kl": 0.31907552083333335,
289
- "eval_loss": 0.013225244358181953,
290
- "eval_reward": 0.5034013529618581,
291
- "eval_reward_std": 0.26736749211947125,
292
- "eval_rewards/accuracy_reward": 0.5034013529618581,
 
293
  "eval_rewards/format_reward": 0.0,
294
- "eval_runtime": 88.6591,
295
- "eval_samples_per_second": 1.117,
296
- "eval_steps_per_second": 0.169,
 
297
  "step": 200
298
  },
299
  {
300
- "completion_length": 688.203684425354,
301
  "epoch": 0.32466905014977293,
302
- "grad_norm": 0.04166474923994624,
303
- "kl": 0.4008056640625,
304
  "learning_rate": 1.7080620046443503e-05,
305
- "loss": 0.016,
306
- "reward": 0.45382652347907426,
307
- "reward_std": 0.27682951451279225,
308
- "rewards/accuracy_reward": 0.45382652347907426,
 
309
  "rewards/format_reward": 0.0,
 
310
  "step": 210
311
  },
312
  {
313
- "completion_length": 671.052919960022,
314
  "epoch": 0.3401294811092859,
315
- "grad_norm": 0.03352718681096581,
316
- "kl": 0.3749359130859375,
317
  "learning_rate": 1.6688626732362192e-05,
318
- "loss": 0.015,
319
- "reward": 0.4804846870712936,
320
- "reward_std": 0.2816257219295949,
321
- "rewards/accuracy_reward": 0.4804846870712936,
 
322
  "rewards/format_reward": 0.0,
 
323
  "step": 220
324
  },
325
  {
326
- "completion_length": 643.8374883651734,
327
  "epoch": 0.3555899120687989,
328
- "grad_norm": 0.020351629255596172,
329
- "kl": 0.41683349609375,
330
  "learning_rate": 1.6277081983999742e-05,
331
- "loss": 0.0167,
332
- "reward": 0.4418367271311581,
333
- "reward_std": 0.28084711018018427,
334
- "rewards/accuracy_reward": 0.4418367271311581,
 
335
  "rewards/format_reward": 0.0,
 
336
  "step": 230
337
  },
338
  {
339
- "completion_length": 676.2223068237305,
340
  "epoch": 0.3710503430283119,
341
- "grad_norm": 0.05256794474201724,
342
- "kl": 0.4444091796875,
343
  "learning_rate": 1.5847188782240473e-05,
344
- "loss": 0.0178,
345
- "reward": 0.4517857059370726,
346
- "reward_std": 0.29922230960801244,
347
- "rewards/accuracy_reward": 0.4517857059370726,
 
348
  "rewards/format_reward": 0.0,
 
349
  "step": 240
350
  },
351
  {
352
- "completion_length": 668.4691181182861,
353
  "epoch": 0.3865107739878249,
354
- "grad_norm": 0.03902146766301656,
355
- "kl": 0.5255126953125,
356
  "learning_rate": 1.5400203742084508e-05,
357
- "loss": 0.021,
358
- "reward": 0.4383928496390581,
359
- "reward_std": 0.2969975466839969,
360
- "rewards/accuracy_reward": 0.4383928496390581,
 
361
  "rewards/format_reward": 0.0,
 
362
  "step": 250
363
  },
364
  {
365
- "completion_length": 603.6825130462646,
366
  "epoch": 0.4019712049473379,
367
- "grad_norm": 0.02207794461900282,
368
- "kl": 0.4320556640625,
369
  "learning_rate": 1.4937433439453465e-05,
370
- "loss": 0.0173,
371
- "reward": 0.45331631754525004,
372
- "reward_std": 0.29558597495779393,
373
- "rewards/accuracy_reward": 0.45331631754525004,
 
374
  "rewards/format_reward": 0.0,
 
375
  "step": 260
376
  },
377
  {
378
- "completion_length": 640.7654218673706,
379
  "epoch": 0.4174316359068509,
380
- "grad_norm": 0.02435264101678905,
381
- "kl": 0.510498046875,
382
  "learning_rate": 1.4460230591956097e-05,
383
- "loss": 0.0204,
384
- "reward": 0.4378826460801065,
385
- "reward_std": 0.2932774598710239,
386
- "rewards/accuracy_reward": 0.4378826460801065,
 
387
  "rewards/format_reward": 0.0,
 
388
  "step": 270
389
  },
390
  {
391
- "completion_length": 645.673583984375,
392
  "epoch": 0.4328920668663639,
393
- "grad_norm": 0.08816250567178784,
394
- "kl": 0.658056640625,
395
  "learning_rate": 1.3969990104777712e-05,
396
- "loss": 0.0263,
397
- "reward": 0.42423468651250007,
398
- "reward_std": 0.2902136994060129,
399
- "rewards/accuracy_reward": 0.42423468651250007,
 
400
  "rewards/format_reward": 0.0,
 
401
  "step": 280
402
  },
403
  {
404
- "completion_length": 620.3529205322266,
405
  "epoch": 0.4483524978258769,
406
- "grad_norm": 0.03880404727089487,
407
- "kl": 0.92225341796875,
408
  "learning_rate": 1.3468144993251735e-05,
409
- "loss": 0.0369,
410
- "reward": 0.45433672657236457,
411
- "reward_std": 0.29660415309481325,
412
- "rewards/accuracy_reward": 0.45433672657236457,
 
413
  "rewards/format_reward": 0.0,
 
414
  "step": 290
415
  },
416
  {
417
- "completion_length": 662.0209051132202,
418
  "epoch": 0.4638129287853899,
419
- "grad_norm": 0.0580905052695571,
420
- "kl": 1.16871337890625,
421
  "learning_rate": 1.295616219403197e-05,
422
- "loss": 0.0468,
423
- "reward": 0.4096938706934452,
424
- "reward_std": 0.3028755730483681,
425
- "rewards/accuracy_reward": 0.4096938706934452,
 
426
  "rewards/format_reward": 0.0,
 
427
  "step": 300
428
  },
429
  {
430
  "epoch": 0.4638129287853899,
431
- "eval_completion_length": 604.8503275553386,
432
- "eval_kl": 1.15458984375,
433
- "eval_loss": 0.04879453405737877,
434
- "eval_reward": 0.4707482943932215,
435
- "eval_reward_std": 0.27726571063200633,
436
- "eval_rewards/accuracy_reward": 0.4707482943932215,
 
437
  "eval_rewards/format_reward": 0.0,
438
- "eval_runtime": 85.9413,
439
- "eval_samples_per_second": 1.152,
440
- "eval_steps_per_second": 0.175,
 
441
  "step": 300
442
  },
443
  {
444
- "completion_length": 618.5189908981323,
445
  "epoch": 0.47927335974490287,
446
- "grad_norm": 0.04382094795523676,
447
- "kl": 0.9635986328125,
448
  "learning_rate": 1.2435538277109919e-05,
449
- "loss": 0.0385,
450
- "reward": 0.44183672638610005,
451
- "reward_std": 0.28991906996816397,
452
- "rewards/accuracy_reward": 0.44183672638610005,
 
453
  "rewards/format_reward": 0.0,
 
454
  "step": 310
455
  },
456
  {
457
- "completion_length": 576.9373609542847,
458
  "epoch": 0.49473379070441587,
459
- "grad_norm": 0.03238899986450347,
460
- "kl": 0.90596923828125,
461
  "learning_rate": 1.19077950712113e-05,
462
- "loss": 0.0362,
463
- "reward": 0.4463010119274259,
464
- "reward_std": 0.27278245403431356,
465
- "rewards/accuracy_reward": 0.4463010119274259,
 
466
  "rewards/format_reward": 0.0,
 
467
  "step": 320
468
  },
469
  {
470
- "completion_length": 535.9521591186524,
471
  "epoch": 0.5101942216639289,
472
- "grad_norm": 0.018431343598236234,
473
- "kl": 0.5312744140625,
474
  "learning_rate": 1.137447521535908e-05,
475
- "loss": 0.0213,
476
- "reward": 0.5080357047729194,
477
- "reward_std": 0.2620480744168162,
478
- "rewards/accuracy_reward": 0.5080357047729194,
 
479
  "rewards/format_reward": 0.0,
 
480
  "step": 330
481
  },
482
  {
483
- "completion_length": 646.3938673019409,
484
  "epoch": 0.5256546526234419,
485
- "grad_norm": 0.02317552951279719,
486
- "kl": 0.513262939453125,
487
  "learning_rate": 1.0837137649606241e-05,
488
- "loss": 0.0205,
489
- "reward": 0.44579080818220973,
490
- "reward_std": 0.29469514368101957,
491
- "rewards/accuracy_reward": 0.44579080818220973,
 
492
  "rewards/format_reward": 0.0,
 
493
  "step": 340
494
  },
495
  {
496
- "completion_length": 586.0761362075806,
497
  "epoch": 0.5411150835829549,
498
- "grad_norm": 0.016226017261457597,
499
- "kl": 0.3769073486328125,
500
  "learning_rate": 1.0297353058119209e-05,
501
- "loss": 0.0151,
502
- "reward": 0.4859693799167871,
503
- "reward_std": 0.2713043099734932,
504
- "rewards/accuracy_reward": 0.4859693799167871,
 
505
  "rewards/format_reward": 0.0,
 
506
  "step": 350
507
  },
508
  {
509
- "completion_length": 571.9863378524781,
510
  "epoch": 0.5565755145424679,
511
- "grad_norm": 0.0504216391704931,
512
- "kl": 0.599774169921875,
513
  "learning_rate": 9.756699277932196e-06,
514
- "loss": 0.024,
515
- "reward": 0.48443876539822667,
516
- "reward_std": 0.28054420156404375,
517
- "rewards/accuracy_reward": 0.48443876539822667,
 
518
  "rewards/format_reward": 0.0,
 
519
  "step": 360
520
  },
521
  {
522
- "completion_length": 563.5765197753906,
523
  "epoch": 0.5720359455019809,
524
- "grad_norm": 0.13990571845181696,
525
- "kl": 0.52132568359375,
526
  "learning_rate": 9.216756686793163e-06,
527
- "loss": 0.0209,
528
- "reward": 0.45790815523359923,
529
- "reward_std": 0.26586609268561007,
530
- "rewards/accuracy_reward": 0.45790815523359923,
 
531
  "rewards/format_reward": 0.0,
 
532
  "step": 370
533
  },
534
  {
535
- "completion_length": 575.829453086853,
536
  "epoch": 0.5874963764614939,
537
- "grad_norm": 0.013870992065908727,
538
- "kl": 0.330126953125,
539
  "learning_rate": 8.67910358358298e-06,
540
- "loss": 0.0132,
541
- "reward": 0.4734693790320307,
542
- "reward_std": 0.27416237886063755,
543
- "rewards/accuracy_reward": 0.4734693790320307,
 
544
  "rewards/format_reward": 0.0,
 
545
  "step": 380
546
  },
547
  {
548
- "completion_length": 568.1160606384277,
549
  "epoch": 0.6029568074210069,
550
- "grad_norm": 0.010163464330427247,
551
- "kl": 0.2426544189453125,
552
  "learning_rate": 8.145311574811325e-06,
553
- "loss": 0.0097,
554
- "reward": 0.4954081534408033,
555
- "reward_std": 0.26710083298385146,
556
- "rewards/accuracy_reward": 0.4954081534408033,
 
557
  "rewards/format_reward": 0.0,
 
558
  "step": 390
559
  },
560
  {
561
- "completion_length": 665.6186065673828,
562
  "epoch": 0.6184172383805199,
563
- "grad_norm": 0.013029334976500356,
564
- "kl": 0.321246337890625,
565
  "learning_rate": 7.616940980675004e-06,
566
- "loss": 0.0128,
567
- "reward": 0.4474489719606936,
568
- "reward_std": 0.27854115669615565,
569
- "rewards/accuracy_reward": 0.4474489719606936,
 
570
  "rewards/format_reward": 0.0,
 
571
  "step": 400
572
  },
573
  {
574
  "epoch": 0.6184172383805199,
575
- "eval_completion_length": 627.0203979492187,
576
- "eval_kl": 0.2732747395833333,
577
- "eval_loss": 0.011420400813221931,
578
- "eval_reward": 0.5047619010011355,
579
- "eval_reward_std": 0.2536137938499451,
580
- "eval_rewards/accuracy_reward": 0.5047619010011355,
 
581
  "eval_rewards/format_reward": 0.0,
582
- "eval_runtime": 87.816,
583
- "eval_samples_per_second": 1.127,
584
- "eval_steps_per_second": 0.171,
 
585
  "step": 400
586
  },
587
  {
588
- "completion_length": 646.4711584091186,
589
  "epoch": 0.6338776693400329,
590
- "grad_norm": 0.023099135498393764,
591
- "kl": 0.343701171875,
592
  "learning_rate": 7.095536274107046e-06,
593
- "loss": 0.0137,
594
- "reward": 0.4645408088341355,
595
- "reward_std": 0.2891133207827806,
596
- "rewards/accuracy_reward": 0.4645408088341355,
 
597
  "rewards/format_reward": 0.0,
 
598
  "step": 410
599
  },
600
  {
601
- "completion_length": 587.3729467391968,
602
  "epoch": 0.6493381002995459,
603
- "grad_norm": 0.013467292425479267,
604
- "kl": 0.2858642578125,
605
  "learning_rate": 6.58262156614881e-06,
606
- "loss": 0.0114,
607
- "reward": 0.503188765514642,
608
- "reward_std": 0.2590713477227837,
609
- "rewards/accuracy_reward": 0.503188765514642,
 
610
  "rewards/format_reward": 0.0,
 
611
  "step": 420
612
  },
613
  {
614
- "completion_length": 594.2052188873291,
615
  "epoch": 0.6647985312590589,
616
- "grad_norm": 0.013766538000227693,
617
- "kl": 0.285015869140625,
618
  "learning_rate": 6.079696150841634e-06,
619
- "loss": 0.0114,
620
- "reward": 0.4913265212439001,
621
- "reward_std": 0.2684762907214463,
622
- "rewards/accuracy_reward": 0.4913265212439001,
 
623
  "rewards/format_reward": 0.0,
 
624
  "step": 430
625
  },
626
  {
627
- "completion_length": 631.7939949035645,
628
  "epoch": 0.6802589622185718,
629
- "grad_norm": 0.07419237005783043,
630
- "kl": 0.346197509765625,
631
  "learning_rate": 5.588230122660672e-06,
632
- "loss": 0.0138,
633
- "reward": 0.4860969296656549,
634
- "reward_std": 0.2773646651767194,
635
- "rewards/accuracy_reward": 0.4860969296656549,
 
636
  "rewards/format_reward": 0.0,
 
637
  "step": 440
638
  },
639
  {
640
- "completion_length": 586.2428462982177,
641
  "epoch": 0.6957193931780848,
642
- "grad_norm": 0.015937060852966797,
643
- "kl": 0.2478759765625,
644
  "learning_rate": 5.109660079301668e-06,
645
- "loss": 0.0099,
646
- "reward": 0.5220663199201226,
647
- "reward_std": 0.2621162030380219,
648
- "rewards/accuracy_reward": 0.5220663199201226,
 
649
  "rewards/format_reward": 0.0,
 
650
  "step": 450
651
  },
652
  {
653
- "completion_length": 641.8402923583984,
654
  "epoch": 0.7111798241375978,
655
- "grad_norm": 0.04445799644422213,
656
- "kl": 0.3908416748046875,
657
  "learning_rate": 4.64538492238166e-06,
658
- "loss": 0.0156,
659
- "reward": 0.4795918288640678,
660
- "reward_std": 0.28201754316687583,
661
- "rewards/accuracy_reward": 0.4795918288640678,
 
662
  "rewards/format_reward": 0.0,
 
663
  "step": 460
664
  },
665
  {
666
- "completion_length": 648.5068748474121,
667
  "epoch": 0.7266402550971108,
668
- "grad_norm": 0.018272003157409882,
669
- "kl": 0.44942626953125,
670
  "learning_rate": 4.196761768328599e-06,
671
- "loss": 0.018,
672
- "reward": 0.45650509353727103,
673
- "reward_std": 0.2825955556239933,
674
- "rewards/accuracy_reward": 0.45650509353727103,
 
675
  "rewards/format_reward": 0.0,
 
676
  "step": 470
677
  },
678
  {
679
- "completion_length": 632.1244766235352,
680
  "epoch": 0.7421006860566238,
681
- "grad_norm": 0.03550172747608362,
682
- "kl": 0.54378662109375,
683
  "learning_rate": 3.7651019814126656e-06,
684
- "loss": 0.0218,
685
- "reward": 0.4636479509063065,
686
- "reward_std": 0.2770009428262711,
687
- "rewards/accuracy_reward": 0.4636479509063065,
 
688
  "rewards/format_reward": 0.0,
 
689
  "step": 480
690
  },
691
  {
692
- "completion_length": 632.8059818267823,
693
  "epoch": 0.7575611170161368,
694
- "grad_norm": 0.9437401498024955,
695
- "kl": 1.1802734375,
696
  "learning_rate": 3.3516673405151546e-06,
697
- "loss": 0.0472,
698
- "reward": 0.45255101229995487,
699
- "reward_std": 0.2746642493642867,
700
- "rewards/accuracy_reward": 0.45255101229995487,
 
701
  "rewards/format_reward": 0.0,
 
702
  "step": 490
703
  },
704
  {
705
- "completion_length": 586.2049638748169,
706
  "epoch": 0.7730215479756498,
707
- "grad_norm": 0.11836217149566043,
708
- "kl": 0.545013427734375,
709
  "learning_rate": 2.957666350839663e-06,
710
- "loss": 0.0218,
711
- "reward": 0.48941325647756456,
712
- "reward_std": 0.263414288777858,
713
- "rewards/accuracy_reward": 0.48941325647756456,
 
714
  "rewards/format_reward": 0.0,
 
715
  "step": 500
716
  },
717
  {
718
  "epoch": 0.7730215479756498,
719
- "eval_completion_length": 551.6952229817708,
720
- "eval_kl": 0.28020833333333334,
721
- "eval_loss": 0.011639236472547054,
722
- "eval_reward": 0.5387754996617635,
723
- "eval_reward_std": 0.2534260580937068,
724
- "eval_rewards/accuracy_reward": 0.5387754996617635,
 
725
  "eval_rewards/format_reward": 0.0,
726
- "eval_runtime": 85.6752,
727
- "eval_samples_per_second": 1.156,
728
- "eval_steps_per_second": 0.175,
 
729
  "step": 500
730
  },
731
  {
732
- "completion_length": 598.6526651382446,
733
  "epoch": 0.7884819789351628,
734
- "grad_norm": 0.014324651971585192,
735
- "kl": 0.284661865234375,
736
  "learning_rate": 2.5842507113469307e-06,
737
- "loss": 0.0114,
738
- "reward": 0.49579080580733714,
739
- "reward_std": 0.2849952794611454,
740
- "rewards/accuracy_reward": 0.49579080580733714,
 
741
  "rewards/format_reward": 0.0,
 
742
  "step": 510
743
  },
744
  {
745
- "completion_length": 602.975754737854,
746
  "epoch": 0.8039424098946758,
747
- "grad_norm": 0.013302847380980731,
748
- "kl": 0.287591552734375,
749
  "learning_rate": 2.2325119482391466e-06,
750
- "loss": 0.0115,
751
- "reward": 0.4869897892698646,
752
- "reward_std": 0.28300182512030003,
753
- "rewards/accuracy_reward": 0.4869897892698646,
 
754
  "rewards/format_reward": 0.0,
 
755
  "step": 520
756
  },
757
  {
758
- "completion_length": 617.0672046661377,
759
  "epoch": 0.8194028408541888,
760
- "grad_norm": 0.011506624405655665,
761
- "kl": 0.316864013671875,
762
  "learning_rate": 1.9034782243345074e-06,
763
- "loss": 0.0127,
764
- "reward": 0.49005101155489683,
765
- "reward_std": 0.283530889172107,
766
- "rewards/accuracy_reward": 0.49005101155489683,
 
767
  "rewards/format_reward": 0.0,
 
768
  "step": 530
769
  },
770
  {
771
- "completion_length": 608.665803527832,
772
  "epoch": 0.8348632718137018,
773
- "grad_norm": 0.016337273376864313,
774
- "kl": 0.557275390625,
775
  "learning_rate": 1.5981113336584041e-06,
776
- "loss": 0.0223,
777
- "reward": 0.49119897168129684,
778
- "reward_std": 0.28141105216927825,
779
- "rewards/accuracy_reward": 0.49119897168129684,
 
780
  "rewards/format_reward": 0.0,
 
781
  "step": 540
782
  },
783
  {
784
- "completion_length": 625.1159269332886,
785
  "epoch": 0.8503237027732148,
786
- "grad_norm": 0.057861981590189904,
787
- "kl": 0.260540771484375,
788
  "learning_rate": 1.3173038900362977e-06,
789
- "loss": 0.0104,
790
- "reward": 0.4499999931082129,
791
- "reward_std": 0.27894868138246237,
792
- "rewards/accuracy_reward": 0.4499999931082129,
 
793
  "rewards/format_reward": 0.0,
 
794
  "step": 550
795
  },
796
  {
797
- "completion_length": 632.5747304916382,
798
  "epoch": 0.8657841337327278,
799
- "grad_norm": 0.030466073357798586,
800
- "kl": 0.293267822265625,
801
  "learning_rate": 1.0618767179063416e-06,
802
- "loss": 0.0117,
803
- "reward": 0.4618622355163097,
804
- "reward_std": 0.2925746965222061,
805
- "rewards/accuracy_reward": 0.4618622355163097,
 
806
  "rewards/format_reward": 0.0,
 
807
  "step": 560
808
  },
809
  {
810
- "completion_length": 641.7084041595459,
811
  "epoch": 0.8812445646922408,
812
- "grad_norm": 0.017331692484703218,
813
- "kl": 0.316583251953125,
814
  "learning_rate": 8.325764529785851e-07,
815
- "loss": 0.0127,
816
- "reward": 0.4483418272808194,
817
- "reward_std": 0.28196476846933366,
818
- "rewards/accuracy_reward": 0.4483418272808194,
 
819
  "rewards/format_reward": 0.0,
 
820
  "step": 570
821
  },
822
  {
823
- "completion_length": 646.2618488311767,
824
  "epoch": 0.8967049956517538,
825
- "grad_norm": 0.009327065043797485,
826
- "kl": 0.303216552734375,
827
  "learning_rate": 6.300733597542086e-07,
828
- "loss": 0.0121,
829
- "reward": 0.44196427753195167,
830
- "reward_std": 0.28285656329244374,
831
- "rewards/accuracy_reward": 0.44196427753195167,
 
832
  "rewards/format_reward": 0.0,
 
833
  "step": 580
834
  },
835
  {
836
- "completion_length": 626.1570028305053,
837
  "epoch": 0.9121654266112668,
838
- "grad_norm": 0.010427073776751795,
839
- "kl": 0.31864013671875,
840
  "learning_rate": 4.549593722844492e-07,
841
- "loss": 0.0127,
842
- "reward": 0.46224488839507105,
843
- "reward_std": 0.28828581105917694,
844
- "rewards/accuracy_reward": 0.46224488839507105,
 
845
  "rewards/format_reward": 0.0,
 
846
  "step": 590
847
  },
848
  {
849
- "completion_length": 628.7367259979249,
850
  "epoch": 0.9276258575707798,
851
- "grad_norm": 0.01713962537975438,
852
- "kl": 0.3144775390625,
853
  "learning_rate": 3.0774636389618196e-07,
854
- "loss": 0.0126,
855
- "reward": 0.4502550953067839,
856
- "reward_std": 0.2758318264503032,
857
- "rewards/accuracy_reward": 0.4502550953067839,
 
858
  "rewards/format_reward": 0.0,
 
859
  "step": 600
860
  },
861
  {
862
  "epoch": 0.9276258575707798,
863
- "eval_completion_length": 607.1224405924479,
864
- "eval_kl": 0.2647786458333333,
865
- "eval_loss": 0.011027935892343521,
866
- "eval_reward": 0.4816326439380646,
867
- "eval_reward_std": 0.25309162139892577,
868
- "eval_rewards/accuracy_reward": 0.4816326439380646,
 
869
  "eval_rewards/format_reward": 0.0,
870
- "eval_runtime": 87.0986,
871
- "eval_samples_per_second": 1.137,
872
- "eval_steps_per_second": 0.172,
 
873
  "step": 600
874
  },
875
  {
876
- "completion_length": 633.0672080993652,
877
  "epoch": 0.9430862885302927,
878
- "grad_norm": 0.022562920761009898,
879
- "kl": 0.302935791015625,
880
  "learning_rate": 1.8886465094192895e-07,
881
- "loss": 0.0121,
882
- "reward": 0.46020407443866135,
883
- "reward_std": 0.2828258784487844,
884
- "rewards/accuracy_reward": 0.46020407443866135,
 
885
  "rewards/format_reward": 0.0,
 
886
  "step": 610
887
  },
888
  {
889
- "completion_length": 626.6780488967895,
890
  "epoch": 0.9585467194898057,
891
- "grad_norm": 0.012428624166763307,
892
- "kl": 0.3028564453125,
893
  "learning_rate": 9.866173494794462e-08,
894
- "loss": 0.0121,
895
- "reward": 0.447959177242592,
896
- "reward_std": 0.27789597446098924,
897
- "rewards/accuracy_reward": 0.447959177242592,
 
898
  "rewards/format_reward": 0.0,
 
899
  "step": 620
900
  },
901
  {
902
- "completion_length": 630.8656784057617,
903
  "epoch": 0.9740071504493187,
904
- "grad_norm": 0.009372661260439265,
905
- "kl": 0.305780029296875,
906
  "learning_rate": 3.7401286837214224e-08,
907
- "loss": 0.0122,
908
- "reward": 0.4577806035755202,
909
- "reward_std": 0.27292990586720406,
910
- "rewards/accuracy_reward": 0.4577806035755202,
 
911
  "rewards/format_reward": 0.0,
 
912
  "step": 630
913
  },
914
  {
915
- "completion_length": 624.4659309387207,
916
  "epoch": 0.9894675814088317,
917
- "grad_norm": 0.012393498840602491,
918
- "kl": 0.301348876953125,
919
  "learning_rate": 5.262376196544239e-09,
920
- "loss": 0.0121,
921
- "reward": 0.4693877460435033,
922
- "reward_std": 0.27738194689154627,
923
- "rewards/accuracy_reward": 0.4693877460435033,
 
924
  "rewards/format_reward": 0.0,
 
925
  "step": 640
926
  },
927
  {
928
- "completion_length": 626.0862976710001,
929
  "epoch": 0.9987438399845395,
930
- "kl": 0.31524658203125,
931
- "reward": 0.46088434507449466,
932
- "reward_std": 0.28149245004169643,
933
- "rewards/accuracy_reward": 0.46088434507449466,
 
934
  "rewards/format_reward": 0.0,
 
935
  "step": 646,
936
  "total_flos": 0.0,
937
- "train_loss": 0.01602011715643949,
938
- "train_runtime": 67938.2282,
939
- "train_samples_per_second": 1.066,
940
- "train_steps_per_second": 0.01
941
  }
942
  ],
943
  "logging_steps": 10,
@@ -958,7 +1100,7 @@
958
  }
959
  },
960
  "total_flos": 0.0,
961
- "train_batch_size": 1,
962
  "trial_name": null,
963
  "trial_params": null
964
  }
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 928.2598609924316,
13
  "epoch": 0.015460430959512996,
14
+ "grad_norm": 0.005825439665555334,
15
+ "kl": 0.0004873394966125488,
16
  "learning_rate": 3.0769230769230774e-06,
17
  "loss": 0.0,
18
+ "reward": 0.6422783114481717,
19
+ "reward_std": 0.6085492318496108,
20
+ "rewards/accuracy_reward": 0.16450893601868302,
21
+ "rewards/cosine_scaled_reward": -0.15619643366662786,
22
  "rewards/format_reward": 0.0,
23
+ "rewards/reasoning_steps_reward": 0.6339658062905074,
24
  "step": 10
25
  },
26
  {
27
+ "completion_length": 834.5876502990723,
28
  "epoch": 0.03092086191902599,
29
+ "grad_norm": 0.005061101750122049,
30
+ "kl": 0.005776357650756836,
31
  "learning_rate": 6.153846153846155e-06,
32
  "loss": 0.0002,
33
+ "reward": 1.1410260727629065,
34
+ "reward_std": 0.6005165675655008,
35
+ "rewards/accuracy_reward": 0.3330357288941741,
36
+ "rewards/cosine_scaled_reward": 0.021941200397304784,
37
  "rewards/format_reward": 0.0,
38
+ "rewards/reasoning_steps_reward": 0.786049148067832,
39
  "step": 20
40
  },
41
  {
42
+ "completion_length": 794.6511512756348,
43
  "epoch": 0.04638129287853899,
44
+ "grad_norm": 0.004186908324406587,
45
+ "kl": 0.012411689758300782,
46
  "learning_rate": 9.230769230769232e-06,
47
+ "loss": 0.0005,
48
+ "reward": 1.5154079463332892,
49
+ "reward_std": 0.5513344288803637,
50
+ "rewards/accuracy_reward": 0.4193080538418144,
51
+ "rewards/cosine_scaled_reward": 0.14040787946141792,
52
  "rewards/format_reward": 0.0,
53
+ "rewards/reasoning_steps_reward": 0.9556920122355222,
54
  "step": 30
55
  },
56
  {
57
+ "completion_length": 823.3263763427734,
58
  "epoch": 0.06184172383805198,
59
+ "grad_norm": 0.004675533239892946,
60
+ "kl": 0.0160797119140625,
61
  "learning_rate": 1.230769230769231e-05,
62
+ "loss": 0.0006,
63
+ "reward": 1.6104407742619515,
64
+ "reward_std": 0.5197742725256831,
65
+ "rewards/accuracy_reward": 0.45156252263113855,
66
+ "rewards/cosine_scaled_reward": 0.18000916420933208,
67
  "rewards/format_reward": 0.0,
68
+ "rewards/reasoning_steps_reward": 0.9788690943270921,
69
  "step": 40
70
  },
71
  {
72
+ "completion_length": 824.9076248168946,
73
  "epoch": 0.07730215479756498,
74
+ "grad_norm": 0.008209128879452155,
75
+ "kl": 0.021560287475585936,
76
  "learning_rate": 1.5384615384615387e-05,
77
  "loss": 0.0009,
78
+ "reward": 1.7187657799571752,
79
+ "reward_std": 0.539798857551068,
80
+ "rewards/accuracy_reward": 0.48928573532029984,
81
+ "rewards/cosine_scaled_reward": 0.2462582775799092,
82
  "rewards/format_reward": 0.0,
83
+ "rewards/reasoning_steps_reward": 0.9832217764109373,
84
  "step": 50
85
  },
86
  {
87
+ "completion_length": 776.4894325256348,
88
  "epoch": 0.09276258575707798,
89
+ "grad_norm": 0.005032104484078714,
90
+ "kl": 0.03041839599609375,
91
  "learning_rate": 1.8461538461538465e-05,
92
+ "loss": 0.0012,
93
+ "reward": 1.7944712869822979,
94
+ "reward_std": 0.5402565439231694,
95
+ "rewards/accuracy_reward": 0.5172991305589676,
96
+ "rewards/cosine_scaled_reward": 0.2977078223892022,
97
  "rewards/format_reward": 0.0,
98
+ "rewards/reasoning_steps_reward": 0.979464340955019,
99
  "step": 60
100
  },
101
  {
102
+ "completion_length": 783.9421092987061,
103
  "epoch": 0.10822301671659097,
104
+ "grad_norm": 0.046440537921451495,
105
+ "kl": 0.18165512084960939,
106
  "learning_rate": 1.999634547413886e-05,
107
+ "loss": 0.0073,
108
+ "reward": 1.5861421424895525,
109
+ "reward_std": 0.7133004866540432,
110
+ "rewards/accuracy_reward": 0.46037948597222567,
111
+ "rewards/cosine_scaled_reward": 0.2454426669143686,
112
  "rewards/format_reward": 0.0,
113
+ "rewards/reasoning_steps_reward": 0.8803199872374534,
114
  "step": 70
115
  },
116
  {
117
+ "completion_length": 704.7571739196777,
118
  "epoch": 0.12368344767610397,
119
+ "grad_norm": 0.005638881154684646,
120
+ "kl": 0.14808197021484376,
121
  "learning_rate": 1.9967125291968495e-05,
122
+ "loss": 0.0059,
123
+ "reward": 1.770343079417944,
124
+ "reward_std": 0.6161688735242933,
125
+ "rewards/accuracy_reward": 0.5064732374623417,
126
+ "rewards/cosine_scaled_reward": 0.31033555960966624,
127
  "rewards/format_reward": 0.0,
128
+ "rewards/reasoning_steps_reward": 0.9535342697054148,
129
  "step": 80
130
  },
131
  {
132
+ "completion_length": 811.8497016906738,
133
  "epoch": 0.13914387863561697,
134
+ "grad_norm": 0.006277685603677439,
135
+ "kl": 0.1677825927734375,
136
  "learning_rate": 1.990877034074683e-05,
137
+ "loss": 0.0067,
138
+ "reward": 1.7127626728266478,
139
+ "reward_std": 0.5350183860398829,
140
+ "rewards/accuracy_reward": 0.4822544841095805,
141
+ "rewards/cosine_scaled_reward": 0.24494265783869196,
142
  "rewards/format_reward": 0.0,
143
+ "rewards/reasoning_steps_reward": 0.9855655211955309,
144
  "step": 90
145
  },
146
  {
147
+ "completion_length": 786.0161056518555,
148
  "epoch": 0.15460430959512997,
149
+ "grad_norm": 0.004638658867653336,
150
+ "kl": 0.20247802734375,
151
  "learning_rate": 1.9821451197042028e-05,
152
+ "loss": 0.0081,
153
+ "reward": 1.7241055637598037,
154
+ "reward_std": 0.6285460269078612,
155
+ "rewards/accuracy_reward": 0.4876116293948144,
156
+ "rewards/cosine_scaled_reward": 0.27760252499065247,
157
  "rewards/format_reward": 0.0,
158
+ "rewards/reasoning_steps_reward": 0.9588914208114148,
159
  "step": 100
160
  },
161
  {
162
  "epoch": 0.15460430959512997,
163
+ "eval_completion_length": 785.1983642578125,
164
+ "eval_kl": 0.1015625,
165
+ "eval_loss": 0.004128854256123304,
166
+ "eval_reward": 1.8587820827960968,
167
+ "eval_reward_std": 0.4679965078830719,
168
+ "eval_rewards/accuracy_reward": 0.5345982536673546,
169
+ "eval_rewards/cosine_scaled_reward": 0.33646056056022644,
170
  "eval_rewards/format_reward": 0.0,
171
+ "eval_rewards/reasoning_steps_reward": 0.9877232909202576,
172
+ "eval_runtime": 65.77,
173
+ "eval_samples_per_second": 1.505,
174
+ "eval_steps_per_second": 0.015,
175
  "step": 100
176
  },
177
  {
178
+ "completion_length": 795.4053916931152,
179
  "epoch": 0.17006474055464296,
180
+ "grad_norm": 0.0050662218091816385,
181
+ "kl": 0.20187530517578126,
182
  "learning_rate": 1.9705423102261324e-05,
183
+ "loss": 0.0081,
184
+ "reward": 1.7285974282771348,
185
+ "reward_std": 0.6808601895347237,
186
+ "rewards/accuracy_reward": 0.49296877197921274,
187
+ "rewards/cosine_scaled_reward": 0.296454501109838,
188
  "rewards/format_reward": 0.0,
189
+ "rewards/reasoning_steps_reward": 0.9391741570085287,
190
  "step": 110
191
  },
192
  {
193
+ "completion_length": 791.2351921081543,
194
  "epoch": 0.18552517151415596,
195
+ "grad_norm": 0.005966417830813838,
196
+ "kl": 0.222845458984375,
197
  "learning_rate": 1.956102521655831e-05,
198
+ "loss": 0.0089,
199
+ "reward": 1.7569476522505283,
200
+ "reward_std": 0.6371290137991309,
201
+ "rewards/accuracy_reward": 0.4906250220956281,
202
+ "rewards/cosine_scaled_reward": 0.30925412904762195,
203
  "rewards/format_reward": 0.0,
204
+ "rewards/reasoning_steps_reward": 0.9570684999227523,
205
  "step": 120
206
  },
207
  {
208
+ "completion_length": 778.9137634277344,
209
  "epoch": 0.20098560247366895,
210
+ "grad_norm": 0.11425551975396886,
211
+ "kl": 0.455560302734375,
212
  "learning_rate": 1.9388679627438486e-05,
213
+ "loss": 0.0182,
214
+ "reward": 1.6175578892230988,
215
+ "reward_std": 0.6627502014860511,
216
+ "rewards/accuracy_reward": 0.43537948445882646,
217
+ "rewards/cosine_scaled_reward": 0.24683610293641323,
218
  "rewards/format_reward": 0.0,
219
+ "rewards/reasoning_steps_reward": 0.9353423073887825,
220
  "step": 130
221
  },
222
  {
223
+ "completion_length": 671.9368595123291,
224
  "epoch": 0.21644603343318194,
225
+ "grad_norm": 0.012687310552666826,
226
+ "kl": 2.4836822509765626,
227
  "learning_rate": 1.9188890115960967e-05,
228
+ "loss": 0.0994,
229
+ "reward": 1.4270036322064699,
230
+ "reward_std": 0.739827654324472,
231
+ "rewards/accuracy_reward": 0.39709823183948173,
232
+ "rewards/cosine_scaled_reward": 0.2161776867986191,
233
  "rewards/format_reward": 0.0,
234
+ "rewards/reasoning_steps_reward": 0.8137277197092772,
235
  "step": 140
236
  },
237
  {
238
+ "completion_length": 703.5272651672364,
239
  "epoch": 0.23190646439269494,
240
+ "grad_norm": 0.023072548858575986,
241
+ "kl": 0.175665283203125,
242
  "learning_rate": 1.8962240684142923e-05,
243
+ "loss": 0.007,
244
+ "reward": 1.8304371915757656,
245
+ "reward_std": 0.5595470611006021,
246
+ "rewards/accuracy_reward": 0.5032366321422159,
247
+ "rewards/cosine_scaled_reward": 0.3409654124639928,
248
  "rewards/format_reward": 0.0,
249
+ "rewards/reasoning_steps_reward": 0.986235162243247,
250
  "step": 150
251
  },
252
  {
253
+ "completion_length": 755.7974658966065,
254
  "epoch": 0.24736689535220793,
255
+ "grad_norm": 0.0048724703817881404,
256
+ "kl": 0.1621673583984375,
257
  "learning_rate": 1.8709393847871146e-05,
258
+ "loss": 0.0065,
259
+ "reward": 1.8066862165927886,
260
+ "reward_std": 0.541509800683707,
261
+ "rewards/accuracy_reward": 0.4974330588709563,
262
+ "rewards/cosine_scaled_reward": 0.32353881540329893,
263
  "rewards/format_reward": 0.0,
264
+ "rewards/reasoning_steps_reward": 0.9857143286615611,
265
  "step": 160
266
  },
267
  {
268
+ "completion_length": 767.8857475280762,
269
  "epoch": 0.26282732631172095,
270
+ "grad_norm": 0.0058295760602797165,
271
+ "kl": 0.1024566650390625,
272
  "learning_rate": 1.8431088700310846e-05,
273
+ "loss": 0.0041,
274
+ "reward": 1.8246684893965721,
275
+ "reward_std": 0.6182428574189544,
276
+ "rewards/accuracy_reward": 0.5167410940863192,
277
+ "rewards/cosine_scaled_reward": 0.33218330084491754,
278
  "rewards/format_reward": 0.0,
279
+ "rewards/reasoning_steps_reward": 0.9757441036403179,
280
  "step": 170
281
  },
282
  {
283
+ "completion_length": 782.8031581878662,
284
  "epoch": 0.27828775727123395,
285
+ "grad_norm": 0.007260482392875092,
286
+ "kl": 0.133380126953125,
287
  "learning_rate": 1.8128138751472432e-05,
288
+ "loss": 0.0053,
289
+ "reward": 1.6873359650373458,
290
+ "reward_std": 0.7230455877259374,
291
+ "rewards/accuracy_reward": 0.46573662804439664,
292
+ "rewards/cosine_scaled_reward": 0.2658701150892739,
293
  "rewards/format_reward": 0.0,
294
+ "rewards/reasoning_steps_reward": 0.955729215592146,
295
  "step": 180
296
  },
297
  {
298
+ "completion_length": 770.7354141235352,
299
  "epoch": 0.29374818823074694,
300
+ "grad_norm": 0.0038766706896374765,
301
+ "kl": 0.084027099609375,
302
  "learning_rate": 1.780142955025139e-05,
303
+ "loss": 0.0034,
304
+ "reward": 1.8208528086543083,
305
+ "reward_std": 0.6158834310248494,
306
+ "rewards/accuracy_reward": 0.5102678800933063,
307
+ "rewards/cosine_scaled_reward": 0.3412768360443579,
308
  "rewards/format_reward": 0.0,
309
+ "rewards/reasoning_steps_reward": 0.969308077916503,
310
  "step": 190
311
  },
312
  {
313
+ "completion_length": 777.9120876312256,
314
  "epoch": 0.30920861919025994,
315
+ "grad_norm": 0.004081871056913627,
316
+ "kl": 0.079278564453125,
317
  "learning_rate": 1.745191609589231e-05,
318
+ "loss": 0.0032,
319
+ "reward": 1.8799906723201274,
320
+ "reward_std": 0.6350350034423172,
321
+ "rewards/accuracy_reward": 0.5420759165659547,
322
+ "rewards/cosine_scaled_reward": 0.36834625932970083,
323
  "rewards/format_reward": 0.0,
324
+ "rewards/reasoning_steps_reward": 0.9695684995502234,
325
  "step": 200
326
  },
327
  {
328
  "epoch": 0.30920861919025994,
329
+ "eval_completion_length": 786.5066223144531,
330
+ "eval_kl": 0.080078125,
331
+ "eval_loss": 0.0031854985281825066,
332
+ "eval_reward": 1.7613219320774078,
333
+ "eval_reward_std": 0.6763340681791306,
334
+ "eval_rewards/accuracy_reward": 0.4933036044239998,
335
+ "eval_rewards/cosine_scaled_reward": 0.3011283501982689,
336
  "eval_rewards/format_reward": 0.0,
337
+ "eval_rewards/reasoning_steps_reward": 0.9668899178504944,
338
+ "eval_runtime": 67.4733,
339
+ "eval_samples_per_second": 1.467,
340
+ "eval_steps_per_second": 0.015,
341
  "step": 200
342
  },
343
  {
344
+ "completion_length": 762.9672210693359,
345
  "epoch": 0.32466905014977293,
346
+ "grad_norm": 0.0045327243820408964,
347
+ "kl": 0.0857818603515625,
348
  "learning_rate": 1.7080620046443503e-05,
349
+ "loss": 0.0034,
350
+ "reward": 1.8360209584236145,
351
+ "reward_std": 0.6304899661801755,
352
+ "rewards/accuracy_reward": 0.5189732388593257,
353
+ "rewards/cosine_scaled_reward": 0.3513110678992234,
354
  "rewards/format_reward": 0.0,
355
+ "rewards/reasoning_steps_reward": 0.9657366566359997,
356
  "step": 210
357
  },
358
  {
359
+ "completion_length": 740.6268199920654,
360
  "epoch": 0.3401294811092859,
361
+ "grad_norm": 0.40798247676236865,
362
+ "kl": 0.09603729248046874,
363
  "learning_rate": 1.6688626732362192e-05,
364
+ "loss": 0.0038,
365
+ "reward": 1.8989367991685868,
366
+ "reward_std": 0.6170632224529982,
367
+ "rewards/accuracy_reward": 0.541183059476316,
368
+ "rewards/cosine_scaled_reward": 0.3866971510913572,
369
  "rewards/format_reward": 0.0,
370
+ "rewards/reasoning_steps_reward": 0.9710565954446793,
371
  "step": 220
372
  },
373
  {
374
+ "completion_length": 745.6220226287842,
375
  "epoch": 0.3555899120687989,
376
+ "grad_norm": 0.009310955589968223,
377
+ "kl": 0.17754974365234374,
378
  "learning_rate": 1.6277081983999742e-05,
379
+ "loss": 0.0071,
380
+ "reward": 1.9535415962338447,
381
+ "reward_std": 0.5657559703569859,
382
+ "rewards/accuracy_reward": 0.5494419884867966,
383
+ "rewards/cosine_scaled_reward": 0.4263093855464831,
384
  "rewards/format_reward": 0.0,
385
+ "rewards/reasoning_steps_reward": 0.9777902279049158,
386
  "step": 230
387
  },
388
  {
389
+ "completion_length": 754.8473545074463,
390
  "epoch": 0.3710503430283119,
391
+ "grad_norm": 0.009032184745149096,
392
+ "kl": 0.1623504638671875,
393
  "learning_rate": 1.5847188782240473e-05,
394
+ "loss": 0.0065,
395
+ "reward": 1.8752706520259381,
396
+ "reward_std": 0.6476909777149558,
397
+ "rewards/accuracy_reward": 0.5162946661002934,
398
+ "rewards/cosine_scaled_reward": 0.3971455840044655,
399
  "rewards/format_reward": 0.0,
400
+ "rewards/reasoning_steps_reward": 0.9618303928524256,
401
  "step": 240
402
  },
403
  {
404
+ "completion_length": 767.3316184997559,
405
  "epoch": 0.3865107739878249,
406
+ "grad_norm": 0.006074054783900294,
407
+ "kl": 0.1158416748046875,
408
  "learning_rate": 1.5400203742084508e-05,
409
+ "loss": 0.0046,
410
+ "reward": 1.8485381975769997,
411
+ "reward_std": 0.6796474339440465,
412
+ "rewards/accuracy_reward": 0.5156250222586095,
413
+ "rewards/cosine_scaled_reward": 0.3913580739754252,
414
  "rewards/format_reward": 0.0,
415
+ "rewards/reasoning_steps_reward": 0.9415550928562879,
416
  "step": 250
417
  },
418
  {
419
+ "completion_length": 740.9466835021973,
420
  "epoch": 0.4019712049473379,
421
+ "grad_norm": 0.004612552363152663,
422
+ "kl": 0.10526580810546875,
423
  "learning_rate": 1.4937433439453465e-05,
424
+ "loss": 0.0042,
425
+ "reward": 1.834777297079563,
426
+ "reward_std": 0.694879194535315,
427
+ "rewards/accuracy_reward": 0.5040178820490837,
428
+ "rewards/cosine_scaled_reward": 0.38555847499519585,
429
  "rewards/format_reward": 0.0,
430
+ "rewards/reasoning_steps_reward": 0.9452009297907352,
431
  "step": 260
432
  },
433
  {
434
+ "completion_length": 769.4490287780761,
435
  "epoch": 0.4174316359068509,
436
+ "grad_norm": 0.005166739754892184,
437
+ "kl": 0.122613525390625,
438
  "learning_rate": 1.4460230591956097e-05,
439
+ "loss": 0.0049,
440
+ "reward": 1.8051817450672387,
441
+ "reward_std": 0.7667457018047571,
442
+ "rewards/accuracy_reward": 0.5031250216066837,
443
+ "rewards/cosine_scaled_reward": 0.3666772120282985,
444
  "rewards/format_reward": 0.0,
445
+ "rewards/reasoning_steps_reward": 0.9353795044124127,
446
  "step": 270
447
  },
448
  {
449
+ "completion_length": 756.1934505462647,
450
  "epoch": 0.4328920668663639,
451
+ "grad_norm": 0.004779328317174938,
452
+ "kl": 0.118280029296875,
453
  "learning_rate": 1.3969990104777712e-05,
454
+ "loss": 0.0047,
455
+ "reward": 1.835938386246562,
456
+ "reward_std": 0.6989197930321097,
457
+ "rewards/accuracy_reward": 0.5044643082190305,
458
+ "rewards/cosine_scaled_reward": 0.3808415879495442,
459
  "rewards/format_reward": 0.0,
460
+ "rewards/reasoning_steps_reward": 0.9506324753165245,
461
  "step": 280
462
  },
463
  {
464
+ "completion_length": 753.9099658966064,
465
  "epoch": 0.4483524978258769,
466
+ "grad_norm": 0.006205432336241612,
467
+ "kl": 0.12601318359375,
468
  "learning_rate": 1.3468144993251735e-05,
469
+ "loss": 0.005,
470
+ "reward": 1.8052862711250781,
471
+ "reward_std": 0.6413127107545733,
472
+ "rewards/accuracy_reward": 0.47890627244487405,
473
+ "rewards/cosine_scaled_reward": 0.3571091307036113,
474
  "rewards/format_reward": 0.0,
475
+ "rewards/reasoning_steps_reward": 0.9692708767950535,
476
  "step": 290
477
  },
478
  {
479
+ "completion_length": 766.8500350952148,
480
  "epoch": 0.4638129287853899,
481
+ "grad_norm": 0.005053729003460748,
482
+ "kl": 0.1371002197265625,
483
  "learning_rate": 1.295616219403197e-05,
484
+ "loss": 0.0055,
485
+ "reward": 1.7713046602904796,
486
+ "reward_std": 0.6539058156311512,
487
+ "rewards/accuracy_reward": 0.4574776992201805,
488
+ "rewards/cosine_scaled_reward": 0.34656501180725174,
489
  "rewards/format_reward": 0.0,
490
+ "rewards/reasoning_steps_reward": 0.967261953279376,
491
  "step": 300
492
  },
493
  {
494
  "epoch": 0.4638129287853899,
495
+ "eval_completion_length": 725.3372497558594,
496
+ "eval_kl": 0.125732421875,
497
+ "eval_loss": 0.005167535971850157,
498
+ "eval_reward": 1.8545046150684357,
499
+ "eval_reward_std": 0.5993074476718903,
500
+ "eval_rewards/accuracy_reward": 0.4888393133878708,
501
+ "eval_rewards/cosine_scaled_reward": 0.3980313614010811,
502
  "eval_rewards/format_reward": 0.0,
503
+ "eval_rewards/reasoning_steps_reward": 0.9676340073347092,
504
+ "eval_runtime": 63.1453,
505
+ "eval_samples_per_second": 1.568,
506
+ "eval_steps_per_second": 0.016,
507
  "step": 300
508
  },
509
  {
510
+ "completion_length": 738.0375347137451,
511
  "epoch": 0.47927335974490287,
512
+ "grad_norm": 0.004708932814585316,
513
+ "kl": 0.128253173828125,
514
  "learning_rate": 1.2435538277109919e-05,
515
+ "loss": 0.0051,
516
+ "reward": 1.776976404339075,
517
+ "reward_std": 0.6543458372354507,
518
+ "rewards/accuracy_reward": 0.4662946649361402,
519
+ "rewards/cosine_scaled_reward": 0.35774270847914524,
520
  "rewards/format_reward": 0.0,
521
+ "rewards/reasoning_steps_reward": 0.9529390316456556,
522
  "step": 310
523
  },
524
  {
525
+ "completion_length": 730.0644290924072,
526
  "epoch": 0.49473379070441587,
527
+ "grad_norm": 0.006404744910772637,
528
+ "kl": 0.12236328125,
529
  "learning_rate": 1.19077950712113e-05,
530
+ "loss": 0.0049,
531
+ "reward": 1.8439508713781834,
532
+ "reward_std": 0.6846362385898829,
533
+ "rewards/accuracy_reward": 0.500669667404145,
534
+ "rewards/cosine_scaled_reward": 0.3922022982500494,
535
  "rewards/format_reward": 0.0,
536
+ "rewards/reasoning_steps_reward": 0.9510789047926664,
537
  "step": 320
538
  },
539
  {
540
+ "completion_length": 733.6488037109375,
541
  "epoch": 0.5101942216639289,
542
+ "grad_norm": 0.005310241047036926,
543
+ "kl": 0.1285675048828125,
544
  "learning_rate": 1.137447521535908e-05,
545
+ "loss": 0.0051,
546
+ "reward": 1.8017703101038933,
547
+ "reward_std": 0.670677787438035,
548
+ "rewards/accuracy_reward": 0.46941966488957404,
549
+ "rewards/cosine_scaled_reward": 0.3702226262510521,
550
  "rewards/format_reward": 0.0,
551
+ "rewards/reasoning_steps_reward": 0.9621280215680599,
552
  "step": 330
553
  },
554
  {
555
+ "completion_length": 740.5896522521973,
556
  "epoch": 0.5256546526234419,
557
+ "grad_norm": 0.004911848589025536,
558
+ "kl": 0.125958251953125,
559
  "learning_rate": 1.0837137649606241e-05,
560
+ "loss": 0.005,
561
+ "reward": 1.8196691133081913,
562
+ "reward_std": 0.6627934613265097,
563
+ "rewards/accuracy_reward": 0.4854910961352289,
564
+ "rewards/cosine_scaled_reward": 0.37692351534496993,
565
  "rewards/format_reward": 0.0,
566
+ "rewards/reasoning_steps_reward": 0.957254507765174,
567
  "step": 340
568
  },
569
  {
570
+ "completion_length": 733.2659954071045,
571
  "epoch": 0.5411150835829549,
572
+ "grad_norm": 0.009426685166535624,
573
+ "kl": 0.1310546875,
574
  "learning_rate": 1.0297353058119209e-05,
575
+ "loss": 0.0052,
576
+ "reward": 1.7875644348561763,
577
+ "reward_std": 0.6663354218006134,
578
+ "rewards/accuracy_reward": 0.46261162832379343,
579
+ "rewards/cosine_scaled_reward": 0.36353164007887245,
580
  "rewards/format_reward": 0.0,
581
+ "rewards/reasoning_steps_reward": 0.961421174928546,
582
  "step": 350
583
  },
584
  {
585
+ "completion_length": 755.5462394714356,
586
  "epoch": 0.5565755145424679,
587
+ "grad_norm": 0.005204829040206616,
588
+ "kl": 0.14141845703125,
589
  "learning_rate": 9.756699277932196e-06,
590
+ "loss": 0.0057,
591
+ "reward": 1.7464446134865284,
592
+ "reward_std": 0.6827127303928137,
593
+ "rewards/accuracy_reward": 0.43928573140874505,
594
+ "rewards/cosine_scaled_reward": 0.3423150799470022,
595
  "rewards/format_reward": 0.0,
596
+ "rewards/reasoning_steps_reward": 0.9648437988013029,
597
  "step": 360
598
  },
599
  {
600
+ "completion_length": 738.9675567626953,
601
  "epoch": 0.5720359455019809,
602
+ "grad_norm": 0.0050950433186417,
603
+ "kl": 0.133477783203125,
604
  "learning_rate": 9.216756686793163e-06,
605
+ "loss": 0.0053,
606
+ "reward": 1.7593348406255245,
607
+ "reward_std": 0.7046971999108791,
608
+ "rewards/accuracy_reward": 0.4560268087312579,
609
+ "rewards/cosine_scaled_reward": 0.35353119419887663,
610
  "rewards/format_reward": 0.0,
611
+ "rewards/reasoning_steps_reward": 0.9497768227010965,
612
  "step": 370
613
  },
614
  {
615
+ "completion_length": 715.9590724945068,
616
  "epoch": 0.5874963764614939,
617
+ "grad_norm": 0.005868130396446593,
618
+ "kl": 0.1201171875,
619
  "learning_rate": 8.67910358358298e-06,
620
+ "loss": 0.0048,
621
+ "reward": 1.8290306769311429,
622
+ "reward_std": 0.7089241919107735,
623
+ "rewards/accuracy_reward": 0.4906250239349902,
624
+ "rewards/cosine_scaled_reward": 0.3883312027202919,
625
  "rewards/format_reward": 0.0,
626
+ "rewards/reasoning_steps_reward": 0.9500744428485632,
627
  "step": 380
628
  },
629
  {
630
+ "completion_length": 758.1067291259766,
631
  "epoch": 0.6029568074210069,
632
+ "grad_norm": 0.005528799006616127,
633
+ "kl": 0.1315093994140625,
634
  "learning_rate": 8.145311574811325e-06,
635
+ "loss": 0.0053,
636
+ "reward": 1.6966661393642426,
637
+ "reward_std": 0.7609130211174489,
638
+ "rewards/accuracy_reward": 0.45424109399318696,
639
+ "rewards/cosine_scaled_reward": 0.32028957750299014,
640
  "rewards/format_reward": 0.0,
641
+ "rewards/reasoning_steps_reward": 0.9221354588866234,
642
  "step": 390
643
  },
644
  {
645
+ "completion_length": 731.6211277008057,
646
  "epoch": 0.6184172383805199,
647
+ "grad_norm": 0.006163761009715641,
648
+ "kl": 0.130072021484375,
649
  "learning_rate": 7.616940980675004e-06,
650
+ "loss": 0.0052,
651
+ "reward": 1.7418419629335404,
652
+ "reward_std": 0.7564100152812898,
653
+ "rewards/accuracy_reward": 0.46339287841692567,
654
+ "rewards/cosine_scaled_reward": 0.34221391292085174,
655
  "rewards/format_reward": 0.0,
656
+ "rewards/reasoning_steps_reward": 0.9362351588904858,
657
  "step": 400
658
  },
659
  {
660
  "epoch": 0.6184172383805199,
661
+ "eval_completion_length": 721.2921142578125,
662
+ "eval_kl": 0.14404296875,
663
+ "eval_loss": 0.005833905190229416,
664
+ "eval_reward": 1.8010995388031006,
665
+ "eval_reward_std": 0.79125015437603,
666
+ "eval_rewards/accuracy_reward": 0.4899553880095482,
667
+ "eval_rewards/cosine_scaled_reward": 0.3773643299937248,
668
  "eval_rewards/format_reward": 0.0,
669
+ "eval_rewards/reasoning_steps_reward": 0.9337798058986664,
670
+ "eval_runtime": 64.0844,
671
+ "eval_samples_per_second": 1.545,
672
+ "eval_steps_per_second": 0.016,
673
  "step": 400
674
  },
675
  {
676
+ "completion_length": 730.9440063476562,
677
  "epoch": 0.6338776693400329,
678
+ "grad_norm": 0.007004458345967938,
679
+ "kl": 0.1326690673828125,
680
  "learning_rate": 7.095536274107046e-06,
681
+ "loss": 0.0053,
682
+ "reward": 1.7348041359335185,
683
+ "reward_std": 0.7573289098218083,
684
+ "rewards/accuracy_reward": 0.46227680711308494,
685
+ "rewards/cosine_scaled_reward": 0.3395287758205086,
686
  "rewards/format_reward": 0.0,
687
+ "rewards/reasoning_steps_reward": 0.93299855068326,
688
  "step": 410
689
  },
690
  {
691
+ "completion_length": 737.4034927368164,
692
  "epoch": 0.6493381002995459,
693
+ "grad_norm": 0.006302023763263314,
694
+ "kl": 0.1422760009765625,
695
  "learning_rate": 6.58262156614881e-06,
696
+ "loss": 0.0057,
697
+ "reward": 1.7033680249005556,
698
+ "reward_std": 0.7371486462652683,
699
+ "rewards/accuracy_reward": 0.43750002002343535,
700
+ "rewards/cosine_scaled_reward": 0.32375487285316923,
701
  "rewards/format_reward": 0.0,
702
+ "rewards/reasoning_steps_reward": 0.942113135010004,
703
  "step": 420
704
  },
705
  {
706
+ "completion_length": 756.3276016235352,
707
  "epoch": 0.6647985312590589,
708
+ "grad_norm": 0.008166583966853302,
709
+ "kl": 0.149725341796875,
710
  "learning_rate": 6.079696150841634e-06,
711
+ "loss": 0.006,
712
+ "reward": 1.6697823703289032,
713
+ "reward_std": 0.7648335263133049,
714
+ "rewards/accuracy_reward": 0.4290178781375289,
715
+ "rewards/cosine_scaled_reward": 0.30843557265470734,
716
  "rewards/format_reward": 0.0,
717
+ "rewards/reasoning_steps_reward": 0.9323289088904858,
718
  "step": 430
719
  },
720
  {
721
+ "completion_length": 711.6224662780762,
722
  "epoch": 0.6802589622185718,
723
+ "grad_norm": 0.006101275201994206,
724
+ "kl": 0.149908447265625,
725
  "learning_rate": 5.588230122660672e-06,
726
+ "loss": 0.006,
727
+ "reward": 1.710378536581993,
728
+ "reward_std": 0.7376122187823058,
729
+ "rewards/accuracy_reward": 0.43995537869632245,
730
+ "rewards/cosine_scaled_reward": 0.3315466307423776,
731
  "rewards/format_reward": 0.0,
732
+ "rewards/reasoning_steps_reward": 0.9388765264302492,
733
  "step": 440
734
  },
735
  {
736
+ "completion_length": 720.1637599945068,
737
  "epoch": 0.6957193931780848,
738
+ "grad_norm": 0.00827738551813243,
739
+ "kl": 0.1536865234375,
740
  "learning_rate": 5.109660079301668e-06,
741
+ "loss": 0.0061,
742
+ "reward": 1.7479658477008342,
743
+ "reward_std": 0.7545963631942868,
744
+ "rewards/accuracy_reward": 0.45546877263113855,
745
+ "rewards/cosine_scaled_reward": 0.3493794774003618,
746
  "rewards/format_reward": 0.0,
747
+ "rewards/reasoning_steps_reward": 0.9431175928562879,
748
  "step": 450
749
  },
750
  {
751
+ "completion_length": 719.8788265228271,
752
  "epoch": 0.7111798241375978,
753
+ "grad_norm": 0.009020213190404006,
754
+ "kl": 0.146099853515625,
755
  "learning_rate": 4.64538492238166e-06,
756
+ "loss": 0.0058,
757
+ "reward": 1.761041846126318,
758
+ "reward_std": 0.7622619468718768,
759
+ "rewards/accuracy_reward": 0.46506698690354825,
760
+ "rewards/cosine_scaled_reward": 0.3563170699868351,
761
  "rewards/format_reward": 0.0,
762
+ "rewards/reasoning_steps_reward": 0.9396577756851912,
763
  "step": 460
764
  },
765
  {
766
+ "completion_length": 715.616215133667,
767
  "epoch": 0.7266402550971108,
768
+ "grad_norm": 0.009535640148387967,
769
+ "kl": 0.1488037109375,
770
  "learning_rate": 4.196761768328599e-06,
771
+ "loss": 0.006,
772
+ "reward": 1.7519984051585198,
773
+ "reward_std": 0.7264958534389734,
774
+ "rewards/accuracy_reward": 0.45613841600716115,
775
+ "rewards/cosine_scaled_reward": 0.35062185342776503,
776
  "rewards/format_reward": 0.0,
777
+ "rewards/reasoning_steps_reward": 0.9452381365001201,
778
  "step": 470
779
  },
780
  {
781
+ "completion_length": 733.5050575256348,
782
  "epoch": 0.7421006860566238,
783
+ "grad_norm": 0.009375726733768255,
784
+ "kl": 0.1457244873046875,
785
  "learning_rate": 3.7651019814126656e-06,
786
+ "loss": 0.0058,
787
+ "reward": 1.7320308901369572,
788
+ "reward_std": 0.753149107657373,
789
+ "rewards/accuracy_reward": 0.45301341358572245,
790
+ "rewards/cosine_scaled_reward": 0.3402525488520041,
791
  "rewards/format_reward": 0.0,
792
+ "rewards/reasoning_steps_reward": 0.9387649200856686,
793
  "step": 480
794
  },
795
  {
796
+ "completion_length": 725.7989181518554,
797
  "epoch": 0.7575611170161368,
798
+ "grad_norm": 0.006465310695991136,
799
+ "kl": 0.1441925048828125,
800
  "learning_rate": 3.3516673405151546e-06,
801
+ "loss": 0.0058,
802
+ "reward": 1.7133542537689208,
803
+ "reward_std": 0.7624470146372915,
804
+ "rewards/accuracy_reward": 0.4443080571014434,
805
+ "rewards/cosine_scaled_reward": 0.3332202689955011,
806
  "rewards/format_reward": 0.0,
807
+ "rewards/reasoning_steps_reward": 0.9358259223401546,
808
  "step": 490
809
  },
810
  {
811
+ "completion_length": 732.0432247161865,
812
  "epoch": 0.7730215479756498,
813
+ "grad_norm": 0.006663296056320388,
814
+ "kl": 0.1493438720703125,
815
  "learning_rate": 2.957666350839663e-06,
816
+ "loss": 0.006,
817
+ "reward": 1.7120833061635494,
818
+ "reward_std": 0.7427917202934623,
819
+ "rewards/accuracy_reward": 0.44140627095475793,
820
+ "rewards/cosine_scaled_reward": 0.3323585350837675,
821
  "rewards/format_reward": 0.0,
822
+ "rewards/reasoning_steps_reward": 0.9383184887468815,
823
  "step": 500
824
  },
825
  {
826
  "epoch": 0.7730215479756498,
827
+ "eval_completion_length": 724.1022491455078,
828
+ "eval_kl": 0.14892578125,
829
+ "eval_loss": 0.006081230938434601,
830
+ "eval_reward": 1.7919847667217255,
831
+ "eval_reward_std": 0.7341814786195755,
832
+ "eval_rewards/accuracy_reward": 0.474330373108387,
833
+ "eval_rewards/cosine_scaled_reward": 0.3756899982690811,
834
  "eval_rewards/format_reward": 0.0,
835
+ "eval_rewards/reasoning_steps_reward": 0.9419643133878708,
836
+ "eval_runtime": 62.9452,
837
+ "eval_samples_per_second": 1.573,
838
+ "eval_steps_per_second": 0.016,
839
  "step": 500
840
  },
841
  {
842
+ "completion_length": 719.1628688812256,
843
  "epoch": 0.7884819789351628,
844
+ "grad_norm": 0.026910990374737,
845
+ "kl": 0.1684112548828125,
846
  "learning_rate": 2.5842507113469307e-06,
847
+ "loss": 0.0067,
848
+ "reward": 1.6821819383651018,
849
+ "reward_std": 0.7549204783514142,
850
+ "rewards/accuracy_reward": 0.42008930565789343,
851
+ "rewards/cosine_scaled_reward": 0.3171893151884433,
852
  "rewards/format_reward": 0.0,
853
+ "rewards/reasoning_steps_reward": 0.9449033126235008,
854
  "step": 510
855
  },
856
  {
857
+ "completion_length": 703.1540473937988,
858
  "epoch": 0.8039424098946758,
859
+ "grad_norm": 0.029497387730934427,
860
+ "kl": 0.1495452880859375,
861
  "learning_rate": 2.2325119482391466e-06,
862
+ "loss": 0.006,
863
+ "reward": 1.7537529528141023,
864
+ "reward_std": 0.7176604120060801,
865
+ "rewards/accuracy_reward": 0.44877234250307085,
866
+ "rewards/cosine_scaled_reward": 0.3511859173071571,
867
  "rewards/format_reward": 0.0,
868
+ "rewards/reasoning_steps_reward": 0.9537946797907353,
869
  "step": 520
870
  },
871
  {
872
+ "completion_length": 715.8909927368164,
873
  "epoch": 0.8194028408541888,
874
+ "grad_norm": 0.006911653084698067,
875
+ "kl": 0.1466156005859375,
876
  "learning_rate": 1.9034782243345074e-06,
877
+ "loss": 0.0059,
878
+ "reward": 1.7353017818182708,
879
+ "reward_std": 0.7042613643221557,
880
+ "rewards/accuracy_reward": 0.4434151995461434,
881
+ "rewards/cosine_scaled_reward": 0.34125408774707466,
882
  "rewards/format_reward": 0.0,
883
+ "rewards/reasoning_steps_reward": 0.9506324734538794,
884
  "step": 530
885
  },
886
  {
887
+ "completion_length": 731.9873096466065,
888
  "epoch": 0.8348632718137018,
889
+ "grad_norm": 0.10031774065756535,
890
+ "kl": 0.165179443359375,
891
  "learning_rate": 1.5981113336584041e-06,
892
+ "loss": 0.0066,
893
+ "reward": 1.720738895609975,
894
+ "reward_std": 0.7829023336991667,
895
+ "rewards/accuracy_reward": 0.44453127147862687,
896
+ "rewards/cosine_scaled_reward": 0.33692186851403677,
897
  "rewards/format_reward": 0.0,
898
+ "rewards/reasoning_steps_reward": 0.9392857551574707,
899
  "step": 540
900
  },
901
  {
902
+ "completion_length": 726.0302787780762,
903
  "epoch": 0.8503237027732148,
904
+ "grad_norm": 0.00915840041448343,
905
+ "kl": 0.1617706298828125,
906
  "learning_rate": 1.3173038900362977e-06,
907
+ "loss": 0.0065,
908
+ "reward": 1.7284724555909634,
909
+ "reward_std": 0.7755123546347023,
910
+ "rewards/accuracy_reward": 0.4477678783237934,
911
+ "rewards/cosine_scaled_reward": 0.34402298720087854,
912
  "rewards/format_reward": 0.0,
913
+ "rewards/reasoning_steps_reward": 0.9366815879940986,
914
  "step": 550
915
  },
916
  {
917
+ "completion_length": 716.5763721466064,
918
  "epoch": 0.8657841337327278,
919
+ "grad_norm": 0.0077065633985853085,
920
+ "kl": 0.151544189453125,
921
  "learning_rate": 1.0618767179063416e-06,
922
+ "loss": 0.0061,
923
+ "reward": 1.7493106886744498,
924
+ "reward_std": 0.7468110140413046,
925
+ "rewards/accuracy_reward": 0.45625002135057,
926
+ "rewards/cosine_scaled_reward": 0.3529192515881732,
927
  "rewards/format_reward": 0.0,
928
+ "rewards/reasoning_steps_reward": 0.9401414062827825,
929
  "step": 560
930
  },
931
  {
932
+ "completion_length": 711.7772666931153,
933
  "epoch": 0.8812445646922408,
934
+ "grad_norm": 0.011223015630773887,
935
+ "kl": 0.1598358154296875,
936
  "learning_rate": 8.325764529785851e-07,
937
+ "loss": 0.0064,
938
+ "reward": 1.7419822074472904,
939
+ "reward_std": 0.7288113379850983,
940
+ "rewards/accuracy_reward": 0.45122770036105064,
941
+ "rewards/cosine_scaled_reward": 0.34804613249725663,
942
  "rewards/format_reward": 0.0,
943
+ "rewards/reasoning_steps_reward": 0.9427083749324083,
944
  "step": 570
945
  },
946
  {
947
+ "completion_length": 717.4036037445069,
948
  "epoch": 0.8967049956517538,
949
+ "grad_norm": 0.01473136538282227,
950
+ "kl": 0.1699462890625,
951
  "learning_rate": 6.300733597542086e-07,
952
+ "loss": 0.0068,
953
+ "reward": 1.7380871541798115,
954
+ "reward_std": 0.7284659473225474,
955
+ "rewards/accuracy_reward": 0.4454241285100579,
956
+ "rewards/cosine_scaled_reward": 0.3448207150679082,
957
  "rewards/format_reward": 0.0,
958
+ "rewards/reasoning_steps_reward": 0.9478422913700342,
959
  "step": 580
960
  },
961
  {
962
+ "completion_length": 722.4015926361084,
963
  "epoch": 0.9121654266112668,
964
+ "grad_norm": 0.015247562461578802,
965
+ "kl": 0.1722503662109375,
966
  "learning_rate": 4.549593722844492e-07,
967
+ "loss": 0.0069,
968
+ "reward": 1.7376306042075158,
969
+ "reward_std": 0.7329583563841879,
970
+ "rewards/accuracy_reward": 0.4400669841095805,
971
+ "rewards/cosine_scaled_reward": 0.34566624723374845,
972
  "rewards/format_reward": 0.0,
973
+ "rewards/reasoning_steps_reward": 0.951897357404232,
974
  "step": 590
975
  },
976
  {
977
+ "completion_length": 719.5832901000977,
978
  "epoch": 0.9276258575707798,
979
+ "grad_norm": 0.008595325912121869,
980
+ "kl": 0.1673126220703125,
981
  "learning_rate": 3.0774636389618196e-07,
982
+ "loss": 0.0067,
983
+ "reward": 1.7701299749314785,
984
+ "reward_std": 0.7306436906568706,
985
+ "rewards/accuracy_reward": 0.4577009153552353,
986
+ "rewards/cosine_scaled_reward": 0.360122480080463,
987
  "rewards/format_reward": 0.0,
988
+ "rewards/reasoning_steps_reward": 0.9523065883666277,
989
  "step": 600
990
  },
991
  {
992
  "epoch": 0.9276258575707798,
993
+ "eval_completion_length": 705.7041168212891,
994
+ "eval_kl": 0.164794921875,
995
+ "eval_loss": 0.006647891830652952,
996
+ "eval_reward": 1.8423524498939514,
997
+ "eval_reward_std": 0.6980961859226227,
998
+ "eval_rewards/accuracy_reward": 0.4832589626312256,
999
+ "eval_rewards/cosine_scaled_reward": 0.39815596491098404,
1000
  "eval_rewards/format_reward": 0.0,
1001
+ "eval_rewards/reasoning_steps_reward": 0.9609375596046448,
1002
+ "eval_runtime": 63.3214,
1003
+ "eval_samples_per_second": 1.563,
1004
+ "eval_steps_per_second": 0.016,
1005
  "step": 600
1006
  },
1007
  {
1008
+ "completion_length": 719.9855236053467,
1009
  "epoch": 0.9430862885302927,
1010
+ "grad_norm": 0.014629584328010486,
1011
+ "kl": 0.17073974609375,
1012
  "learning_rate": 1.8886465094192895e-07,
1013
+ "loss": 0.0068,
1014
+ "reward": 1.7343647606670856,
1015
+ "reward_std": 0.7088968453928828,
1016
+ "rewards/accuracy_reward": 0.4390625214669853,
1017
+ "rewards/cosine_scaled_reward": 0.3427352339422214,
1018
  "rewards/format_reward": 0.0,
1019
+ "rewards/reasoning_steps_reward": 0.9525669939815998,
1020
  "step": 610
1021
  },
1022
  {
1023
+ "completion_length": 721.9812828063965,
1024
  "epoch": 0.9585467194898057,
1025
+ "grad_norm": 0.020088112225356343,
1026
+ "kl": 0.1849456787109375,
1027
  "learning_rate": 9.866173494794462e-08,
1028
+ "loss": 0.0074,
1029
+ "reward": 1.7370413817465304,
1030
+ "reward_std": 0.7334370331838727,
1031
+ "rewards/accuracy_reward": 0.44151787713635715,
1032
+ "rewards/cosine_scaled_reward": 0.34656511796929407,
1033
  "rewards/format_reward": 0.0,
1034
+ "rewards/reasoning_steps_reward": 0.9489583697170019,
1035
  "step": 620
1036
  },
1037
  {
1038
+ "completion_length": 724.9088500976562,
1039
  "epoch": 0.9740071504493187,
1040
+ "grad_norm": 0.009230798738629389,
1041
+ "kl": 0.179193115234375,
1042
  "learning_rate": 3.7401286837214224e-08,
1043
+ "loss": 0.0072,
1044
+ "reward": 1.7149522617459296,
1045
+ "reward_std": 0.740879999101162,
1046
+ "rewards/accuracy_reward": 0.43069198355078697,
1047
+ "rewards/cosine_scaled_reward": 0.3343346292153001,
1048
  "rewards/format_reward": 0.0,
1049
+ "rewards/reasoning_steps_reward": 0.9499256368726492,
1050
  "step": 630
1051
  },
1052
  {
1053
+ "completion_length": 733.0805023193359,
1054
  "epoch": 0.9894675814088317,
1055
+ "grad_norm": 0.013971562093972711,
1056
+ "kl": 0.177264404296875,
1057
  "learning_rate": 5.262376196544239e-09,
1058
+ "loss": 0.0071,
1059
+ "reward": 1.6887946531176568,
1060
+ "reward_std": 0.7455704480409622,
1061
+ "rewards/accuracy_reward": 0.4194196627475321,
1062
+ "rewards/cosine_scaled_reward": 0.3205654217163101,
1063
  "rewards/format_reward": 0.0,
1064
+ "rewards/reasoning_steps_reward": 0.9488095600157976,
1065
  "step": 640
1066
  },
1067
  {
1068
+ "completion_length": 726.9589246114095,
1069
  "epoch": 0.9987438399845395,
1070
+ "kl": 0.1743927001953125,
1071
+ "reward": 1.7412781628469627,
1072
+ "reward_std": 0.7270878640313944,
1073
+ "rewards/accuracy_reward": 0.444568472293516,
1074
+ "rewards/cosine_scaled_reward": 0.34755290367562947,
1075
  "rewards/format_reward": 0.0,
1076
+ "rewards/reasoning_steps_reward": 0.9491567853838205,
1077
  "step": 646,
1078
  "total_flos": 0.0,
1079
+ "train_loss": 0.007009532302370239,
1080
+ "train_runtime": 74639.7368,
1081
+ "train_samples_per_second": 0.971,
1082
+ "train_steps_per_second": 0.009
1083
  }
1084
  ],
1085
  "logging_steps": 10,
 
1100
  }
1101
  },
1102
  "total_flos": 0.0,
1103
+ "train_batch_size": 8,
1104
  "trial_name": null,
1105
  "trial_params": null
1106
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5dda480b3facc8d6e3736863e3482b17102b90e8146c5d30c4543a09d4e2a61
3
- size 7224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e04bfc998f8e18fbbd2065db820a1e406e5420c727e971f5e44939e03c82128
3
+ size 7416