WSX committed on
Commit
0485494
·
verified ·
1 Parent(s): a2ba9a9

Model save

Browse files
Files changed (5) hide show
  1. README.md +3 -3
  2. all_results.json +5 -5
  3. generation_config.json +11 -3
  4. train_results.json +5 -5
  5. trainer_state.json +238 -232
README.md CHANGED
@@ -26,7 +26,7 @@ print(output["generated_text"])
26
 
27
  ## Training procedure
28
 
29
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/wsx/grpo/runs/tjif0i4g)
30
 
31
 
32
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
@@ -34,9 +34,9 @@ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing
34
  ### Framework versions
35
 
36
  - TRL: 0.16.0.dev0
37
- - Transformers: 4.49.0.dev0
38
  - Pytorch: 2.5.1
39
- - Datasets: 3.3.0
40
  - Tokenizers: 0.21.0
41
 
42
  ## Citations
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/wsx/grpo/runs/qjognkqg)
30
 
31
 
32
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
34
  ### Framework versions
35
 
36
  - TRL: 0.16.0.dev0
37
+ - Transformers: 4.50.0.dev0
38
  - Pytorch: 2.5.1
39
+ - Datasets: 3.3.1
40
  - Tokenizers: 0.21.0
41
 
42
  ## Citations
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 20.984000422099324,
4
- "train_runtime": 3511.8094,
5
- "train_samples": 5316,
6
- "train_samples_per_second": 1.514,
7
- "train_steps_per_second": 0.006
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 1.7066374281421304,
4
+ "train_runtime": 5498.7014,
5
+ "train_samples": 11300,
6
+ "train_samples_per_second": 2.055,
7
+ "train_steps_per_second": 0.004
8
  }
generation_config.json CHANGED
@@ -1,6 +1,14 @@
1
  {
2
  "bos_token_id": 151643,
3
- "eos_token_id": 151643,
4
- "max_new_tokens": 2048,
5
- "transformers_version": "4.49.0.dev0"
 
 
 
 
 
 
 
 
6
  }
 
1
  {
2
  "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.50.0.dev0"
14
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 20.984000422099324,
4
- "train_runtime": 3511.8094,
5
- "train_samples": 5316,
6
- "train_samples_per_second": 1.514,
7
- "train_steps_per_second": 0.006
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 1.7066374281421304,
4
+ "train_runtime": 5498.7014,
5
+ "train_samples": 11300,
6
+ "train_samples_per_second": 2.055,
7
+ "train_steps_per_second": 0.004
8
  }
trainer_state.json CHANGED
@@ -1,322 +1,328 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.960960960960961,
5
- "eval_steps": 100,
6
- "global_step": 20,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 199.19922256469727,
13
- "epoch": 0.04804804804804805,
14
- "grad_norm": 23.81310616225955,
15
  "kl": 0.0,
16
- "learning_rate": 1e-05,
17
- "loss": 0.0,
18
- "reward": 1.0142708644270897,
19
- "reward_std": 0.47196217253804207,
20
- "rewards/fc_accuracy_reward": 0.42833332903683186,
21
- "rewards/format_reward": 0.5846354328095913,
22
- "rewards/reasoning_steps_reward": 0.0013020833721384406,
23
  "step": 1
24
  },
25
  {
26
- "completion_length": 196.983078956604,
27
- "epoch": 0.0960960960960961,
28
- "grad_norm": 17.400559836416594,
29
  "kl": 0.0,
30
- "learning_rate": 2e-05,
31
- "loss": 0.0,
32
- "reward": 1.0837717466056347,
33
- "reward_std": 0.46565326303243637,
34
- "rewards/fc_accuracy_reward": 0.49175781942903996,
35
- "rewards/format_reward": 0.5898437723517418,
36
- "rewards/reasoning_steps_reward": 0.002170138992369175,
37
  "step": 2
38
  },
39
  {
40
- "completion_length": 219.9648494720459,
41
- "epoch": 0.14414414414414414,
42
- "grad_norm": 295118.37380778877,
43
- "kl": 9113.812822580338,
44
- "learning_rate": 1.9848077530122083e-05,
45
- "loss": 365.0355,
46
- "reward": 1.238880269229412,
47
- "reward_std": 0.40484684705734253,
48
- "rewards/fc_accuracy_reward": 0.5162239633500576,
49
- "rewards/format_reward": 0.7226562686264515,
50
- "rewards/reasoning_steps_reward": 0.0,
51
  "step": 3
52
  },
53
  {
54
- "completion_length": 274.19922733306885,
55
- "epoch": 0.1921921921921922,
56
- "grad_norm": 7394.899853833754,
57
- "kl": 308.7130870819092,
58
- "learning_rate": 1.9396926207859085e-05,
59
- "loss": 12.3462,
60
- "reward": 1.36618060618639,
61
- "reward_std": 0.33165138494223356,
62
- "rewards/fc_accuracy_reward": 0.49421875923871994,
63
- "rewards/format_reward": 0.8710937723517418,
64
- "rewards/reasoning_steps_reward": 0.0008680556202307343,
65
  "step": 4
66
  },
67
  {
68
- "completion_length": 282.16277027130127,
69
- "epoch": 0.24024024024024024,
70
- "grad_norm": 1580.0757931975875,
71
- "kl": 40.46273612976074,
72
- "learning_rate": 1.866025403784439e-05,
73
- "loss": 1.6191,
74
- "reward": 1.4065451845526695,
75
- "reward_std": 0.27760336082428694,
76
- "rewards/fc_accuracy_reward": 0.46296875178813934,
77
- "rewards/format_reward": 0.9388020969927311,
78
- "rewards/reasoning_steps_reward": 0.004774305736646056,
79
  "step": 5
80
  },
81
  {
82
- "completion_length": 287.5546989440918,
83
- "epoch": 0.2882882882882883,
84
- "grad_norm": 66.23782695389403,
85
- "kl": 1.2862091064453125,
86
- "learning_rate": 1.766044443118978e-05,
87
- "loss": 0.0516,
88
- "reward": 1.366289108991623,
89
- "reward_std": 0.26060857344418764,
90
- "rewards/fc_accuracy_reward": 0.42748698592185974,
91
- "rewards/format_reward": 0.9375000186264515,
92
- "rewards/reasoning_steps_reward": 0.0013020833721384406,
93
  "step": 6
94
  },
95
  {
96
- "completion_length": 276.5859441757202,
97
- "epoch": 0.33633633633633636,
98
- "grad_norm": 10.29947581325369,
99
- "kl": 0.10057258605957031,
100
- "learning_rate": 1.6427876096865394e-05,
101
- "loss": 0.0041,
102
- "reward": 1.4556901454925537,
103
- "reward_std": 0.269397790543735,
104
- "rewards/fc_accuracy_reward": 0.5129817798733711,
105
- "rewards/format_reward": 0.9414062723517418,
106
- "rewards/reasoning_steps_reward": 0.0013020833721384406,
107
  "step": 7
108
  },
109
  {
110
- "completion_length": 274.68620681762695,
111
- "epoch": 0.3843843843843844,
112
- "grad_norm": 135.31823127931858,
113
- "kl": 2.202852249145508,
114
- "learning_rate": 1.5000000000000002e-05,
115
- "loss": 0.0884,
116
- "reward": 1.454049527645111,
117
- "reward_std": 0.25068292673677206,
118
- "rewards/fc_accuracy_reward": 0.5048307403922081,
119
- "rewards/format_reward": 0.9479166902601719,
120
- "rewards/reasoning_steps_reward": 0.0013020833721384406,
121
  "step": 8
122
  },
123
  {
124
- "completion_length": 274.01433277130127,
125
- "epoch": 0.43243243243243246,
126
- "grad_norm": 50.596020384988215,
127
- "kl": 1.0600624084472656,
128
- "learning_rate": 1.342020143325669e-05,
129
- "loss": 0.0426,
130
- "reward": 1.4849176108837128,
131
- "reward_std": 0.23833946604281664,
132
- "rewards/fc_accuracy_reward": 0.5400390792638063,
133
- "rewards/format_reward": 0.9414062760770321,
134
- "rewards/reasoning_steps_reward": 0.0034722223645076156,
135
  "step": 9
136
  },
137
  {
138
- "completion_length": 270.1119842529297,
139
- "epoch": 0.4804804804804805,
140
- "grad_norm": 3.4647264465831866,
141
- "kl": 0.18325424194335938,
142
- "learning_rate": 1.1736481776669307e-05,
143
- "loss": 0.0075,
144
- "reward": 1.4754557833075523,
145
- "reward_std": 0.24551822617650032,
146
- "rewards/fc_accuracy_reward": 0.534049479290843,
147
- "rewards/format_reward": 0.9414062686264515,
148
- "rewards/reasoning_steps_reward": 0.0,
149
  "step": 10
150
  },
151
  {
152
- "completion_length": 268.4127674102783,
153
- "epoch": 0.5285285285285285,
154
- "grad_norm": 21954.461363262388,
155
- "kl": 1002.1737289428711,
156
- "learning_rate": 1e-05,
157
- "loss": 40.019,
158
- "reward": 1.4405295476317406,
159
- "reward_std": 0.26797763630747795,
160
- "rewards/fc_accuracy_reward": 0.4969531334936619,
161
- "rewards/format_reward": 0.9401041865348816,
162
- "rewards/reasoning_steps_reward": 0.0034722223645076156,
163
  "step": 11
164
  },
165
  {
166
- "completion_length": 267.64323711395264,
167
- "epoch": 0.5765765765765766,
168
- "grad_norm": 122.50226092016571,
169
- "kl": 7.038032531738281,
170
- "learning_rate": 8.263518223330698e-06,
171
- "loss": 0.2818,
172
- "reward": 1.4381510838866234,
173
- "reward_std": 0.27108312491327524,
174
- "rewards/fc_accuracy_reward": 0.4954427257180214,
175
- "rewards/format_reward": 0.9427083507180214,
176
- "rewards/reasoning_steps_reward": 0.0,
177
  "step": 12
178
  },
179
  {
180
- "completion_length": 268.85547828674316,
181
- "epoch": 0.6246246246246246,
182
- "grad_norm": 7.964152420466192,
183
- "kl": 0.13129806518554688,
184
- "learning_rate": 6.579798566743314e-06,
185
- "loss": 0.0053,
186
- "reward": 1.400338590145111,
187
- "reward_std": 0.24977970868349075,
188
- "rewards/fc_accuracy_reward": 0.47716146148741245,
189
- "rewards/format_reward": 0.9231771007180214,
190
- "rewards/reasoning_steps_reward": 0.0,
191
  "step": 13
192
  },
193
  {
194
- "completion_length": 265.68360328674316,
195
- "epoch": 0.6726726726726727,
196
- "grad_norm": 8.142268019517086,
197
- "kl": 0.7337493896484375,
198
- "learning_rate": 5.000000000000003e-06,
199
- "loss": 0.0295,
200
- "reward": 1.4463759139180183,
201
- "reward_std": 0.26006509829312563,
202
- "rewards/fc_accuracy_reward": 0.509309895336628,
203
- "rewards/format_reward": 0.9361979402601719,
204
- "rewards/reasoning_steps_reward": 0.0008680556202307343,
205
  "step": 14
206
  },
207
  {
208
- "completion_length": 251.79297924041748,
209
- "epoch": 0.7207207207207207,
210
- "grad_norm": 5.0488043029036005,
211
- "kl": 0.1615753173828125,
212
- "learning_rate": 3.5721239031346067e-06,
213
- "loss": 0.0065,
214
- "reward": 1.46674482524395,
215
- "reward_std": 0.24269275972619653,
216
- "rewards/fc_accuracy_reward": 0.5214322991669178,
217
- "rewards/format_reward": 0.9440104365348816,
218
- "rewards/reasoning_steps_reward": 0.0013020833721384406,
219
  "step": 15
220
  },
221
  {
222
- "completion_length": 252.04167461395264,
223
- "epoch": 0.7687687687687688,
224
- "grad_norm": 67.89594390549263,
225
- "kl": 2.355632781982422,
226
- "learning_rate": 2.339555568810221e-06,
227
- "loss": 0.0944,
228
- "reward": 1.4977865368127823,
229
- "reward_std": 0.2157565113157034,
230
- "rewards/fc_accuracy_reward": 0.5446614678949118,
231
- "rewards/format_reward": 0.9505208544433117,
232
- "rewards/reasoning_steps_reward": 0.0026041667442768812,
233
  "step": 16
234
  },
235
  {
236
- "completion_length": 253.686203956604,
237
- "epoch": 0.8168168168168168,
238
- "grad_norm": 4.6776561505879375,
239
- "kl": 0.4775199890136719,
240
- "learning_rate": 1.339745962155613e-06,
241
- "loss": 0.0192,
242
- "reward": 1.4627083614468575,
243
- "reward_std": 0.22646328434348106,
244
- "rewards/fc_accuracy_reward": 0.5004687625914812,
245
- "rewards/format_reward": 0.9596354402601719,
246
- "rewards/reasoning_steps_reward": 0.0026041667442768812,
247
  "step": 17
248
  },
249
  {
250
- "completion_length": 259.276047706604,
251
- "epoch": 0.8648648648648649,
252
- "grad_norm": 3.3536158930168773,
253
- "kl": 0.314849853515625,
254
- "learning_rate": 6.030737921409169e-07,
255
- "loss": 0.0127,
256
- "reward": 1.3979297056794167,
257
- "reward_std": 0.23455267632380128,
258
- "rewards/fc_accuracy_reward": 0.46694011986255646,
259
- "rewards/format_reward": 0.9309896007180214,
260
- "rewards/reasoning_steps_reward": 0.0,
261
  "step": 18
262
  },
263
  {
264
- "completion_length": 256.66146755218506,
265
- "epoch": 0.9129129129129129,
266
- "grad_norm": 1.5380141665723024,
267
- "kl": 0.17271041870117188,
268
- "learning_rate": 1.519224698779198e-07,
269
- "loss": 0.007,
270
- "reward": 1.479730948805809,
271
- "reward_std": 0.22270044265314937,
272
- "rewards/fc_accuracy_reward": 0.5261718854308128,
273
- "rewards/format_reward": 0.9531250186264515,
274
- "rewards/reasoning_steps_reward": 0.0004340278101153672,
275
  "step": 19
276
  },
277
  {
278
- "completion_length": 259.4140691757202,
279
- "epoch": 0.960960960960961,
280
- "grad_norm": 0.8561283187855229,
281
- "kl": 0.24119949340820312,
282
- "learning_rate": 0.0,
283
- "loss": 0.0097,
284
- "reward": 1.453385479748249,
285
- "reward_std": 0.2447610031813383,
286
- "rewards/fc_accuracy_reward": 0.5054687578231096,
287
- "rewards/format_reward": 0.9453125149011612,
288
- "rewards/reasoning_steps_reward": 0.0026041667442768812,
289
  "step": 20
290
  },
291
  {
292
- "epoch": 0.960960960960961,
293
- "step": 20,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  "total_flos": 0.0,
295
- "train_loss": 20.984000422099324,
296
- "train_runtime": 3511.8094,
297
- "train_samples_per_second": 1.514,
298
- "train_steps_per_second": 0.006
299
  }
300
  ],
301
  "logging_steps": 1,
302
- "max_steps": 20,
303
  "num_input_tokens_seen": 0,
304
  "num_train_epochs": 1,
305
- "save_steps": 500,
306
  "stateful_callbacks": {
307
  "TrainerControl": {
308
  "args": {
309
  "should_epoch_stop": false,
310
  "should_evaluate": false,
311
  "should_log": false,
312
- "should_save": false,
313
- "should_training_stop": false
314
  },
315
  "attributes": {}
316
  }
317
  },
318
  "total_flos": 0.0,
319
- "train_batch_size": 16,
320
  "trial_name": null,
321
  "trial_params": null
322
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9943502824858758,
5
+ "eval_steps": 500,
6
+ "global_step": 22,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 276.02735233306885,
13
+ "epoch": 0.04519774011299435,
14
+ "grad_norm": 0.13002647486414434,
15
  "kl": 0.0,
16
+ "learning_rate": 6.666666666666667e-06,
17
+ "loss": 0.0303,
18
+ "reward": 1.3704427555203438,
19
+ "reward_std": 0.1837239097803831,
20
+ "rewards/fc_accuracy_reward": 0.4309895895421505,
21
+ "rewards/format_reward": 0.9394531473517418,
 
22
  "step": 1
23
  },
24
  {
25
+ "completion_length": 292.13477420806885,
26
+ "epoch": 0.0903954802259887,
27
+ "grad_norm": 0.14619501253296655,
28
  "kl": 0.0,
29
+ "learning_rate": 1.3333333333333333e-05,
30
+ "loss": 0.0469,
31
+ "reward": 1.3990885764360428,
32
+ "reward_std": 0.21469345316290855,
33
+ "rewards/fc_accuracy_reward": 0.44856772013008595,
34
+ "rewards/format_reward": 0.9505208507180214,
 
35
  "step": 2
36
  },
37
  {
38
+ "completion_length": 268.9563903808594,
39
+ "epoch": 0.13559322033898305,
40
+ "grad_norm": 0.13036409926543296,
41
+ "kl": 0.0002338886260986328,
42
+ "learning_rate": 2e-05,
43
+ "loss": 0.0319,
44
+ "reward": 1.3704427406191826,
45
+ "reward_std": 0.18289842084050179,
46
+ "rewards/fc_accuracy_reward": 0.41992188431322575,
47
+ "rewards/format_reward": 0.9505208656191826,
 
48
  "step": 3
49
  },
50
  {
51
+ "completion_length": 237.50716876983643,
52
+ "epoch": 0.1807909604519774,
53
+ "grad_norm": 0.12085621262181591,
54
+ "kl": 0.00720977783203125,
55
+ "learning_rate": 1.9863613034027224e-05,
56
+ "loss": 0.0065,
57
+ "reward": 1.427734412252903,
58
+ "reward_std": 0.1744816219434142,
59
+ "rewards/fc_accuracy_reward": 0.46614584885537624,
60
+ "rewards/format_reward": 0.9615885503590107,
 
61
  "step": 4
62
  },
63
  {
64
+ "completion_length": 198.7910213470459,
65
+ "epoch": 0.22598870056497175,
66
+ "grad_norm": 1.1449772190729142,
67
+ "kl": 0.0814666748046875,
68
+ "learning_rate": 1.9458172417006347e-05,
69
+ "loss": -0.007,
70
+ "reward": 1.2617187947034836,
71
+ "reward_std": 0.17365613672882318,
72
+ "rewards/fc_accuracy_reward": 0.3222656352445483,
73
+ "rewards/format_reward": 0.939453125,
 
74
  "step": 5
75
  },
76
  {
77
+ "completion_length": 231.09896564483643,
78
+ "epoch": 0.2711864406779661,
79
+ "grad_norm": 0.1913660515760745,
80
+ "kl": 0.045501708984375,
81
+ "learning_rate": 1.879473751206489e-05,
82
+ "loss": 0.0077,
83
+ "reward": 1.3893229588866234,
84
+ "reward_std": 0.14095465373247862,
85
+ "rewards/fc_accuracy_reward": 0.43489584513008595,
86
+ "rewards/format_reward": 0.9544270932674408,
 
87
  "step": 6
88
  },
89
  {
90
+ "completion_length": 247.44727039337158,
91
+ "epoch": 0.3163841807909605,
92
+ "grad_norm": 0.13567082283951828,
93
+ "kl": 0.045623779296875,
94
+ "learning_rate": 1.789140509396394e-05,
95
+ "loss": 0.0065,
96
+ "reward": 1.3470052555203438,
97
+ "reward_std": 0.14320992957800627,
98
+ "rewards/fc_accuracy_reward": 0.3938802182674408,
99
+ "rewards/format_reward": 0.9531250074505806,
 
100
  "step": 7
101
  },
102
  {
103
+ "completion_length": 269.38086795806885,
104
+ "epoch": 0.3615819209039548,
105
+ "grad_norm": 0.16971700336774798,
106
+ "kl": 0.060699462890625,
107
+ "learning_rate": 1.6772815716257414e-05,
108
+ "loss": 0.0085,
109
+ "reward": 1.3372396156191826,
110
+ "reward_std": 0.1406525035854429,
111
+ "rewards/fc_accuracy_reward": 0.39257813431322575,
112
+ "rewards/format_reward": 0.9446614682674408,
 
113
  "step": 8
114
  },
115
  {
116
+ "completion_length": 295.05665016174316,
117
+ "epoch": 0.4067796610169492,
118
+ "grad_norm": 1.3287845887377334,
119
+ "kl": 0.070892333984375,
120
+ "learning_rate": 1.5469481581224274e-05,
121
+ "loss": 0.0192,
122
+ "reward": 1.4036458730697632,
123
+ "reward_std": 0.18207293096929789,
124
+ "rewards/fc_accuracy_reward": 0.43750000931322575,
125
+ "rewards/format_reward": 0.9661458507180214,
 
126
  "step": 9
127
  },
128
  {
129
+ "completion_length": 307.4420690536499,
130
+ "epoch": 0.4519774011299435,
131
+ "grad_norm": 54738.02031388349,
132
+ "kl": 394.0651397705078,
133
+ "learning_rate": 1.4016954246529697e-05,
134
+ "loss": 37.1931,
135
+ "reward": 1.3352864980697632,
136
+ "reward_std": 0.2137870010919869,
137
+ "rewards/fc_accuracy_reward": 0.4199218861758709,
138
+ "rewards/format_reward": 0.915364608168602,
 
139
  "step": 10
140
  },
141
  {
142
+ "completion_length": 314.0898551940918,
143
+ "epoch": 0.4971751412429379,
144
+ "grad_norm": 2.107091829265173,
145
+ "kl": 0.095855712890625,
146
+ "learning_rate": 1.2454854871407993e-05,
147
+ "loss": 0.0338,
148
+ "reward": 1.3177083730697632,
149
+ "reward_std": 0.2345268540084362,
150
+ "rewards/fc_accuracy_reward": 0.4108072966337204,
151
+ "rewards/format_reward": 0.9069010689854622,
 
152
  "step": 11
153
  },
154
  {
155
+ "completion_length": 318.7447986602783,
156
+ "epoch": 0.5423728813559322,
157
+ "grad_norm": 0.20351280734446026,
158
+ "kl": 0.081268310546875,
159
+ "learning_rate": 1.0825793454723325e-05,
160
+ "loss": 0.0176,
161
+ "reward": 1.2675781697034836,
162
+ "reward_std": 0.3210527803748846,
163
+ "rewards/fc_accuracy_reward": 0.397135429084301,
164
+ "rewards/format_reward": 0.8704427294433117,
 
165
  "step": 12
166
  },
167
  {
168
+ "completion_length": 311.57813262939453,
169
+ "epoch": 0.5875706214689266,
170
+ "grad_norm": 0.1737011976596144,
171
+ "kl": 0.07958984375,
172
+ "learning_rate": 9.174206545276678e-06,
173
+ "loss": 0.0088,
174
+ "reward": 1.3684896156191826,
175
+ "reward_std": 0.23807168938219547,
176
+ "rewards/fc_accuracy_reward": 0.4479166828095913,
177
+ "rewards/format_reward": 0.9205729365348816,
 
178
  "step": 13
179
  },
180
  {
181
+ "completion_length": 301.56706619262695,
182
+ "epoch": 0.632768361581921,
183
+ "grad_norm": 18.70037949885207,
184
+ "kl": 0.586456298828125,
185
+ "learning_rate": 7.545145128592009e-06,
186
+ "loss": 0.0599,
187
+ "reward": 1.3203125298023224,
188
+ "reward_std": 0.20432353112846613,
189
+ "rewards/fc_accuracy_reward": 0.40820313803851604,
190
+ "rewards/format_reward": 0.9121094010770321,
 
191
  "step": 14
192
  },
193
  {
194
+ "completion_length": 289.6764450073242,
195
+ "epoch": 0.6779661016949152,
196
+ "grad_norm": 0.15577324164046125,
197
+ "kl": 0.077789306640625,
198
+ "learning_rate": 5.983045753470308e-06,
199
+ "loss": 0.011,
200
+ "reward": 1.3496094048023224,
201
+ "reward_std": 0.21356581384316087,
202
+ "rewards/fc_accuracy_reward": 0.4186198003590107,
203
+ "rewards/format_reward": 0.9309896044433117,
 
204
  "step": 15
205
  },
206
  {
207
+ "completion_length": 286.3886785507202,
208
+ "epoch": 0.7231638418079096,
209
+ "grad_norm": 5.326707764035434,
210
+ "kl": 0.0885009765625,
211
+ "learning_rate": 4.530518418775734e-06,
212
+ "loss": 0.0102,
213
+ "reward": 1.3658854514360428,
214
+ "reward_std": 0.23490996472537518,
215
+ "rewards/fc_accuracy_reward": 0.44401043094694614,
216
+ "rewards/format_reward": 0.9218750260770321,
 
217
  "step": 16
218
  },
219
  {
220
+ "completion_length": 272.7597780227661,
221
+ "epoch": 0.768361581920904,
222
+ "grad_norm": 0.33239438768575263,
223
+ "kl": 0.075653076171875,
224
+ "learning_rate": 3.2271842837425917e-06,
225
+ "loss": 0.0083,
226
+ "reward": 1.3222656697034836,
227
+ "reward_std": 0.23243350349366665,
228
+ "rewards/fc_accuracy_reward": 0.4173177182674408,
229
+ "rewards/format_reward": 0.9049479365348816,
 
230
  "step": 17
231
  },
232
  {
233
+ "completion_length": 267.5846462249756,
234
+ "epoch": 0.8135593220338984,
235
+ "grad_norm": 0.16069383676764726,
236
+ "kl": 0.07232666015625,
237
+ "learning_rate": 2.1085949060360654e-06,
238
+ "loss": 0.0114,
239
+ "reward": 1.3854166939854622,
240
+ "reward_std": 0.242501275613904,
241
+ "rewards/fc_accuracy_reward": 0.47070313803851604,
242
+ "rewards/format_reward": 0.9147135652601719,
 
243
  "step": 18
244
  },
245
  {
246
+ "completion_length": 266.1367254257202,
247
+ "epoch": 0.8587570621468926,
248
+ "grad_norm": 0.14326034602692705,
249
+ "kl": 0.0689239501953125,
250
+ "learning_rate": 1.2052624879351105e-06,
251
+ "loss": 0.0075,
252
+ "reward": 1.3854166939854622,
253
+ "reward_std": 0.20935742277652025,
254
+ "rewards/fc_accuracy_reward": 0.44531251676380634,
255
+ "rewards/format_reward": 0.940104179084301,
 
256
  "step": 19
257
  },
258
  {
259
+ "completion_length": 260.7949285507202,
260
+ "epoch": 0.903954802259887,
261
+ "grad_norm": 0.14485988944724762,
262
+ "kl": 0.06939697265625,
263
+ "learning_rate": 5.418275829936537e-07,
264
+ "loss": 0.0098,
265
+ "reward": 1.395182341337204,
266
+ "reward_std": 0.2115317303687334,
267
+ "rewards/fc_accuracy_reward": 0.47330730222165585,
268
+ "rewards/format_reward": 0.9218750260770321,
 
269
  "step": 20
270
  },
271
  {
272
+ "completion_length": 262.3704528808594,
273
+ "epoch": 0.9491525423728814,
274
+ "grad_norm": 0.19459843380674965,
275
+ "kl": 0.0714874267578125,
276
+ "learning_rate": 1.3638696597277678e-07,
277
+ "loss": 0.0072,
278
+ "reward": 1.3489583656191826,
279
+ "reward_std": 0.20432352973148227,
280
+ "rewards/fc_accuracy_reward": 0.4348958469927311,
281
+ "rewards/format_reward": 0.9140625186264515,
282
+ "step": 21
283
+ },
284
+ {
285
+ "completion_length": 263.5794382095337,
286
+ "epoch": 0.9943502824858758,
287
+ "grad_norm": 0.16933108100699076,
288
+ "kl": 0.067138671875,
289
+ "learning_rate": 0.0,
290
+ "loss": 0.0169,
291
+ "reward": 1.354817733168602,
292
+ "reward_std": 0.242501275613904,
293
+ "rewards/fc_accuracy_reward": 0.4348958469927311,
294
+ "rewards/format_reward": 0.9199219010770321,
295
+ "step": 22
296
+ },
297
+ {
298
+ "epoch": 0.9943502824858758,
299
+ "step": 22,
300
  "total_flos": 0.0,
301
+ "train_loss": 1.7066374281421304,
302
+ "train_runtime": 5498.7014,
303
+ "train_samples_per_second": 2.055,
304
+ "train_steps_per_second": 0.004
305
  }
306
  ],
307
  "logging_steps": 1,
308
+ "max_steps": 22,
309
  "num_input_tokens_seen": 0,
310
  "num_train_epochs": 1,
311
+ "save_steps": 10,
312
  "stateful_callbacks": {
313
  "TrainerControl": {
314
  "args": {
315
  "should_epoch_stop": false,
316
  "should_evaluate": false,
317
  "should_log": false,
318
+ "should_save": true,
319
+ "should_training_stop": true
320
  },
321
  "attributes": {}
322
  }
323
  },
324
  "total_flos": 0.0,
325
+ "train_batch_size": 32,
326
  "trial_name": null,
327
  "trial_params": null
328
  }