greatxue1 committed on
Commit 7a522e5 · verified · 1 Parent(s): 972fda3

Model save

README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+ library_name: transformers
+ model_name: DeepSeek-R1-Distill-Qwen-7B-GRPO
+ tags:
+ - generated_from_trainer
+ - trl
+ - grpo
+ licence: license
+ ---
+
+ # Model Card for DeepSeek-R1-Distill-Qwen-7B-GRPO
+
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+
+ ## Quick start
+
+ ```python
+ from transformers import pipeline
+
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="greatxue1/DeepSeek-R1-Distill-Qwen-7B-GRPO", device="cuda")
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
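+
+ The same call without the pipeline wrapper, as a minimal sketch (reusing `question` from above; `device_map="auto"` assumes accelerate is installed):
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_id = "greatxue1/DeepSeek-R1-Distill-Qwen-7B-GRPO"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+
+ # Build the prompt via the chat template, generate, and decode only the new tokens.
+ inputs = tokenizer.apply_chat_template(
+     [{"role": "user", "content": question}],
+     add_generation_prompt=True,
+     return_tensors="pt",
+ ).to(model.device)
+ outputs = model.generate(inputs, max_new_tokens=128)
+ print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
+ ```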
+
+ ## Training procedure
+
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zhongkaixue-university-of-oxford/huggingface/runs/vamiu0yb)
+
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
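+
+ At GRPO's core is a group-relative advantage: several completions are sampled per prompt, and each completion's reward is normalized against its group's statistics, so no learned value model is needed. A minimal sketch of that normalization (illustrative only, not the TRL implementation):
+
+ ```python
+ import statistics
+
+ def group_relative_advantages(rewards, eps=1e-4):
+     """Normalize each reward by its group's mean and std, GRPO-style."""
+     mean = statistics.mean(rewards)
+     std = statistics.pstdev(rewards)
+     return [(r - mean) / (std + eps) for r in rewards]
+
+ # One group of G=4 completions sampled for a single prompt:
+ print(group_relative_advantages([1.0, 0.5, 2.0, 0.5]))
+ ```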
+
+ ### Framework versions
+
+ - TRL: 0.15.2
+ - Transformers: 4.49.0
+ - Pytorch: 2.5.1+cu121
+ - Datasets: 3.0.2
+ - Tokenizers: 0.21.0
+
+ ## Citations
+
+ Cite GRPO as:
+
+ ```bibtex
+ @article{zhihong2024deepseekmath,
+     title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+     author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+     year = 2024,
+     eprint = {arXiv:2402.03300},
+ }
+ ```
+
+ Cite TRL as:
+
+ ```bibtex
+ @misc{vonwerra2022trl,
+     title = {{TRL: Transformer Reinforcement Learning}},
+     author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+     year = 2020,
+     journal = {GitHub repository},
+     publisher = {GitHub},
+     howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "total_flos": 0.0,
+     "train_loss": 0.0009309215234963494,
+     "train_runtime": 2265.8224,
+     "train_samples": 781,
+     "train_samples_per_second": 0.345,
+     "train_steps_per_second": 0.011
+ }
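
The throughput figures are consistent with the runtime; a quick check, assuming they are simply the sample and step counts divided by wall-clock seconds:

```python
train_runtime = 2265.8224  # seconds, from all_results.json

# 781 training samples and 24 optimizer steps over the run:
print(round(781 / train_runtime, 3))  # 0.345 -> train_samples_per_second
print(round(24 / train_runtime, 3))   # 0.011 -> train_steps_per_second
```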
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 151646,
+   "do_sample": true,
+   "eos_token_id": 151643,
+   "temperature": 0.6,
+   "top_p": 0.95,
+   "transformers_version": "4.49.0"
+ }
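
These are the sampling defaults `generate` applies when no overrides are passed (sampling on, temperature 0.6, top-p 0.95, matching DeepSeek-R1's recommended settings). A minimal sketch for inspecting them from the Hub:

```python
from transformers import GenerationConfig

# Fetch the saved generation defaults for this checkpoint.
gen_config = GenerationConfig.from_pretrained("greatxue1/DeepSeek-R1-Distill-Qwen-7B-GRPO")
print(gen_config.do_sample, gen_config.temperature, gen_config.top_p)
```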
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "total_flos": 0.0,
+     "train_loss": 0.0009309215234963494,
+     "train_runtime": 2265.8224,
+     "train_samples": 781,
+     "train_samples_per_second": 0.345,
+     "train_steps_per_second": 0.011
+ }
trainer_state.json ADDED
@@ -0,0 +1,378 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.9795918367346939,
+   "eval_steps": 500,
+   "global_step": 24,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "completion_length": 394.7544860839844,
+       "epoch": 0.04081632653061224,
+       "grad_norm": 0.5584128667460793,
+       "kl": 0.0,
+       "learning_rate": 1.6666666666666667e-06,
+       "loss": 0.0001,
+       "reward": 1.335937574505806,
+       "reward_std": 0.6638279929757118,
+       "rewards/format_reward": 0.09375000419095159,
+       "rewards/instruction_follow_reward": 0.2232143022119999,
+       "rewards/tag_count_reward": 0.572544664144516,
+       "step": 1
+     },
+     {
+       "completion_length": 407.3616256713867,
+       "epoch": 0.08163265306122448,
+       "grad_norm": 0.5809418852960418,
+       "kl": 0.0,
+       "learning_rate": 3.3333333333333333e-06,
+       "loss": 0.0,
+       "reward": 1.359151840209961,
+       "reward_std": 0.7929508537054062,
+       "rewards/format_reward": 0.10267857555299997,
+       "rewards/instruction_follow_reward": 0.23392858356237411,
+       "rewards/tag_count_reward": 0.5546875149011612,
+       "step": 2
+     },
+     {
+       "completion_length": 425.6607360839844,
+       "epoch": 0.12244897959183673,
+       "grad_norm": 0.5488272016876289,
+       "kl": 0.0003504753112792969,
+       "learning_rate": 5e-06,
+       "loss": 0.0,
+       "reward": 1.3707590103149414,
+       "reward_std": 0.8819468468427658,
+       "rewards/format_reward": 0.11160714738070965,
+       "rewards/instruction_follow_reward": 0.23184525407850742,
+       "rewards/tag_count_reward": 0.5636160969734192,
+       "step": 3
+     },
+     {
+       "completion_length": 394.3437728881836,
+       "epoch": 0.16326530612244897,
+       "grad_norm": 0.6044636536170302,
+       "kl": 0.001239776611328125,
+       "learning_rate": 4.97486935900654e-06,
+       "loss": 0.0001,
+       "reward": 1.467633992433548,
+       "reward_std": 0.8601260483264923,
+       "rewards/format_reward": 0.19642858393490314,
+       "rewards/instruction_follow_reward": 0.2220982275903225,
+       "rewards/tag_count_reward": 0.6049107313156128,
+       "step": 4
+     },
+     {
+       "completion_length": 416.5223388671875,
+       "epoch": 0.20408163265306123,
+       "grad_norm": 0.5931555061780838,
+       "kl": 0.01104736328125,
+       "learning_rate": 4.900038813018817e-06,
+       "loss": 0.0004,
+       "reward": 1.9301143288612366,
+       "reward_std": 0.8981596827507019,
+       "rewards/format_reward": 0.5133928880095482,
+       "rewards/instruction_follow_reward": 0.21628808975219727,
+       "rewards/tag_count_reward": 0.767857164144516,
+       "step": 5
+     },
+     {
+       "completion_length": 420.4241256713867,
+       "epoch": 0.24489795918367346,
+       "grad_norm": 0.6761697930052907,
+       "kl": 0.03021240234375,
+       "learning_rate": 4.777179952780443e-06,
+       "loss": 0.0012,
+       "reward": 2.2354912161827087,
+       "reward_std": 0.8338466733694077,
+       "rewards/format_reward": 0.7232143133878708,
+       "rewards/instruction_follow_reward": 0.2187500111758709,
+       "rewards/tag_count_reward": 0.856026828289032,
+       "step": 6
+     },
+     {
+       "completion_length": 394.3393096923828,
+       "epoch": 0.2857142857142857,
+       "grad_norm": 2.459773044170621,
+       "kl": 0.13006591796875,
+       "learning_rate": 4.609037242210989e-06,
+       "loss": 0.0052,
+       "reward": 2.178348273038864,
+       "reward_std": 1.000592678785324,
+       "rewards/format_reward": 0.7544643133878708,
+       "rewards/instruction_follow_reward": 0.18854167126119137,
+       "rewards/tag_count_reward": 0.8582589477300644,
+       "step": 7
+     },
+     {
+       "completion_length": 365.56697845458984,
+       "epoch": 0.32653061224489793,
+       "grad_norm": 0.6203686756559748,
+       "kl": 0.0291900634765625,
+       "learning_rate": 4.39936671161711e-06,
+       "loss": 0.0012,
+       "reward": 2.3113840222358704,
+       "reward_std": 0.9184626936912537,
+       "rewards/format_reward": 0.7098214477300644,
+       "rewards/instruction_follow_reward": 0.2444196529686451,
+       "rewards/tag_count_reward": 0.8683035969734192,
+       "step": 8
+     },
+     {
+       "completion_length": 381.06251525878906,
+       "epoch": 0.3673469387755102,
+       "grad_norm": 0.5837189440797563,
+       "kl": 0.016021728515625,
+       "learning_rate": 4.152852054182151e-06,
+       "loss": 0.0006,
+       "reward": 2.10881707072258,
+       "reward_std": 0.9214754402637482,
+       "rewards/format_reward": 0.7008928954601288,
+       "rewards/instruction_follow_reward": 0.1835937611758709,
+       "rewards/tag_count_reward": 0.8571428954601288,
+       "step": 9
+     },
+     {
+       "completion_length": 352.6785888671875,
+       "epoch": 0.40816326530612246,
+       "grad_norm": 0.6626187128387949,
+       "kl": 0.0197906494140625,
+       "learning_rate": 3.875e-06,
+       "loss": 0.0008,
+       "reward": 2.1169643998146057,
+       "reward_std": 0.8091428875923157,
+       "rewards/format_reward": 0.6607143133878708,
+       "rewards/instruction_follow_reward": 0.1985863298177719,
+       "rewards/tag_count_reward": 0.8604911118745804,
+       "step": 10
+     },
+     {
+       "completion_length": 369.41966247558594,
+       "epoch": 0.4489795918367347,
+       "grad_norm": 0.6348377712692719,
+       "kl": 0.0179595947265625,
+       "learning_rate": 3.5720173048243896e-06,
+       "loss": 0.0007,
+       "reward": 2.5970983505249023,
+       "reward_std": 1.0680624097585678,
+       "rewards/format_reward": 0.6026786118745804,
+       "rewards/instruction_follow_reward": 0.388392873108387,
+       "rewards/tag_count_reward": 0.8292410969734192,
+       "step": 11
+     },
+     {
+       "completion_length": 375.0268020629883,
+       "epoch": 0.4897959183673469,
+       "grad_norm": 0.6179040419604938,
+       "kl": 0.0157623291015625,
+       "learning_rate": 3.2506721014017075e-06,
+       "loss": 0.0006,
+       "reward": 2.2924107909202576,
+       "reward_std": 1.0153658390045166,
+       "rewards/format_reward": 0.6071428805589676,
+       "rewards/instruction_follow_reward": 0.2786458507180214,
+       "rewards/tag_count_reward": 0.84933041036129,
+       "step": 12
+     },
+     {
+       "completion_length": 381.9241256713867,
+       "epoch": 0.5306122448979592,
+       "grad_norm": 0.6671138772835987,
+       "kl": 0.03009033203125,
+       "learning_rate": 2.918142710569455e-06,
+       "loss": 0.0012,
+       "reward": 2.4388394355773926,
+       "reward_std": 1.0967597514390945,
+       "rewards/format_reward": 0.6205357313156128,
+       "rewards/instruction_follow_reward": 0.32633931189775467,
+       "rewards/tag_count_reward": 0.8392857313156128,
+       "step": 13
+     },
+     {
+       "completion_length": 363.4107208251953,
+       "epoch": 0.5714285714285714,
+       "grad_norm": 0.6026307189636135,
+       "kl": 0.017669677734375,
+       "learning_rate": 2.5818572894305453e-06,
+       "loss": 0.0007,
+       "reward": 2.404017984867096,
+       "reward_std": 1.1934520304203033,
+       "rewards/format_reward": 0.5714285969734192,
+       "rewards/instruction_follow_reward": 0.3214285895228386,
+       "rewards/tag_count_reward": 0.8683036118745804,
+       "step": 14
+     },
+     {
+       "completion_length": 424.05358123779297,
+       "epoch": 0.6122448979591837,
+       "grad_norm": 0.5671014284506704,
+       "kl": 0.0147705078125,
+       "learning_rate": 2.2493278985982932e-06,
+       "loss": 0.0006,
+       "reward": 2.2103636860847473,
+       "reward_std": 1.0440057069063187,
+       "rewards/format_reward": 0.651785746216774,
+       "rewards/instruction_follow_reward": 0.2267431989312172,
+       "rewards/tag_count_reward": 0.8783482760190964,
+       "step": 15
+     },
+     {
+       "completion_length": 361.37947845458984,
+       "epoch": 0.6530612244897959,
+       "grad_norm": 0.6208470872600845,
+       "kl": 0.019134521484375,
+       "learning_rate": 1.9279826951756115e-06,
+       "loss": 0.0008,
+       "reward": 2.465401828289032,
+       "reward_std": 1.0677553862333298,
+       "rewards/format_reward": 0.6964285969734192,
+       "rewards/instruction_follow_reward": 0.2924107313156128,
+       "rewards/tag_count_reward": 0.8917411118745804,
+       "step": 16
+     },
+     {
+       "completion_length": 367.0848388671875,
+       "epoch": 0.6938775510204082,
+       "grad_norm": 0.6513303765560159,
+       "kl": 0.0290679931640625,
+       "learning_rate": 1.6250000000000007e-06,
+       "loss": 0.0012,
+       "reward": 2.10491082072258,
+       "reward_std": 0.9648873805999756,
+       "rewards/format_reward": 0.6339286118745804,
+       "rewards/instruction_follow_reward": 0.19642857927829027,
+       "rewards/tag_count_reward": 0.8816964775323868,
+       "step": 17
+     },
+     {
+       "completion_length": 378.62501525878906,
+       "epoch": 0.7346938775510204,
+       "grad_norm": 0.5707111135110878,
+       "kl": 0.0172271728515625,
+       "learning_rate": 1.3471479458178499e-06,
+       "loss": 0.0007,
+       "reward": 2.5422155261039734,
+       "reward_std": 0.8247152641415596,
+       "rewards/format_reward": 0.7812500298023224,
+       "rewards/instruction_follow_reward": 0.2793247886002064,
+       "rewards/tag_count_reward": 0.9229911118745804,
+       "step": 18
+     },
+     {
+       "completion_length": 354.74108123779297,
+       "epoch": 0.7755102040816326,
+       "grad_norm": 0.5544484847237519,
+       "kl": 0.023040771484375,
+       "learning_rate": 1.1006332883828912e-06,
+       "loss": 0.0009,
+       "reward": 2.662946581840515,
+       "reward_std": 1.0467701256275177,
+       "rewards/format_reward": 0.7366071790456772,
+       "rewards/instruction_follow_reward": 0.3437500149011612,
+       "rewards/tag_count_reward": 0.895089328289032,
+       "step": 19
+     },
+     {
+       "completion_length": 383.00447845458984,
+       "epoch": 0.8163265306122449,
+       "grad_norm": 0.510823042158658,
+       "kl": 0.0205535888671875,
+       "learning_rate": 8.909627577890121e-07,
+       "loss": 0.0008,
+       "reward": 2.3368303775787354,
+       "reward_std": 0.9361487179994583,
+       "rewards/format_reward": 0.7633928954601288,
+       "rewards/instruction_follow_reward": 0.22500000894069672,
+       "rewards/tag_count_reward": 0.8984375596046448,
+       "step": 20
+     },
+     {
+       "completion_length": 357.4151916503906,
+       "epoch": 0.8571428571428571,
+       "grad_norm": 0.5515191742215413,
+       "kl": 0.0199737548828125,
+       "learning_rate": 7.228200472195574e-07,
+       "loss": 0.0008,
+       "reward": 2.494419753551483,
+       "reward_std": 0.9745698273181915,
+       "rewards/format_reward": 0.7767857611179352,
+       "rewards/instruction_follow_reward": 0.2700892984867096,
+       "rewards/tag_count_reward": 0.9073661118745804,
+       "step": 21
+     },
+     {
+       "completion_length": 352.87947845458984,
+       "epoch": 0.8979591836734694,
+       "grad_norm": 0.6878725311389932,
+       "kl": 0.036224365234375,
+       "learning_rate": 5.999611869811834e-07,
+       "loss": 0.0014,
+       "reward": 2.4720982909202576,
+       "reward_std": 0.9217582643032074,
+       "rewards/format_reward": 0.7276785969734192,
+       "rewards/instruction_follow_reward": 0.2857142984867096,
+       "rewards/tag_count_reward": 0.887276828289032,
+       "step": 22
+     },
+     {
+       "completion_length": 373.3035888671875,
+       "epoch": 0.9387755102040817,
+       "grad_norm": 0.5789373606479209,
+       "kl": 0.02337646484375,
+       "learning_rate": 5.251306409934609e-07,
+       "loss": 0.0009,
+       "reward": 2.023437589406967,
+       "reward_std": 0.9313821792602539,
+       "rewards/format_reward": 0.7500000149011612,
+       "rewards/instruction_follow_reward": 0.12946429289877415,
+       "rewards/tag_count_reward": 0.8850446790456772,
+       "step": 23
+     },
+     {
+       "completion_length": 393.03572845458984,
+       "epoch": 0.9795918367346939,
+       "grad_norm": 0.59441068526159,
+       "kl": 0.03009033203125,
+       "learning_rate": 5.000000000000001e-07,
+       "loss": 0.0012,
+       "reward": 2.5097524523735046,
+       "reward_std": 0.8981894552707672,
+       "rewards/format_reward": 0.7857143133878708,
+       "rewards/instruction_follow_reward": 0.2729680556803942,
+       "rewards/tag_count_reward": 0.9051339775323868,
+       "step": 24
+     },
+     {
+       "epoch": 0.9795918367346939,
+       "step": 24,
+       "total_flos": 0.0,
+       "train_loss": 0.0009309215234963494,
+       "train_runtime": 2265.8224,
+       "train_samples_per_second": 0.345,
+       "train_steps_per_second": 0.011
+     }
+   ],
+   "logging_steps": 1,
+   "max_steps": 24,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 0.0,
+   "train_batch_size": 8,
+   "trial_name": null,
+   "trial_params": null
+ }
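
The per-step metrics in `log_history` are easiest to read programmatically; a minimal sketch (assuming the file is saved locally as `trainer_state.json`) that prints the reward curve:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# The final aggregate entry carries no "reward" key, so it is skipped.
for entry in state["log_history"]:
    if "reward" in entry:
        print(entry["step"], round(entry["reward"], 3), round(entry["reward_std"], 3))
```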