Blancy committed on
Commit
bbe1200
·
verified ·
1 Parent(s): 038f247

Model save

Browse files
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-Math-7B
3
- datasets: Blancy/secondfiltered-math220k-difficulty_stratified_8k
4
  library_name: transformers
5
  model_name: Qwen-2.5-7B-Simple-RL
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - grpo
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen-2.5-7B-Simple-RL
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B) on the [Blancy/secondfiltered-math220k-difficulty_stratified_8k](https://huggingface.co/datasets/Blancy/secondfiltered-math220k-difficulty_stratified_8k) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/hp2j65o4)
33
 
34
 
35
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
2
  base_model: Qwen/Qwen2.5-Math-7B
 
3
  library_name: transformers
4
  model_name: Qwen-2.5-7B-Simple-RL
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - grpo
9
  licence: license
 
11
 
12
  # Model Card for Qwen-2.5-7B-Simple-RL
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/3y76zskm)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0010395098450702472,
4
- "train_runtime": 14559.9882,
5
- "train_samples": 7500,
6
- "train_samples_per_second": 0.515,
7
  "train_steps_per_second": 0.004
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0006319970209619013,
4
+ "train_runtime": 19204.1352,
5
+ "train_samples": 10000,
6
+ "train_samples_per_second": 0.521,
7
  "train_steps_per_second": 0.004
8
  }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": false,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0",
26
- "use_cache": true,
27
  "use_mrope": false,
28
  "use_sliding_window": false,
29
  "vocab_size": 152064
 
23
  "tie_word_embeddings": false,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0",
26
+ "use_cache": false,
27
  "use_mrope": false,
28
  "use_sliding_window": false,
29
  "vocab_size": 152064
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a1eb6e5679e4d58dd28e87c12fa85b41e6f8649886af1b00e5e560fe9d132ea
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aae161abbf29198cd3b2b48df0e92aff65899add1650eeb5c8261fda9ba83786
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95cd0d8bf08fa8ef084bd405054d1ec4dfa7a3e1b856c6f80bcbdb25a01118a8
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ced947bddeba4477ce8a9505cc27cf7c5032a3c7f427145b4d038f68294741e
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00f846aafa78bf493ca9d4cb53df3c719240a90c7046ca60255a7bf169ba1165
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3daf3c87a42981f743aa78e8696d12f363f0f22d7ac8f722767b14bc04d4267
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:621840d94a3e10c3fd2b1d1227690bb2b13530236ad68b97eabacb16ccb7cd2c
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:989a70e4f95e48a2b61859b8addca060fd1890cdf613172d4a3ce1820ad50e90
3
  size 1089994880
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0010395098450702472,
4
- "train_runtime": 14559.9882,
5
- "train_samples": 7500,
6
- "train_samples_per_second": 0.515,
7
  "train_steps_per_second": 0.004
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0006319970209619013,
4
+ "train_runtime": 19204.1352,
5
+ "train_samples": 10000,
6
+ "train_samples_per_second": 0.521,
7
  "train_steps_per_second": 0.004
8
  }
trainer_state.json CHANGED
@@ -1,187 +1,239 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9893390191897654,
5
  "eval_steps": 100,
6
- "global_step": 58,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 743.5234680175781,
13
- "epoch": 0.017057569296375266,
14
- "grad_norm": 4.396385192871094,
15
  "kl": 0.0,
16
- "learning_rate": 5e-07,
17
  "loss": 0.0,
18
- "reward": 0.026785715366713703,
19
- "reward_std": 0.04944828595034778,
20
- "rewards/accuracy_reward": 0.026785715366713703,
21
  "rewards/format_reward": 0.0,
22
  "step": 1
23
  },
24
  {
25
- "completion_length": 735.4506530761719,
26
- "epoch": 0.08528784648187633,
27
- "grad_norm": 0.44695502519607544,
28
- "kl": 0.0003033876419067383,
29
- "learning_rate": 2.5e-06,
30
  "loss": 0.0,
31
- "reward": 0.032924108527367935,
32
- "reward_std": 0.05200784042244777,
33
- "rewards/accuracy_reward": 0.032924108527367935,
34
  "rewards/format_reward": 0.0,
35
  "step": 5
36
  },
37
  {
38
- "completion_length": 745.6826187133789,
39
- "epoch": 0.17057569296375266,
40
- "grad_norm": 0.509334921836853,
41
- "kl": 0.001618671417236328,
42
- "learning_rate": 2.956412726139078e-06,
43
- "loss": 0.0001,
44
- "reward": 0.04508928821887821,
45
- "reward_std": 0.07083694078028202,
46
- "rewards/accuracy_reward": 0.04508928821887821,
47
  "rewards/format_reward": 0.0,
48
  "step": 10
49
  },
50
  {
51
- "completion_length": 755.3701187133789,
52
- "epoch": 0.255863539445629,
53
- "grad_norm": 0.7283040285110474,
54
- "kl": 0.00929851531982422,
55
- "learning_rate": 2.7836719084521715e-06,
56
  "loss": 0.0004,
57
- "reward": 0.061830359417945145,
58
- "reward_std": 0.08021324193105102,
59
- "rewards/accuracy_reward": 0.061830359417945145,
60
  "rewards/format_reward": 0.0,
61
  "step": 15
62
  },
63
  {
64
- "completion_length": 759.1250411987305,
65
- "epoch": 0.3411513859275053,
66
- "grad_norm": 0.9553681015968323,
67
- "kl": 0.011894989013671874,
68
- "learning_rate": 2.4946839873611927e-06,
69
- "loss": 0.0005,
70
- "reward": 0.07767857508733869,
71
- "reward_std": 0.09344197846949101,
72
- "rewards/accuracy_reward": 0.07767857508733869,
73
  "rewards/format_reward": 0.0,
74
  "step": 20
75
  },
76
  {
77
- "completion_length": 786.6553939819336,
78
- "epoch": 0.42643923240938164,
79
- "grad_norm": 0.3153317868709564,
80
- "kl": 0.0048351287841796875,
81
- "learning_rate": 2.1156192081791355e-06,
82
- "loss": 0.0002,
83
- "reward": 0.23928572454024105,
84
- "reward_std": 0.2163457485847175,
85
- "rewards/accuracy_reward": 0.23928572454024105,
86
  "rewards/format_reward": 0.0,
87
  "step": 25
88
  },
89
  {
90
- "completion_length": 800.1018218994141,
91
- "epoch": 0.511727078891258,
92
- "grad_norm": 0.23729673027992249,
93
- "kl": 0.005898094177246094,
94
- "learning_rate": 1.6808050203829845e-06,
95
- "loss": 0.0002,
96
- "reward": 0.2857142981141806,
97
- "reward_std": 0.2412072943523526,
98
- "rewards/accuracy_reward": 0.2857142981141806,
99
  "rewards/format_reward": 0.0,
100
  "step": 30
101
  },
102
  {
103
- "completion_length": 809.353158569336,
104
- "epoch": 0.5970149253731343,
105
- "grad_norm": 0.29627376794815063,
106
- "kl": 0.003314781188964844,
107
- "learning_rate": 1.2296174432791415e-06,
108
- "loss": 0.0001,
109
- "reward": 0.3125000149011612,
110
- "reward_std": 0.24806460849940776,
111
- "rewards/accuracy_reward": 0.3125000149011612,
112
  "rewards/format_reward": 0.0,
113
  "step": 35
114
  },
115
  {
116
- "completion_length": 788.3007049560547,
117
- "epoch": 0.6823027718550106,
118
- "grad_norm": 0.11735204607248306,
119
- "kl": 0.003574562072753906,
120
- "learning_rate": 8.029152419343472e-07,
121
- "loss": 0.0001,
122
- "reward": 0.2823660859838128,
123
- "reward_std": 0.20102398535236715,
124
- "rewards/accuracy_reward": 0.2823660859838128,
125
  "rewards/format_reward": 0.0,
126
  "step": 40
127
  },
128
  {
129
- "completion_length": 780.8183395385743,
130
- "epoch": 0.767590618336887,
131
- "grad_norm": 0.9417461156845093,
132
- "kl": 0.0042591094970703125,
133
- "learning_rate": 4.3933982822017883e-07,
134
- "loss": 0.0002,
135
- "reward": 0.2698660843074322,
136
- "reward_std": 0.1852410165593028,
137
- "rewards/accuracy_reward": 0.2698660843074322,
138
  "rewards/format_reward": 0.0,
139
  "step": 45
140
  },
141
  {
142
- "completion_length": 874.1305999755859,
143
- "epoch": 0.8528784648187633,
144
- "grad_norm": NaN,
145
- "kl": NaN,
146
- "learning_rate": 1.718159615201853e-07,
147
- "loss": 0.0103,
148
- "reward": 0.08928571869619191,
149
- "reward_std": 0.0756364663131535,
150
- "rewards/accuracy_reward": 0.08928571869619191,
151
  "rewards/format_reward": 0.0,
152
  "step": 50
153
  },
154
  {
155
- "completion_length": 1024.0,
156
- "epoch": 0.9381663113006397,
157
- "grad_norm": NaN,
158
- "kl": NaN,
159
- "learning_rate": 2.4570139579284723e-08,
160
- "loss": 0.0,
161
- "reward": 0.0,
162
- "reward_std": 0.0,
163
- "rewards/accuracy_reward": 0.0,
164
  "rewards/format_reward": 0.0,
165
  "step": 55
166
  },
167
  {
168
- "completion_length": 1024.0,
169
- "epoch": 0.9893390191897654,
170
- "kl": NaN,
171
- "reward": 0.0,
172
- "reward_std": 0.0,
173
- "rewards/accuracy_reward": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  "rewards/format_reward": 0.0,
175
- "step": 58,
176
  "total_flos": 0.0,
177
- "train_loss": 0.0010395098450702472,
178
- "train_runtime": 14559.9882,
179
- "train_samples_per_second": 0.515,
180
  "train_steps_per_second": 0.004
181
  }
182
  ],
183
  "logging_steps": 5,
184
- "max_steps": 58,
185
  "num_input_tokens_seen": 0,
186
  "num_train_epochs": 1,
187
  "save_steps": 500,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9984,
5
  "eval_steps": 100,
6
+ "global_step": 78,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 746.608283996582,
13
+ "epoch": 0.0128,
14
+ "grad_norm": 0.5155316591262817,
15
  "kl": 0.0,
16
+ "learning_rate": 3.75e-07,
17
  "loss": 0.0,
18
+ "reward": 0.14843750838190317,
19
+ "reward_std": 0.16338601242750883,
20
+ "rewards/accuracy_reward": 0.14843750838190317,
21
  "rewards/format_reward": 0.0,
22
  "step": 1
23
  },
24
  {
25
+ "completion_length": 727.0990829467773,
26
+ "epoch": 0.064,
27
+ "grad_norm": 1.084052324295044,
28
+ "kl": 0.0003186464309692383,
29
+ "learning_rate": 1.875e-06,
30
  "loss": 0.0,
31
+ "reward": 0.1143973259604536,
32
+ "reward_std": 0.12805407610721886,
33
+ "rewards/accuracy_reward": 0.1143973259604536,
34
  "rewards/format_reward": 0.0,
35
  "step": 5
36
  },
37
  {
38
+ "completion_length": 736.6721374511719,
39
+ "epoch": 0.128,
40
+ "grad_norm": 0.36135947704315186,
41
+ "kl": 0.0037824630737304686,
42
+ "learning_rate": 2.993961440992859e-06,
43
+ "loss": 0.0002,
44
+ "reward": 0.1305803638882935,
45
+ "reward_std": 0.12962731290608645,
46
+ "rewards/accuracy_reward": 0.1305803638882935,
47
  "rewards/format_reward": 0.0,
48
  "step": 10
49
  },
50
  {
51
+ "completion_length": 779.8250381469727,
52
+ "epoch": 0.192,
53
+ "grad_norm": 1.732832670211792,
54
+ "kl": 0.010242462158203125,
55
+ "learning_rate": 2.9265847744427307e-06,
56
  "loss": 0.0004,
57
+ "reward": 0.1544642923399806,
58
+ "reward_std": 0.1452927845530212,
59
+ "rewards/accuracy_reward": 0.1544642923399806,
60
  "rewards/format_reward": 0.0,
61
  "step": 15
62
  },
63
  {
64
+ "completion_length": 793.7176681518555,
65
+ "epoch": 0.256,
66
+ "grad_norm": 0.45336416363716125,
67
+ "kl": 0.05513496398925781,
68
+ "learning_rate": 2.7876731904027993e-06,
69
+ "loss": 0.0022,
70
+ "reward": 0.1392857213038951,
71
+ "reward_std": 0.1322711819317192,
72
+ "rewards/accuracy_reward": 0.1392857213038951,
73
  "rewards/format_reward": 0.0,
74
  "step": 20
75
  },
76
  {
77
+ "completion_length": 772.2924423217773,
78
+ "epoch": 0.32,
79
+ "grad_norm": 1.4548314809799194,
80
+ "kl": 0.008623123168945312,
81
+ "learning_rate": 2.584192295741087e-06,
82
+ "loss": 0.0003,
83
+ "reward": 0.17254465222358703,
84
+ "reward_std": 0.1551555698737502,
85
+ "rewards/accuracy_reward": 0.17254465222358703,
86
  "rewards/format_reward": 0.0,
87
  "step": 25
88
  },
89
  {
90
+ "completion_length": 758.8147674560547,
91
+ "epoch": 0.384,
92
+ "grad_norm": 0.43251317739486694,
93
+ "kl": 0.04488029479980469,
94
+ "learning_rate": 2.3263454721781537e-06,
95
+ "loss": 0.0018,
96
+ "reward": 0.18013393748551607,
97
+ "reward_std": 0.15790632385760545,
98
+ "rewards/accuracy_reward": 0.18013393748551607,
99
  "rewards/format_reward": 0.0,
100
  "step": 30
101
  },
102
  {
103
+ "completion_length": 759.0868667602539,
104
+ "epoch": 0.448,
105
+ "grad_norm": 2.017112970352173,
106
+ "kl": 0.01151580810546875,
107
+ "learning_rate": 2.027062236122014e-06,
108
+ "loss": 0.0005,
109
+ "reward": 0.16049108020961284,
110
+ "reward_std": 0.14527264153584837,
111
+ "rewards/accuracy_reward": 0.16049108020961284,
112
  "rewards/format_reward": 0.0,
113
  "step": 35
114
  },
115
  {
116
+ "completion_length": 762.179719543457,
117
+ "epoch": 0.512,
118
+ "grad_norm": 2.426039934158325,
119
+ "kl": 0.0172943115234375,
120
+ "learning_rate": 1.7013498987264833e-06,
121
+ "loss": 0.0007,
122
+ "reward": 0.17991072311997414,
123
+ "reward_std": 0.15640948014333844,
124
+ "rewards/accuracy_reward": 0.17991072311997414,
125
  "rewards/format_reward": 0.0,
126
  "step": 40
127
  },
128
  {
129
+ "completion_length": 754.7252563476562,
130
+ "epoch": 0.576,
131
+ "grad_norm": 1.071414828300476,
132
+ "kl": 0.008959579467773437,
133
+ "learning_rate": 1.3655410366448499e-06,
134
+ "loss": 0.0004,
135
+ "reward": 0.20066965445876123,
136
+ "reward_std": 0.15480793919414282,
137
+ "rewards/accuracy_reward": 0.20066965445876123,
138
  "rewards/format_reward": 0.0,
139
  "step": 45
140
  },
141
  {
142
+ "completion_length": 765.8024856567383,
143
+ "epoch": 0.64,
144
+ "grad_norm": 0.8965884447097778,
145
+ "kl": 0.008967971801757813,
146
+ "learning_rate": 1.036474508437579e-06,
147
+ "loss": 0.0004,
148
+ "reward": 0.18415179559960962,
149
+ "reward_std": 0.17140543824061752,
150
+ "rewards/accuracy_reward": 0.18415179559960962,
151
  "rewards/format_reward": 0.0,
152
  "step": 50
153
  },
154
  {
155
+ "completion_length": 740.766552734375,
156
+ "epoch": 0.704,
157
+ "grad_norm": 27.7662296295166,
158
+ "kl": 0.016036224365234376,
159
+ "learning_rate": 7.30651083891141e-07,
160
+ "loss": 0.0006,
161
+ "reward": 0.1348214357160032,
162
+ "reward_std": 0.1273266337811947,
163
+ "rewards/accuracy_reward": 0.1348214357160032,
164
  "rewards/format_reward": 0.0,
165
  "step": 55
166
  },
167
  {
168
+ "completion_length": 741.5098526000977,
169
+ "epoch": 0.768,
170
+ "grad_norm": 0.12588420510292053,
171
+ "kl": 0.011777877807617188,
172
+ "learning_rate": 4.63406026519703e-07,
173
+ "loss": 0.0005,
174
+ "reward": 0.19330358118750154,
175
+ "reward_std": 0.15862846123054625,
176
+ "rewards/accuracy_reward": 0.19330358118750154,
177
+ "rewards/format_reward": 0.0,
178
+ "step": 60
179
+ },
180
+ {
181
+ "completion_length": 734.3199005126953,
182
+ "epoch": 0.832,
183
+ "grad_norm": 4.302868843078613,
184
+ "kl": 0.07119407653808593,
185
+ "learning_rate": 2.48140119418046e-07,
186
+ "loss": 0.0029,
187
+ "reward": 0.16562500977888703,
188
+ "reward_std": 0.14798311032354833,
189
+ "rewards/accuracy_reward": 0.16562500977888703,
190
+ "rewards/format_reward": 0.0,
191
+ "step": 65
192
+ },
193
+ {
194
+ "completion_length": 737.8087387084961,
195
+ "epoch": 0.896,
196
+ "grad_norm": 1.6815804243087769,
197
+ "kl": 0.00724334716796875,
198
+ "learning_rate": 9.564769404039419e-08,
199
+ "loss": 0.0003,
200
+ "reward": 0.17589286509901286,
201
+ "reward_std": 0.14854512372985482,
202
+ "rewards/accuracy_reward": 0.17589286509901286,
203
+ "rewards/format_reward": 0.0,
204
+ "step": 70
205
+ },
206
+ {
207
+ "completion_length": 740.4969161987304,
208
+ "epoch": 0.96,
209
+ "grad_norm": 0.14792795479297638,
210
+ "kl": 0.034131622314453124,
211
+ "learning_rate": 1.357535734809795e-08,
212
+ "loss": 0.0014,
213
+ "reward": 0.16607143636792898,
214
+ "reward_std": 0.14445759197697045,
215
+ "rewards/accuracy_reward": 0.16607143636792898,
216
+ "rewards/format_reward": 0.0,
217
+ "step": 75
218
+ },
219
+ {
220
+ "completion_length": 740.5067342122396,
221
+ "epoch": 0.9984,
222
+ "kl": 0.02994537353515625,
223
+ "reward": 0.17708334047347307,
224
+ "reward_std": 0.1565772045093278,
225
+ "rewards/accuracy_reward": 0.17708334047347307,
226
  "rewards/format_reward": 0.0,
227
+ "step": 78,
228
  "total_flos": 0.0,
229
+ "train_loss": 0.0006319970209619013,
230
+ "train_runtime": 19204.1352,
231
+ "train_samples_per_second": 0.521,
232
  "train_steps_per_second": 0.004
233
  }
234
  ],
235
  "logging_steps": 5,
236
+ "max_steps": 78,
237
  "num_input_tokens_seen": 0,
238
  "num_train_epochs": 1,
239
  "save_steps": 500,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e015d8ab2e5cb45b3da98676b2843655934f11c70a34a51a48328776a67f59c5
3
  size 7992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87d6b88ba5aa2bf4c81fdcad1bb5a213d7285d62b4a660d9d975034da8f3cab7
3
  size 7992