Blancy committed on
Commit
817bd58
·
verified ·
1 Parent(s): 0e547ee

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -4
  2. all_results.json +5 -5
  3. train_results.json +5 -5
  4. trainer_state.json +43 -147
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-Math-7B
3
- datasets: DigitalLearningGmbH/MATH-lighteval
4
  library_name: transformers
5
  model_name: Qwen-2.5-7B-Simple-RL
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - grpo
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen-2.5-7B-Simple-RL
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B) on the [DigitalLearningGmbH/MATH-lighteval](https://huggingface.co/datasets/DigitalLearningGmbH/MATH-lighteval) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/3gmilpkh)
33
 
34
 
35
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
2
  base_model: Qwen/Qwen2.5-Math-7B
 
3
  library_name: transformers
4
  model_name: Qwen-2.5-7B-Simple-RL
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - grpo
9
  licence: license
 
11
 
12
  # Model Card for Qwen-2.5-7B-Simple-RL
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/eybgfkwc)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 3.3746,
5
- "train_samples": 7500,
6
- "train_samples_per_second": 2222.49,
7
- "train_steps_per_second": 17.187
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 6.178523520551001e-05,
4
+ "train_runtime": 3794.8453,
5
+ "train_samples": 1000,
6
+ "train_samples_per_second": 0.264,
7
+ "train_steps_per_second": 0.004
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0,
4
- "train_runtime": 3.3746,
5
- "train_samples": 7500,
6
- "train_samples_per_second": 2222.49,
7
- "train_steps_per_second": 17.187
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 6.178523520551001e-05,
4
+ "train_runtime": 3794.8453,
5
+ "train_samples": 1000,
6
+ "train_samples_per_second": 0.264,
7
+ "train_steps_per_second": 0.004
8
  }
trainer_state.json CHANGED
@@ -1,184 +1,80 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9893390191897654,
5
- "eval_steps": 100,
6
- "global_step": 58,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 598.6428833007812,
13
- "epoch": 0.017057569296375266,
14
- "grad_norm": 1.5689994621266787,
15
  "kl": 0.0,
16
- "learning_rate": 5e-07,
17
  "loss": 0.0,
18
- "reward": 0.6093750223517418,
19
- "reward_std": 0.35872430354356766,
20
- "rewards/accuracy_reward": 0.6093750223517418,
21
  "rewards/format_reward": 0.0,
22
  "step": 1
23
  },
24
  {
25
- "completion_length": 598.8225679397583,
26
- "epoch": 0.08528784648187633,
27
- "grad_norm": 7.756384521859563,
28
- "kl": 0.00030493736267089844,
29
- "learning_rate": 2.5e-06,
30
  "loss": 0.0,
31
- "reward": 0.6102120848372579,
32
- "reward_std": 0.3614693288691342,
33
- "rewards/accuracy_reward": 0.6102120848372579,
34
  "rewards/format_reward": 0.0,
35
  "step": 5
36
  },
37
  {
38
- "completion_length": 635.0709037780762,
39
- "epoch": 0.17057569296375266,
40
- "grad_norm": 0.24554990369771146,
41
- "kl": 0.0016298294067382812,
42
- "learning_rate": 2.956412726139078e-06,
43
  "loss": 0.0001,
44
- "reward": 0.7444196678698063,
45
- "reward_std": 0.25730893574655056,
46
- "rewards/accuracy_reward": 0.7444196678698063,
47
  "rewards/format_reward": 0.0,
48
  "step": 10
49
  },
50
  {
51
- "completion_length": 613.8451156616211,
52
- "epoch": 0.255863539445629,
53
- "grad_norm": 0.2932648725811773,
54
- "kl": 0.004305648803710938,
55
- "learning_rate": 2.7836719084521715e-06,
56
- "loss": 0.0002,
57
- "reward": 0.7533482506871223,
58
- "reward_std": 0.22977835088968276,
59
- "rewards/accuracy_reward": 0.7533482506871223,
60
- "rewards/format_reward": 0.0,
61
- "step": 15
62
- },
63
- {
64
- "completion_length": 604.8868591308594,
65
- "epoch": 0.3411513859275053,
66
- "grad_norm": 0.16889654761639042,
67
- "kl": 0.0032588958740234373,
68
- "learning_rate": 2.4946839873611927e-06,
69
- "loss": 0.0001,
70
- "reward": 0.7676339626312256,
71
- "reward_std": 0.20254891095682978,
72
- "rewards/accuracy_reward": 0.7676339626312256,
73
- "rewards/format_reward": 0.0,
74
- "step": 20
75
- },
76
- {
77
- "completion_length": 599.3297142028808,
78
- "epoch": 0.42643923240938164,
79
- "grad_norm": 0.14357212406142408,
80
- "kl": 9.20305938720703,
81
- "learning_rate": 2.1156192081791355e-06,
82
- "loss": 0.3681,
83
- "reward": 0.7622768104076385,
84
- "reward_std": 0.19452331885695456,
85
- "rewards/accuracy_reward": 0.7622768104076385,
86
- "rewards/format_reward": 0.0,
87
- "step": 25
88
- },
89
- {
90
- "completion_length": 596.0473510742188,
91
- "epoch": 0.511727078891258,
92
- "grad_norm": 0.20095541353253643,
93
- "kl": 0.002848052978515625,
94
- "learning_rate": 1.6808050203829845e-06,
95
- "loss": 0.0001,
96
- "reward": 0.7611607506871223,
97
- "reward_std": 0.16808430003002287,
98
- "rewards/accuracy_reward": 0.7611607506871223,
99
- "rewards/format_reward": 0.0,
100
- "step": 30
101
- },
102
- {
103
- "completion_length": 584.6814987182618,
104
- "epoch": 0.5970149253731343,
105
- "grad_norm": 0.0743109219899615,
106
- "kl": 0.003064537048339844,
107
- "learning_rate": 1.2296174432791415e-06,
108
- "loss": 0.0001,
109
- "reward": 0.7562500342726708,
110
- "reward_std": 0.16550147179514169,
111
- "rewards/accuracy_reward": 0.7562500342726708,
112
- "rewards/format_reward": 0.0,
113
- "step": 35
114
- },
115
- {
116
- "completion_length": 574.5002449035644,
117
- "epoch": 0.6823027718550106,
118
- "grad_norm": 0.06794240162987708,
119
- "kl": 0.003296661376953125,
120
- "learning_rate": 8.029152419343472e-07,
121
- "loss": 0.0001,
122
- "reward": 0.7834821805357933,
123
- "reward_std": 0.15637031104415655,
124
- "rewards/accuracy_reward": 0.7834821805357933,
125
- "rewards/format_reward": 0.0,
126
- "step": 40
127
- },
128
- {
129
- "completion_length": 589.9893081665039,
130
- "epoch": 0.767590618336887,
131
- "grad_norm": 0.08693057149783366,
132
- "kl": 0.0029300689697265626,
133
- "learning_rate": 4.3933982822017883e-07,
134
- "loss": 0.0001,
135
- "reward": 0.7680803880095481,
136
- "reward_std": 0.17124480060301722,
137
- "rewards/accuracy_reward": 0.7680803880095481,
138
- "rewards/format_reward": 0.0,
139
- "step": 45
140
- },
141
- {
142
- "completion_length": 587.243330001831,
143
- "epoch": 0.8528784648187633,
144
- "grad_norm": 0.1426044961551274,
145
- "kl": 0.0030112266540527344,
146
- "learning_rate": 1.718159615201853e-07,
147
- "loss": 0.0001,
148
- "reward": 0.7603236939758062,
149
- "reward_std": 0.1613232684903778,
150
- "rewards/accuracy_reward": 0.7603236939758062,
151
- "rewards/format_reward": 0.0,
152
- "step": 50
153
- },
154
- {
155
- "completion_length": 583.5995719909668,
156
- "epoch": 0.9381663113006397,
157
- "grad_norm": 0.11866102279196994,
158
- "kl": 0.003066253662109375,
159
- "learning_rate": 2.4570139579284723e-08,
160
  "loss": 0.0001,
161
- "reward": 0.7906250342726707,
162
- "reward_std": 0.17242762465029954,
163
- "rewards/accuracy_reward": 0.7906250342726707,
164
  "rewards/format_reward": 0.0,
165
- "step": 55
166
  },
167
  {
168
- "epoch": 0.9893390191897654,
169
- "step": 58,
170
  "total_flos": 0.0,
171
- "train_loss": 0.0,
172
- "train_runtime": 3.3746,
173
- "train_samples_per_second": 2222.49,
174
- "train_steps_per_second": 17.187
175
  }
176
  ],
177
  "logging_steps": 5,
178
- "max_steps": 58,
179
  "num_input_tokens_seen": 0,
180
  "num_train_epochs": 1,
181
- "save_steps": 1,
182
  "stateful_callbacks": {
183
  "TrainerControl": {
184
  "args": {
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.96,
5
+ "eval_steps": 500,
6
+ "global_step": 15,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 752.8906402587891,
13
+ "epoch": 0.064,
14
+ "grad_norm": 0.8492731776446195,
15
  "kl": 0.0,
16
+ "learning_rate": 1.5e-06,
17
  "loss": 0.0,
18
+ "reward": 0.5052083432674408,
19
+ "reward_std": 0.0497216647490859,
20
+ "rewards/accuracy_reward": 0.5052083432674408,
21
  "rewards/format_reward": 0.0,
22
  "step": 1
23
  },
24
  {
25
+ "completion_length": 777.4062652587891,
26
+ "epoch": 0.32,
27
+ "grad_norm": 0.10438261848562716,
28
+ "kl": 0.000284731388092041,
29
+ "learning_rate": 2.6227661222566517e-06,
30
  "loss": 0.0,
31
+ "reward": 0.5052083432674408,
32
+ "reward_std": 0.03829827485606074,
33
+ "rewards/accuracy_reward": 0.5052083432674408,
34
  "rewards/format_reward": 0.0,
35
  "step": 5
36
  },
37
  {
38
+ "completion_length": 765.1005416870117,
39
+ "epoch": 0.64,
40
+ "grad_norm": 0.9738634307537658,
41
+ "kl": 0.0014707565307617188,
42
+ "learning_rate": 9.680926694361964e-07,
43
  "loss": 0.0001,
44
+ "reward": 0.47135417237877847,
45
+ "reward_std": 0.03448774488642812,
46
+ "rewards/accuracy_reward": 0.47135417237877847,
47
  "rewards/format_reward": 0.0,
48
  "step": 10
49
  },
50
  {
51
+ "completion_length": 820.3849151611328,
52
+ "epoch": 0.96,
53
+ "grad_norm": 0.16045206350580687,
54
+ "kl": 0.0029309749603271484,
55
+ "learning_rate": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  "loss": 0.0001,
57
+ "reward": 0.5114583395421505,
58
+ "reward_std": 0.04463785570114851,
59
+ "rewards/accuracy_reward": 0.5114583395421505,
60
  "rewards/format_reward": 0.0,
61
+ "step": 15
62
  },
63
  {
64
+ "epoch": 0.96,
65
+ "step": 15,
66
  "total_flos": 0.0,
67
+ "train_loss": 6.178523520551001e-05,
68
+ "train_runtime": 3794.8453,
69
+ "train_samples_per_second": 0.264,
70
+ "train_steps_per_second": 0.004
71
  }
72
  ],
73
  "logging_steps": 5,
74
+ "max_steps": 15,
75
  "num_input_tokens_seen": 0,
76
  "num_train_epochs": 1,
77
+ "save_steps": 10,
78
  "stateful_callbacks": {
79
  "TrainerControl": {
80
  "args": {