Blancy
/

Qwen-2.5-7B-Simple-RL

@@ -1,11 +1,9 @@
 ---
 base_model: Qwen/Qwen2.5-Math-7B
-datasets: simplescaling/s1K-1.1
 library_name: transformers
 model_name: Qwen-2.5-7B-Simple-RL
 tags:
 - generated_from_trainer
-- open-r1
 - trl
 - grpo
 licence: license
@@ -13,7 +11,7 @@ licence: license
 # Model Card for Qwen-2.5-7B-Simple-RL
-This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B) on the [simplescaling/s1K-1.1](https://huggingface.co/datasets/simplescaling/s1K-1.1) dataset.
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/eybgfkwc)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

 ---
 base_model: Qwen/Qwen2.5-Math-7B
 library_name: transformers
 model_name: Qwen-2.5-7B-Simple-RL
 tags:
 - generated_from_trainer
 - trl
 - grpo
 licence: license
 # Model Card for Qwen-2.5-7B-Simple-RL
+This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/5pyk4dec)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

all_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 6.178523520551001e-05,
-    "train_runtime": 3794.8453,
     "train_samples": 1000,
-    "train_samples_per_second": 0.264,
-    "train_steps_per_second": 0.004
 }

 {
     "total_flos": 0.0,
+    "train_loss": 0.0006398193630351276,
+    "train_runtime": 3470.882,
     "train_samples": 1000,
+    "train_samples_per_second": 0.288,
+    "train_steps_per_second": 0.002
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 6.178523520551001e-05,
-    "train_runtime": 3794.8453,
     "train_samples": 1000,
-    "train_samples_per_second": 0.264,
-    "train_steps_per_second": 0.004
 }

 {
     "total_flos": 0.0,
+    "train_loss": 0.0006398193630351276,
+    "train_runtime": 3470.882,
     "train_samples": 1000,
+    "train_samples_per_second": 0.288,
+    "train_steps_per_second": 0.002
 }

trainer_state.json CHANGED Viewed

@@ -1,77 +1,57 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.96,
   "eval_steps": 500,
-  "global_step": 15,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "completion_length": 752.8906402587891,
-      "epoch": 0.064,
-      "grad_norm": 0.8492731776446195,
       "kl": 0.0,
-      "learning_rate": 1.5e-06,
       "loss": 0.0,
-      "reward": 0.5052083432674408,
-      "reward_std": 0.0497216647490859,
-      "rewards/accuracy_reward": 0.5052083432674408,
       "rewards/format_reward": 0.0,
       "step": 1
     },
     {
-      "completion_length": 777.4062652587891,
-      "epoch": 0.32,
-      "grad_norm": 0.10438261848562716,
-      "kl": 0.000284731388092041,
-      "learning_rate": 2.6227661222566517e-06,
       "loss": 0.0,
-      "reward": 0.5052083432674408,
-      "reward_std": 0.03829827485606074,
-      "rewards/accuracy_reward": 0.5052083432674408,
       "rewards/format_reward": 0.0,
       "step": 5
     },
     {
-      "completion_length": 765.1005416870117,
-      "epoch": 0.64,
-      "grad_norm": 0.9738634307537658,
-      "kl": 0.0014707565307617188,
-      "learning_rate": 9.680926694361964e-07,
-      "loss": 0.0001,
-      "reward": 0.47135417237877847,
-      "reward_std": 0.03448774488642812,
-      "rewards/accuracy_reward": 0.47135417237877847,
       "rewards/format_reward": 0.0,
-      "step": 10
-    },
-    {
-      "completion_length": 820.3849151611328,
-      "epoch": 0.96,
-      "grad_norm": 0.16045206350580687,
-      "kl": 0.0029309749603271484,
-      "learning_rate": 0.0,
-      "loss": 0.0001,
-      "reward": 0.5114583395421505,
-      "reward_std": 0.04463785570114851,
-      "rewards/accuracy_reward": 0.5114583395421505,
-      "rewards/format_reward": 0.0,
-      "step": 15
-    },
-    {
-      "epoch": 0.96,
-      "step": 15,
       "total_flos": 0.0,
-      "train_loss": 6.178523520551001e-05,
-      "train_runtime": 3794.8453,
-      "train_samples_per_second": 0.264,
-      "train_steps_per_second": 0.004
     }
   ],
   "logging_steps": 5,
-  "max_steps": 15,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 10,

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.896,
   "eval_steps": 500,
+  "global_step": 7,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "completion_length": 753.2539215087891,
+      "epoch": 0.128,
+      "grad_norm": 0.06466772245414092,
       "kl": 0.0,
+      "learning_rate": 3e-06,
       "loss": 0.0,
+      "reward": 0.5299479309469461,
+      "reward_std": 0.04184318264015019,
+      "rewards/accuracy_reward": 0.5299479309469461,
       "rewards/format_reward": 0.0,
       "step": 1
     },
     {
+      "completion_length": 731.6461801528931,
+      "epoch": 0.64,
+      "grad_norm": 0.17592978962322925,
+      "kl": 0.00040525197982788086,
+      "learning_rate": 7.500000000000003e-07,
       "loss": 0.0,
+      "reward": 0.4778645921032876,
+      "reward_std": 0.03562753408914432,
+      "rewards/accuracy_reward": 0.4778645921032876,
       "rewards/format_reward": 0.0,
       "step": 5
     },
     {
+      "completion_length": 752.8470211029053,
+      "epoch": 0.896,
+      "kl": 0.013365030288696289,
+      "reward": 0.485026050824672,
+      "reward_std": 0.047988956212066114,
+      "rewards/accuracy_reward": 0.485026050824672,
       "rewards/format_reward": 0.0,
+      "step": 7,
       "total_flos": 0.0,
+      "train_loss": 0.0006398193630351276,
+      "train_runtime": 3470.882,
+      "train_samples_per_second": 0.288,
+      "train_steps_per_second": 0.002
     }
   ],
   "logging_steps": 5,
+  "max_steps": 7,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 1,
   "save_steps": 10,