Blancy
/

Qwen-2.5-7B-Simple-RL

@@ -1,11 +1,9 @@
 ---
 base_model: Qwen/Qwen2.5-Math-7B
-datasets: simplescaling/s1K-1.1
 library_name: transformers
 model_name: Qwen-2.5-7B-Simple-RL
 tags:
 - generated_from_trainer
-- open-r1
 - trl
 - grpo
 licence: license
@@ -13,7 +11,7 @@ licence: license
 # Model Card for Qwen-2.5-7B-Simple-RL
-This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B) on the [simplescaling/s1K-1.1](https://huggingface.co/datasets/simplescaling/s1K-1.1) dataset.
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/g7cc54zs)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

 ---
 base_model: Qwen/Qwen2.5-Math-7B
 library_name: transformers
 model_name: Qwen-2.5-7B-Simple-RL
 tags:
 - generated_from_trainer
 - trl
 - grpo
 licence: license
 # Model Card for Qwen-2.5-7B-Simple-RL
+This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 ## Quick start
 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/224015062-chinese-university-of-hong-kong-shenzhen/huggingface/runs/xo02xtwc)
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).

all_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.0,
-    "train_runtime": 6.2252,
     "train_samples": 998,
-    "train_samples_per_second": 160.317,
-    "train_steps_per_second": 1.124
 }

 {
     "total_flos": 0.0,
+    "train_loss": 0.0014906150979749583,
+    "train_runtime": 3442.5079,
     "train_samples": 998,
+    "train_samples_per_second": 0.29,
+    "train_steps_per_second": 0.002
 }

train_results.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.0,
-    "train_runtime": 6.2252,
     "train_samples": 998,
-    "train_samples_per_second": 160.317,
-    "train_steps_per_second": 1.124
 }

 {
     "total_flos": 0.0,
+    "train_loss": 0.0014906150979749583,
+    "train_runtime": 3442.5079,
     "train_samples": 998,
+    "train_samples_per_second": 0.29,
+    "train_steps_per_second": 0.002
 }

trainer_state.json CHANGED Viewed

@@ -9,39 +9,45 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "completion_length": 879.5208587646484,
       "epoch": 0.128,
-      "grad_norm": 0.046759604940746734,
       "kl": 0.0,
       "learning_rate": 3e-06,
       "loss": 0.0,
-      "reward": 0.5507812602445483,
-      "reward_std": 0.04920096090063453,
-      "rewards/accuracy_reward": 0.5507812602445483,
       "rewards/format_reward": 0.0,
       "step": 1
     },
     {
-      "completion_length": 865.0045747756958,
       "epoch": 0.64,
-      "grad_norm": 0.25873392886207724,
-      "kl": 0.0004495680332183838,
       "learning_rate": 7.500000000000003e-07,
       "loss": 0.0,
-      "reward": 0.5003255287592765,
-      "reward_std": 0.038605744659435004,
-      "rewards/accuracy_reward": 0.5003255287592765,
       "rewards/format_reward": 0.0,
       "step": 5
     },
     {
       "epoch": 0.896,
       "step": 7,
       "total_flos": 0.0,
-      "train_loss": 0.0,
-      "train_runtime": 6.2252,
-      "train_samples_per_second": 160.317,
-      "train_steps_per_second": 1.124
     }
   ],
   "logging_steps": 5,

   "is_world_process_zero": true,
   "log_history": [
     {
+      "completion_length": 825.5468940734863,
       "epoch": 0.128,
+      "grad_norm": 0.047280427810459956,
       "kl": 0.0,
       "learning_rate": 3e-06,
       "loss": 0.0,
+      "reward": 0.5377604253590107,
+      "reward_std": 0.04342227545566857,
+      "rewards/accuracy_reward": 0.5377604253590107,
       "rewards/format_reward": 0.0,
       "step": 1
     },
     {
+      "completion_length": 817.2832221984863,
       "epoch": 0.64,
+      "grad_norm": 0.10830728111056658,
+      "kl": 0.0005020499229431152,
       "learning_rate": 7.500000000000003e-07,
       "loss": 0.0,
+      "reward": 0.4973958403279539,
+      "reward_std": 0.03326894948258996,
+      "rewards/accuracy_reward": 0.4973958403279539,
       "rewards/format_reward": 0.0,
       "step": 5
     },
     {
+      "completion_length": 799.4440307617188,
       "epoch": 0.896,
+      "kl": 0.031336188316345215,
+      "reward": 0.4934895895421505,
+      "reward_std": 0.04497725027613342,
+      "rewards/accuracy_reward": 0.4934895895421505,
+      "rewards/format_reward": 0.0,
       "step": 7,
       "total_flos": 0.0,
+      "train_loss": 0.0014906150979749583,
+      "train_runtime": 3442.5079,
+      "train_samples_per_second": 0.29,
+      "train_steps_per_second": 0.002
     }
   ],
   "logging_steps": 5,