greatxue1 committed on
Commit 7a522e5 · verified · 1 Parent(s): 972fda3

Model save

README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
+ library_name: transformers
+ model_name: DeepSeek-R1-Distill-Qwen-7B-GRPO
+ tags:
+ - generated_from_trainer
+ - trl
+ - grpo
+ licence: license
+ ---
+
+ # Model Card for DeepSeek-R1-Distill-Qwen-7B-GRPO
+
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-7B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+
+ ## Quick start
+
+ ```python
+ from transformers import pipeline
+
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="greatxue1/DeepSeek-R1-Distill-Qwen-7B-GRPO", device="cuda")
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
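+
+ The same call without the pipeline wrapper, as a minimal sketch (reusing `question` from above; `device_map="auto"` assumes accelerate is installed):
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_id = "greatxue1/DeepSeek-R1-Distill-Qwen-7B-GRPO"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")
+
+ # Build the prompt via the chat template, generate, and decode only the new tokens.
+ inputs = tokenizer.apply_chat_template(
+     [{"role": "user", "content": question}],
+     add_generation_prompt=True,
+     return_tensors="pt",
+ ).to(model.device)
+ outputs = model.generate(inputs, max_new_tokens=128)
+ print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
+ ```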
+
+ ## Training procedure
+
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zhongkaixue-university-of-oxford/huggingface/runs/vamiu0yb)
+
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
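+
+ At GRPO's core is a group-relative advantage: several completions are sampled per prompt, and each completion's reward is normalized against its group's statistics, so no learned value model is needed. A minimal sketch of that normalization (illustrative only, not the TRL implementation):
+
+ ```python
+ import statistics
+
+ def group_relative_advantages(rewards, eps=1e-4):
+     """Normalize each reward by its group's mean and std, GRPO-style."""
+     mean = statistics.mean(rewards)
+     std = statistics.pstdev(rewards)
+     return [(r - mean) / (std + eps) for r in rewards]
+
+ # One group of G=4 completions sampled for a single prompt:
+ print(group_relative_advantages([1.0, 0.5, 2.0, 0.5]))
+ ```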
+
+ ### Framework versions
+
+ - TRL: 0.15.2
+ - Transformers: 4.49.0
+ - Pytorch: 2.5.1+cu121
+ - Datasets: 3.0.2
+ - Tokenizers: 0.21.0
+
+ ## Citations
+
+ Cite GRPO as:
+
+ ```bibtex
+ @article{zhihong2024deepseekmath,
+     title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+     author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+     year = 2024,
+     eprint = {arXiv:2402.03300},
+ }
+ ```
+
+ Cite TRL as:
+
+ ```bibtex
+ @misc{vonwerra2022trl,
+     title = {{TRL: Transformer Reinforcement Learning}},
+     author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+     year = 2020,
+     journal = {GitHub repository},
+     publisher = {GitHub},
+     howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "total_flos": 0.0,
+     "train_loss": 0.0009309215234963494,
+     "train_runtime": 2265.8224,
+     "train_samples": 781,
+     "train_samples_per_second": 0.345,
+     "train_steps_per_second": 0.011
+ }
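
The throughput figures are consistent with the runtime; a quick check, assuming they are simply the sample and step counts divided by wall-clock seconds:

```python
train_runtime = 2265.8224  # seconds, from all_results.json

# 781 training samples and 24 optimizer steps over the run:
print(round(781 / train_runtime, 3))  # 0.345 -> train_samples_per_second
print(round(24 / train_runtime, 3))   # 0.011 -> train_steps_per_second
```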
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 151646,
+   "do_sample": true,
+   "eos_token_id": 151643,
+   "temperature": 0.6,
+   "top_p": 0.95,
+   "transformers_version": "4.49.0"
+ }
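
These are the sampling defaults `generate` applies when no overrides are passed (sampling on, temperature 0.6, top-p 0.95, matching DeepSeek-R1's recommended settings). A minimal sketch for inspecting them from the Hub:

```python
from transformers import GenerationConfig

# Fetch the saved generation defaults for this checkpoint.
gen_config = GenerationConfig.from_pretrained("greatxue1/DeepSeek-R1-Distill-Qwen-7B-GRPO")
print(gen_config.do_sample, gen_config.temperature, gen_config.top_p)
```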
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "total_flos": 0.0,
+     "train_loss": 0.0009309215234963494,
+     "train_runtime": 2265.8224,
+     "train_samples": 781,
+     "train_samples_per_second": 0.345,
+     "train_steps_per_second": 0.011
+ }
trainer_state.json ADDED
@@ -0,0 +1,378 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.9795918367346939,
+   "eval_steps": 500,
+   "global_step": 24,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "completion_length": 394.7544860839844,
+       "epoch": 0.04081632653061224,
+       "grad_norm": 0.5584128667460793,
+       "kl": 0.0,
+       "learning_rate": 1.6666666666666667e-06,
+       "loss": 0.0001,
+       "reward": 1.335937574505806,
+       "reward_std": 0.6638279929757118,
+       "rewards/format_reward": 0.09375000419095159,
+       "rewards/instruction_follow_reward": 0.2232143022119999,
+       "rewards/tag_count_reward": 0.572544664144516,
+       "step": 1
+     },
+     {
+       "completion_length": 407.3616256713867,
+       "epoch": 0.08163265306122448,
+       "grad_norm": 0.5809418852960418,
+       "kl": 0.0,
+       "learning_rate": 3.3333333333333333e-06,
+       "loss": 0.0,
+       "reward": 1.359151840209961,
+       "reward_std": 0.7929508537054062,
+       "rewards/format_reward": 0.10267857555299997,
+       "rewards/instruction_follow_reward": 0.23392858356237411,
+       "rewards/tag_count_reward": 0.5546875149011612,
+       "step": 2
+     },
+     {
+       "completion_length": 425.6607360839844,
+       "epoch": 0.12244897959183673,
+       "grad_norm": 0.5488272016876289,
+       "kl": 0.0003504753112792969,
+       "learning_rate": 5e-06,
+       "loss": 0.0,
+       "reward": 1.3707590103149414,
+       "reward_std": 0.8819468468427658,
+       "rewards/format_reward": 0.11160714738070965,
+       "rewards/instruction_follow_reward": 0.23184525407850742,
+       "rewards/tag_count_reward": 0.5636160969734192,
+       "step": 3
+     },
+     {
+       "completion_length": 394.3437728881836,
+       "epoch": 0.16326530612244897,
+       "grad_norm": 0.6044636536170302,
+       "kl": 0.001239776611328125,
+       "learning_rate": 4.97486935900654e-06,
+       "loss": 0.0001,
+       "reward": 1.467633992433548,
+       "reward_std": 0.8601260483264923,
+       "rewards/format_reward": 0.19642858393490314,
+       "rewards/instruction_follow_reward": 0.2220982275903225,
+       "rewards/tag_count_reward": 0.6049107313156128,
+       "step": 4
+     },
+     {
+       "completion_length": 416.5223388671875,
+       "epoch": 0.20408163265306123,
+       "grad_norm": 0.5931555061780838,
+       "kl": 0.01104736328125,
+       "learning_rate": 4.900038813018817e-06,
+       "loss": 0.0004,
+       "reward": 1.9301143288612366,
+       "reward_std": 0.8981596827507019,
+       "rewards/format_reward": 0.5133928880095482,
+       "rewards/instruction_follow_reward": 0.21628808975219727,
+       "rewards/tag_count_reward": 0.767857164144516,
+       "step": 5
+     },
+     {
+       "completion_length": 420.4241256713867,
+       "epoch": 0.24489795918367346,
+       "grad_norm": 0.6761697930052907,
+       "kl": 0.03021240234375,
+       "learning_rate": 4.777179952780443e-06,
+       "loss": 0.0012,
+       "reward": 2.2354912161827087,
+       "reward_std": 0.8338466733694077,
+       "rewards/format_reward": 0.7232143133878708,
+       "rewards/instruction_follow_reward": 0.2187500111758709,
+       "rewards/tag_count_reward": 0.856026828289032,
+       "step": 6
+     },
+     {
+       "completion_length": 394.3393096923828,
+       "epoch": 0.2857142857142857,
+       "grad_norm": 2.459773044170621,
+       "kl": 0.13006591796875,
+       "learning_rate": 4.609037242210989e-06,
+       "loss": 0.0052,
+       "reward": 2.178348273038864,
+       "reward_std": 1.000592678785324,
+       "rewards/format_reward": 0.7544643133878708,
+       "rewards/instruction_follow_reward": 0.18854167126119137,
+       "rewards/tag_count_reward": 0.8582589477300644,
+       "step": 7
+     },
+     {
+       "completion_length": 365.56697845458984,
+       "epoch": 0.32653061224489793,
+       "grad_norm": 0.6203686756559748,
+       "kl": 0.0291900634765625,
+       "learning_rate": 4.39936671161711e-06,
+       "loss": 0.0012,
+       "reward": 2.3113840222358704,
+       "reward_std": 0.9184626936912537,
+       "rewards/format_reward": 0.7098214477300644,
+       "rewards/instruction_follow_reward": 0.2444196529686451,
+       "rewards/tag_count_reward": 0.8683035969734192,
+       "step": 8
+     },
+     {
+       "completion_length": 381.06251525878906,
+       "epoch": 0.3673469387755102,
+       "grad_norm": 0.5837189440797563,
+       "kl": 0.016021728515625,
+       "learning_rate": 4.152852054182151e-06,
+       "loss": 0.0006,
+       "reward": 2.10881707072258,
+       "reward_std": 0.9214754402637482,
+       "rewards/format_reward": 0.7008928954601288,
+       "rewards/instruction_follow_reward": 0.1835937611758709,
+       "rewards/tag_count_reward": 0.8571428954601288,
+       "step": 9
+     },
+     {
+       "completion_length": 352.6785888671875,
+       "epoch": 0.40816326530612246,
+       "grad_norm": 0.6626187128387949,
+       "kl": 0.0197906494140625,
+       "learning_rate": 3.875e-06,
+       "loss": 0.0008,
+       "reward": 2.1169643998146057,
+       "reward_std": 0.8091428875923157,
+       "rewards/format_reward": 0.6607143133878708,
+       "rewards/instruction_follow_reward": 0.1985863298177719,
+       "rewards/tag_count_reward": 0.8604911118745804,
+       "step": 10
+     },
+     {
+       "completion_length": 369.41966247558594,
+       "epoch": 0.4489795918367347,
+       "grad_norm": 0.6348377712692719,
+       "kl": 0.0179595947265625,
+       "learning_rate": 3.5720173048243896e-06,
+       "loss": 0.0007,
+       "reward": 2.5970983505249023,
+       "reward_std": 1.0680624097585678,
+       "rewards/format_reward": 0.6026786118745804,
+       "rewards/instruction_follow_reward": 0.388392873108387,
+       "rewards/tag_count_reward": 0.8292410969734192,
+       "step": 11
+     },
+     {
+       "completion_length": 375.0268020629883,
+       "epoch": 0.4897959183673469,
+       "grad_norm": 0.6179040419604938,
+       "kl": 0.0157623291015625,
+       "learning_rate": 3.2506721014017075e-06,
+       "loss": 0.0006,
+       "reward": 2.2924107909202576,
+       "reward_std": 1.0153658390045166,
+       "rewards/format_reward": 0.6071428805589676,
+       "rewards/instruction_follow_reward": 0.2786458507180214,
+       "rewards/tag_count_reward": 0.84933041036129,
+       "step": 12
+     },
+     {
+       "completion_length": 381.9241256713867,
+       "epoch": 0.5306122448979592,
+       "grad_norm": 0.6671138772835987,
+       "kl": 0.03009033203125,
+       "learning_rate": 2.918142710569455e-06,
+       "loss": 0.0012,
+       "reward": 2.4388394355773926,
+       "reward_std": 1.0967597514390945,
+       "rewards/format_reward": 0.6205357313156128,
+       "rewards/instruction_follow_reward": 0.32633931189775467,
+       "rewards/tag_count_reward": 0.8392857313156128,
+       "step": 13
+     },
+     {
+       "completion_length": 363.4107208251953,
+       "epoch": 0.5714285714285714,
+       "grad_norm": 0.6026307189636135,
+       "kl": 0.017669677734375,
+       "learning_rate": 2.5818572894305453e-06,
+       "loss": 0.0007,
+       "reward": 2.404017984867096,
+       "reward_std": 1.1934520304203033,
+       "rewards/format_reward": 0.5714285969734192,
+       "rewards/instruction_follow_reward": 0.3214285895228386,
+       "rewards/tag_count_reward": 0.8683036118745804,
+       "step": 14
+     },
+     {
+       "completion_length": 424.05358123779297,
+       "epoch": 0.6122448979591837,
+       "grad_norm": 0.5671014284506704,
+       "kl": 0.0147705078125,
+       "learning_rate": 2.2493278985982932e-06,
+       "loss": 0.0006,
+       "reward": 2.2103636860847473,
+       "reward_std": 1.0440057069063187,
+       "rewards/format_reward": 0.651785746216774,
+       "rewards/instruction_follow_reward": 0.2267431989312172,
+       "rewards/tag_count_reward": 0.8783482760190964,
+       "step": 15
+     },
+     {
+       "completion_length": 361.37947845458984,
+       "epoch": 0.6530612244897959,
+       "grad_norm": 0.6208470872600845,
+       "kl": 0.019134521484375,
+       "learning_rate": 1.9279826951756115e-06,
+       "loss": 0.0008,
+       "reward": 2.465401828289032,
+       "reward_std": 1.0677553862333298,
+       "rewards/format_reward": 0.6964285969734192,
+       "rewards/instruction_follow_reward": 0.2924107313156128,
+       "rewards/tag_count_reward": 0.8917411118745804,
+       "step": 16
+     },
+     {
+       "completion_length": 367.0848388671875,
+       "epoch": 0.6938775510204082,
+       "grad_norm": 0.6513303765560159,
+       "kl": 0.0290679931640625,
+       "learning_rate": 1.6250000000000007e-06,
+       "loss": 0.0012,
+       "reward": 2.10491082072258,
+       "reward_std": 0.9648873805999756,
+       "rewards/format_reward": 0.6339286118745804,
+       "rewards/instruction_follow_reward": 0.19642857927829027,
+       "rewards/tag_count_reward": 0.8816964775323868,
+       "step": 17
+     },
+     {
+       "completion_length": 378.62501525878906,
+       "epoch": 0.7346938775510204,
+       "grad_norm": 0.5707111135110878,
+       "kl": 0.0172271728515625,
+       "learning_rate": 1.3471479458178499e-06,
+       "loss": 0.0007,
+       "reward": 2.5422155261039734,
+       "reward_std": 0.8247152641415596,
+       "rewards/format_reward": 0.7812500298023224,
+       "rewards/instruction_follow_reward": 0.2793247886002064,
+       "rewards/tag_count_reward": 0.9229911118745804,
+       "step": 18
+     },
+     {
+       "completion_length": 354.74108123779297,
+       "epoch": 0.7755102040816326,
+       "grad_norm": 0.5544484847237519,
+       "kl": 0.023040771484375,
+       "learning_rate": 1.1006332883828912e-06,
+       "loss": 0.0009,
+       "reward": 2.662946581840515,
+       "reward_std": 1.0467701256275177,
+       "rewards/format_reward": 0.7366071790456772,
+       "rewards/instruction_follow_reward": 0.3437500149011612,
+       "rewards/tag_count_reward": 0.895089328289032,
+       "step": 19
+     },
+     {
+       "completion_length": 383.00447845458984,
+       "epoch": 0.8163265306122449,
+       "grad_norm": 0.510823042158658,
+       "kl": 0.0205535888671875,
+       "learning_rate": 8.909627577890121e-07,
+       "loss": 0.0008,
+       "reward": 2.3368303775787354,
+       "reward_std": 0.9361487179994583,
+       "rewards/format_reward": 0.7633928954601288,
+       "rewards/instruction_follow_reward": 0.22500000894069672,
+       "rewards/tag_count_reward": 0.8984375596046448,
+       "step": 20
+     },
+     {
+       "completion_length": 357.4151916503906,
+       "epoch": 0.8571428571428571,
+       "grad_norm": 0.5515191742215413,
+       "kl": 0.0199737548828125,
+       "learning_rate": 7.228200472195574e-07,
+       "loss": 0.0008,
+       "reward": 2.494419753551483,
+       "reward_std": 0.9745698273181915,
+       "rewards/format_reward": 0.7767857611179352,
+       "rewards/instruction_follow_reward": 0.2700892984867096,
+       "rewards/tag_count_reward": 0.9073661118745804,
+       "step": 21
+     },
+     {
+       "completion_length": 352.87947845458984,
+       "epoch": 0.8979591836734694,
+       "grad_norm": 0.6878725311389932,
+       "kl": 0.036224365234375,
+       "learning_rate": 5.999611869811834e-07,
+       "loss": 0.0014,
+       "reward": 2.4720982909202576,
+       "reward_std": 0.9217582643032074,
+       "rewards/format_reward": 0.7276785969734192,
+       "rewards/instruction_follow_reward": 0.2857142984867096,
+       "rewards/tag_count_reward": 0.887276828289032,
+       "step": 22
+     },
+     {
+       "completion_length": 373.3035888671875,
+       "epoch": 0.9387755102040817,
+       "grad_norm": 0.5789373606479209,
+       "kl": 0.02337646484375,
+       "learning_rate": 5.251306409934609e-07,
+       "loss": 0.0009,
+       "reward": 2.023437589406967,
+       "reward_std": 0.9313821792602539,
+       "rewards/format_reward": 0.7500000149011612,
+       "rewards/instruction_follow_reward": 0.12946429289877415,
+       "rewards/tag_count_reward": 0.8850446790456772,
+       "step": 23
+     },
+     {
+       "completion_length": 393.03572845458984,
+       "epoch": 0.9795918367346939,
+       "grad_norm": 0.59441068526159,
+       "kl": 0.03009033203125,
+       "learning_rate": 5.000000000000001e-07,
+       "loss": 0.0012,
+       "reward": 2.5097524523735046,
+       "reward_std": 0.8981894552707672,
+       "rewards/format_reward": 0.7857143133878708,
+       "rewards/instruction_follow_reward": 0.2729680556803942,
+       "rewards/tag_count_reward": 0.9051339775323868,
+       "step": 24
+     },
+     {
+       "epoch": 0.9795918367346939,
+       "step": 24,
+       "total_flos": 0.0,
+       "train_loss": 0.0009309215234963494,
+       "train_runtime": 2265.8224,
+       "train_samples_per_second": 0.345,
+       "train_steps_per_second": 0.011
+     }
+   ],
+   "logging_steps": 1,
+   "max_steps": 24,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 0.0,
+   "train_batch_size": 8,
+   "trial_name": null,
+   "trial_params": null
+ }
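
The per-step metrics in `log_history` are easiest to read programmatically; a minimal sketch (assuming the file is saved locally as `trainer_state.json`) that prints the reward curve:

```python
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# The final aggregate entry carries no "reward" key, so it is skipped.
for entry in state["log_history"]:
    if "reward" in entry:
        print(entry["step"], round(entry["reward"], 3), round(entry["reward_std"], 3))
```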