Update app.py
app.py
CHANGED
@@ -21,41 +21,39 @@ from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
 # ZeroGPU + QLoRA Example
 ##############################################################################
 
-TEXT_PIPELINE = None
-COMPARISON_PIPELINE = None  #
-NUM_EXAMPLES = 50  # We'll train on 50 lines (or rows) for demonstration
+TEXT_PIPELINE = None        # Pipeline for wuhp/myr1 (fine-tuned or base)
+COMPARISON_PIPELINE = None  # Pipeline for the DeepSeek model
 
-
+NUM_EXAMPLES = 50  # We'll train on 50 rows for demonstration
+
+@spaces.GPU(duration=300)  # up to 5 min
 def finetune_small_subset():
     """
     1) Loads 'wuhp/myr1' in 4-bit quantization (QLoRA style),
     2) Adds LoRA adapters (trainable),
-    3) Trains on a small subset of Magpie
+    3) Trains on a small subset of the Magpie dataset,
     4) Saves LoRA adapter to 'finetuned_myr1',
     5) Reloads LoRA adapters for inference in a pipeline.
     """
 
-    # --- 1) Load Magpie dataset ---
-    # You can load 'train' or 'validation' split depending on your preference
+    # --- 1) Load a small subset of the Magpie dataset ---
     ds = load_dataset(
         "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B",
         split="train"
     )
 
-    #
-    # (Alternatively, just do ds.select(range(...)) for a small random subset.)
-    # We'll demonstrate filtering for the first conversation_id:
+    # For demonstration, pick a single conversation_id
     unique_ids = list(set(ds["conversation_id"]))
     single_id = unique_ids[0]
    ds = ds.filter(lambda x: x["conversation_id"] == single_id)
 
-    #
+    # Then select only NUM_EXAMPLES from that subset
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))
 
     # --- 2) Setup 4-bit quantization with BitsAndBytes ---
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16
+        bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",
     )
@@ -75,12 +73,12 @@ def finetune_small_subset():
         "wuhp/myr1",
         subfolder="myr1",
         config=config,
-        quantization_config=bnb_config,
+        quantization_config=bnb_config,  # <--- QLoRA 4-bit
         device_map="auto",
         trust_remote_code=True
     )
 
-    # Prepare the model for k-bit training
+    # Prepare the model for k-bit training
     base_model = prepare_model_for_kbit_training(base_model)
 
     # --- 3) Create LoRA config & wrap the base model in LoRA ---
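The hunk above ends at the `# --- 3) Create LoRA config & wrap the base model in LoRA ---` comment, but the LoRA config itself sits outside the diff context. As a rough sketch only, that step with peft typically looks like the following; the rank, alpha, dropout and target_modules values are assumptions, not the values in app.py:

from peft import LoraConfig, TaskType, get_peft_model

# `base_model` is the 4-bit model prepared above via prepare_model_for_kbit_training.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,          # causal language modeling
    r=16,                                  # LoRA rank (assumed value)
    lora_alpha=32,                         # LoRA scaling (assumed value)
    lora_dropout=0.05,                     # assumed value
    target_modules=["q_proj", "v_proj"],   # assumed attention projections
)
lora_model = get_peft_model(base_model, lora_config)
lora_model.print_trainable_parameters()   # only the adapter weights are trainable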
@@ -97,10 +95,9 @@ def finetune_small_subset():
     # --- 4) Tokenize dataset ---
     def tokenize_fn(ex):
         """
-
-
+        Combine instruction + response into a single text.
+        You can adjust this to include more fields or different formatting.
         """
-        # For demonstration, let's do a short prompt style:
         text = (
             f"Instruction: {ex['instruction']}\n\n"
             f"Response: {ex['response']}"
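The hunk shows only the prompt-formatting part of tokenize_fn; the tokenizer call and label handling lie outside the diff context. A minimal sketch of the rest, assuming a standard causal-LM setup where labels mirror the input ids (the real max length used in app.py is not shown):

def tokenize_fn(ex):
    text = (
        f"Instruction: {ex['instruction']}\n\n"
        f"Response: {ex['response']}"
    )
    tokens = tokenizer(text, truncation=True, max_length=512)  # 512 is an assumed cap
    tokens["labels"] = tokens["input_ids"].copy()              # causal LM: labels = inputs
    return tokens

tokenized_ds = ds.map(tokenize_fn, remove_columns=ds.column_names)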
@@ -119,9 +116,9 @@ def finetune_small_subset():
         per_device_train_batch_size=1,
         gradient_accumulation_steps=2,
         logging_steps=5,
-        save_steps=999999,
+        save_steps=999999,  # effectively don't save mid-epoch
         save_total_limit=1,
-        fp16=False,
+        fp16=False,         # rely on bfloat16 from quantization
     )
 
     # Trainer
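The hunk stops at the `# Trainer` comment. For orientation, the wiring that usually follows a TrainingArguments block like this one is sketched below; the variable names (training_args, tokenized_ds, lora_model) and the collator choice are assumptions rather than the exact code in app.py:

from transformers import Trainer, DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # no masking for causal LM
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=collator,
)
trainer.train()
lora_model.save_pretrained("finetuned_myr1")  # saves only the LoRA adapter, matching step 4 of the docstring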
@@ -158,7 +155,8 @@ def finetune_small_subset():
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)
 
-    return "Finetuning complete
+    return "Finetuning complete. Model loaded for inference."
+
 
 def ensure_pipeline():
     """
@@ -186,10 +184,34 @@ def ensure_pipeline():
         TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
     return TEXT_PIPELINE
 
+
+def ensure_comparison_pipeline():
+    """
+    Load the DeepSeek model pipeline if not already loaded.
+    """
+    global COMPARISON_PIPELINE
+    if COMPARISON_PIPELINE is None:
+        # If you prefer 4-bit, you can define BitsAndBytesConfig here,
+        # but let's keep it simpler for demonstration (fp16 or bf16).
+        config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+        tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+        model = AutoModelForCausalLM.from_pretrained(
+            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+            config=config,
+            device_map="auto"
+        )
+        COMPARISON_PIPELINE = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer
+        )
+    return COMPARISON_PIPELINE
+
+
 @spaces.GPU(duration=120)  # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
-    Generates text from the
+    Generates text from the fine-tuned (LoRA) model if present, else the base model.
     """
     pipe = ensure_pipeline()
     out = pipe(
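The new ensure_comparison_pipeline keeps the DeepSeek model in full precision, and its comment notes that 4-bit is an option. A sketch of that 4-bit variant, reusing the same BitsAndBytesConfig pattern as the fine-tuning path (not part of this commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    quantization_config=bnb_config,   # quantize the comparison model too
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
comparison_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)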
@@ -202,19 +224,49 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]
 
-# (Optional) If you want to compare with another model, define it here:
-# def ensure_comparison_pipeline():
-#     ...
 
+@spaces.GPU(duration=120)  # up to 2 min for text generation
+def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
+    """
+    Generates text side-by-side from the local myr1 pipeline (fine-tuned or base)
+    AND from the DeepSeek model. Returns two strings.
+    """
+    local_pipe = ensure_pipeline()
+    comp_pipe = ensure_comparison_pipeline()
+
+    local_out = local_pipe(
+        prompt,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        min_new_tokens=int(min_new_tokens),
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True
+    )
+    local_text = local_out[0]["generated_text"]
+
+    comp_out = comp_pipe(
+        prompt,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        min_new_tokens=int(min_new_tokens),
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True
+    )
+    comp_text = comp_out[0]["generated_text"]
+
+    return local_text, comp_text
+
+
+# Build Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("
-    gr.Markdown("
+    gr.Markdown("# QLoRA Fine-tuning & Comparison Demo")
+    gr.Markdown("**Fine-tune wuhp/myr1** on a small subset of the Magpie dataset, then generate or compare output with the DeepSeek model.")
 
-    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on
+    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on Magpie subset (up to 5 min)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
 
-    gr.Markdown("### Generate with myr1 (fine-tuned if done
+    gr.Markdown("### Generate with myr1 (fine-tuned if done, else base)")
 
     prompt_in = gr.Textbox(lines=3, label="Prompt")
     temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
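For reference, the new compare_models helper can also be called directly, outside the Gradio UI. The prompt and the top_p value below are arbitrary examples, not defaults taken from the app:

local_text, deepseek_text = compare_models(
    prompt="Explain LoRA in two sentences.",
    temperature=0.7,
    top_p=0.9,
    min_new_tokens=50,
    max_new_tokens=200,
)
print("myr1:", local_text)
print("DeepSeek:", deepseek_text)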
@@ -222,8 +274,8 @@ with gr.Blocks() as demo:
     min_tokens = gr.Slider(50, 1024, value=50, step=10, label="Min New Tokens")
     max_tokens = gr.Slider(50, 1024, value=200, step=50, label="Max New Tokens")
 
-    output_box = gr.Textbox(label="
-    gen_btn = gr.Button("Generate")
+    output_box = gr.Textbox(label="myr1 Output", lines=8)
+    gen_btn = gr.Button("Generate with myr1")
 
     gen_btn.click(
         fn=predict,
@@ -231,4 +283,16 @@ with gr.Blocks() as demo:
         outputs=output_box
     )
 
+    gr.Markdown("### Compare myr1 vs DeepSeek side-by-side")
+
+    compare_btn = gr.Button("Compare")
+    out_local = gr.Textbox(label="myr1 Output", lines=8)
+    out_deepseek = gr.Textbox(label="DeepSeek Output", lines=8)
+
+    compare_btn.click(
+        fn=compare_models,
+        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
+        outputs=[out_local, out_deepseek]
+    )
+
 demo.launch()