Update app.py
app.py
CHANGED
@@ -21,41 +21,39 @@ from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
 # ZeroGPU + QLoRA Example
 ##############################################################################
 
-TEXT_PIPELINE = None
-COMPARISON_PIPELINE = None  #
-NUM_EXAMPLES = 50  # We'll train on 50 lines (or rows) for demonstration
+TEXT_PIPELINE = None        # Pipeline for wuhp/myr1 (fine-tuned or base)
+COMPARISON_PIPELINE = None  # Pipeline for the DeepSeek model
 
-
+NUM_EXAMPLES = 50  # We'll train on 50 rows for demonstration
+
+@spaces.GPU(duration=300)  # up to 5 min
 def finetune_small_subset():
     """
     1) Loads 'wuhp/myr1' in 4-bit quantization (QLoRA style),
     2) Adds LoRA adapters (trainable),
-    3) Trains on a small subset of Magpie
+    3) Trains on a small subset of the Magpie dataset,
     4) Saves LoRA adapter to 'finetuned_myr1',
     5) Reloads LoRA adapters for inference in a pipeline.
     """
 
-    # --- 1) Load Magpie dataset ---
-    # You can load 'train' or 'validation' split depending on your preference
+    # --- 1) Load a small subset of the Magpie dataset ---
     ds = load_dataset(
         "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B",
         split="train"
     )
 
-    #
-    # (Alternatively, just do ds.select(range(...)) for a small random subset.)
-    # We'll demonstrate filtering for the first conversation_id:
+    # For demonstration, pick a single conversation_id
     unique_ids = list(set(ds["conversation_id"]))
     single_id = unique_ids[0]
    ds = ds.filter(lambda x: x["conversation_id"] == single_id)
 
-    #
+    # Then select only NUM_EXAMPLES from that subset
     ds = ds.select(range(min(NUM_EXAMPLES, len(ds))))
 
     # --- 2) Setup 4-bit quantization with BitsAndBytes ---
     bnb_config = BitsAndBytesConfig(
         load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16
+        bnb_4bit_compute_dtype=torch.bfloat16,  # or torch.float16
         bnb_4bit_use_double_quant=True,
         bnb_4bit_quant_type="nf4",
     )
@@ -75,12 +73,12 @@ def finetune_small_subset():
         "wuhp/myr1",
         subfolder="myr1",
         config=config,
-        quantization_config=bnb_config,
+        quantization_config=bnb_config,  # <--- QLoRA 4-bit
         device_map="auto",
         trust_remote_code=True
     )
 
-    # Prepare the model for k-bit training
+    # Prepare the model for k-bit training
     base_model = prepare_model_for_kbit_training(base_model)
 
     # --- 3) Create LoRA config & wrap the base model in LoRA ---
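The hunk above ends at the `# --- 3) Create LoRA config & wrap the base model in LoRA ---` comment, but the LoRA config itself sits outside the diff context. As a rough sketch only, that step with peft typically looks like the following; the rank, alpha, dropout and target_modules values are assumptions, not the values in app.py:

from peft import LoraConfig, TaskType, get_peft_model

# `base_model` is the 4-bit model prepared above via prepare_model_for_kbit_training.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,          # causal language modeling
    r=16,                                  # LoRA rank (assumed value)
    lora_alpha=32,                         # LoRA scaling (assumed value)
    lora_dropout=0.05,                     # assumed value
    target_modules=["q_proj", "v_proj"],   # assumed attention projections
)
lora_model = get_peft_model(base_model, lora_config)
lora_model.print_trainable_parameters()   # only the adapter weights are trainable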
@@ -97,10 +95,9 @@ def finetune_small_subset():
     # --- 4) Tokenize dataset ---
     def tokenize_fn(ex):
         """
-
-
+        Combine instruction + response into a single text.
+        You can adjust this to include more fields or different formatting.
         """
-        # For demonstration, let's do a short prompt style:
         text = (
             f"Instruction: {ex['instruction']}\n\n"
             f"Response: {ex['response']}"
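The hunk shows only the prompt-formatting part of tokenize_fn; the tokenizer call and label handling lie outside the diff context. A minimal sketch of the rest, assuming a standard causal-LM setup where labels mirror the input ids (the real max length used in app.py is not shown):

def tokenize_fn(ex):
    text = (
        f"Instruction: {ex['instruction']}\n\n"
        f"Response: {ex['response']}"
    )
    tokens = tokenizer(text, truncation=True, max_length=512)  # 512 is an assumed cap
    tokens["labels"] = tokens["input_ids"].copy()              # causal LM: labels = inputs
    return tokens

tokenized_ds = ds.map(tokenize_fn, remove_columns=ds.column_names)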
@@ -119,9 +116,9 @@ def finetune_small_subset():
         per_device_train_batch_size=1,
         gradient_accumulation_steps=2,
         logging_steps=5,
-        save_steps=999999,
+        save_steps=999999,  # effectively don't save mid-epoch
         save_total_limit=1,
-        fp16=False,
+        fp16=False,         # rely on bfloat16 from quantization
     )
 
     # Trainer
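The hunk stops at the `# Trainer` comment. For orientation, the wiring that usually follows a TrainingArguments block like this one is sketched below; the variable names (training_args, tokenized_ds, lora_model) and the collator choice are assumptions rather than the exact code in app.py:

from transformers import Trainer, DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # no masking for causal LM
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=collator,
)
trainer.train()
lora_model.save_pretrained("finetuned_myr1")  # saves only the LoRA adapter, matching step 4 of the docstring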
@@ -158,7 +155,8 @@ def finetune_small_subset():
     global TEXT_PIPELINE
     TEXT_PIPELINE = pipeline("text-generation", model=lora_model_2, tokenizer=tokenizer)
 
-    return "Finetuning complete
+    return "Finetuning complete. Model loaded for inference."
+
 
 def ensure_pipeline():
     """
@@ -186,10 +184,34 @@ def ensure_pipeline():
         TEXT_PIPELINE = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
     return TEXT_PIPELINE
 
+
+def ensure_comparison_pipeline():
+    """
+    Load the DeepSeek model pipeline if not already loaded.
+    """
+    global COMPARISON_PIPELINE
+    if COMPARISON_PIPELINE is None:
+        # If you prefer 4-bit, you can define BitsAndBytesConfig here,
+        # but let's keep it simpler for demonstration (fp16 or bf16).
+        config = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+        tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+        model = AutoModelForCausalLM.from_pretrained(
+            "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+            config=config,
+            device_map="auto"
+        )
+        COMPARISON_PIPELINE = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer
+        )
+    return COMPARISON_PIPELINE
+
+
 @spaces.GPU(duration=120)  # up to 2 min for text generation
 def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     """
-    Generates text from the
+    Generates text from the fine-tuned (LoRA) model if present, else the base model.
     """
     pipe = ensure_pipeline()
     out = pipe(
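The new ensure_comparison_pipeline keeps the DeepSeek model in full precision, and its comment notes that 4-bit is an option. A sketch of that 4-bit variant, reusing the same BitsAndBytesConfig pattern as the fine-tuning path (not part of this commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    quantization_config=bnb_config,   # quantize the comparison model too
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
comparison_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)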
@@ -202,19 +224,49 @@ def predict(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
     )
     return out[0]["generated_text"]
 
-# (Optional) If you want to compare with another model, define it here:
-# def ensure_comparison_pipeline():
-#     ...
 
+@spaces.GPU(duration=120)  # up to 2 min for text generation
+def compare_models(prompt, temperature, top_p, min_new_tokens, max_new_tokens):
+    """
+    Generates text side-by-side from the local myr1 pipeline (fine-tuned or base)
+    AND from the DeepSeek model. Returns two strings.
+    """
+    local_pipe = ensure_pipeline()
+    comp_pipe = ensure_comparison_pipeline()
+
+    local_out = local_pipe(
+        prompt,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        min_new_tokens=int(min_new_tokens),
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True
+    )
+    local_text = local_out[0]["generated_text"]
+
+    comp_out = comp_pipe(
+        prompt,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        min_new_tokens=int(min_new_tokens),
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True
+    )
+    comp_text = comp_out[0]["generated_text"]
+
+    return local_text, comp_text
+
+
+# Build Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("
-    gr.Markdown("
+    gr.Markdown("# QLoRA Fine-tuning & Comparison Demo")
+    gr.Markdown("**Fine-tune wuhp/myr1** on a small subset of the Magpie dataset, then generate or compare output with the DeepSeek model.")
 
-    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on
+    finetune_btn = gr.Button("Finetune 4-bit (QLoRA) on Magpie subset (up to 5 min)")
     status_box = gr.Textbox(label="Finetune Status")
     finetune_btn.click(fn=finetune_small_subset, outputs=status_box)
 
-    gr.Markdown("### Generate with myr1 (fine-tuned if done
+    gr.Markdown("### Generate with myr1 (fine-tuned if done, else base)")
 
     prompt_in = gr.Textbox(lines=3, label="Prompt")
     temperature = gr.Slider(0.0, 1.5, step=0.1, value=0.7, label="Temperature")
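For reference, the new compare_models helper can also be called directly, outside the Gradio UI. The prompt and the top_p value below are arbitrary examples, not defaults taken from the app:

local_text, deepseek_text = compare_models(
    prompt="Explain LoRA in two sentences.",
    temperature=0.7,
    top_p=0.9,
    min_new_tokens=50,
    max_new_tokens=200,
)
print("myr1:", local_text)
print("DeepSeek:", deepseek_text)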
@@ -222,8 +274,8 @@ with gr.Blocks() as demo:
     min_tokens = gr.Slider(50, 1024, value=50, step=10, label="Min New Tokens")
     max_tokens = gr.Slider(50, 1024, value=200, step=50, label="Max New Tokens")
 
-    output_box = gr.Textbox(label="
-    gen_btn = gr.Button("Generate")
+    output_box = gr.Textbox(label="myr1 Output", lines=8)
+    gen_btn = gr.Button("Generate with myr1")
 
     gen_btn.click(
         fn=predict,
@@ -231,4 +283,16 @@ with gr.Blocks() as demo:
         outputs=output_box
     )
 
+    gr.Markdown("### Compare myr1 vs DeepSeek side-by-side")
+
+    compare_btn = gr.Button("Compare")
+    out_local = gr.Textbox(label="myr1 Output", lines=8)
+    out_deepseek = gr.Textbox(label="DeepSeek Output", lines=8)
+
+    compare_btn.click(
+        fn=compare_models,
+        inputs=[prompt_in, temperature, top_p, min_tokens, max_tokens],
+        outputs=[out_local, out_deepseek]
+    )
+
 demo.launch()