Spaces:
Running
on
Zero
Running
on
Zero
import spaces | |
import gradio as gr | |
from peft import PeftModel | |
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer | |
import threading | |
import torch | |
# Hub repository ids: the base checkpoint (also used for the tokenizer) and
# the LoRA adapter that is layered on top of it.
# NOTE(review): the base id names a pre-quantized bnb-4bit build -- confirm
# that loading it on CPU in float32 really sidesteps bitsandbytes.
_BASE_MODEL_ID = "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit"
_LORA_ADAPTER_ID = "ZennyKenny/GPRO_LoRA_Qwen_3B"

# Keep everything on CPU with an explicit float32 dtype.
_base_load_options = {
    "device_map": "cpu",
    "torch_dtype": torch.float32,
}
base_model = AutoModelForCausalLM.from_pretrained(_BASE_MODEL_ID, **_base_load_options)

# Wrap the base model with the LoRA adapter weights.
model = PeftModel.from_pretrained(base_model, _LORA_ADAPTER_ID)

# Place the wrapped model on CPU explicitly, then switch to inference mode.
model.to("cpu")
model.eval()

# The tokenizer comes from the same repository as the base model.
tokenizer = AutoTokenizer.from_pretrained(_BASE_MODEL_ID)
def generate_response(prompt):
    """Stream a step-by-step reasoning answer for ``prompt``.

    The prompt is wrapped in a fixed reasoning instruction, tokenized, and
    passed to ``model.generate`` on a background thread. Decoded text is
    pulled from a ``TextIteratorStreamer`` as it is produced.

    Args:
        prompt: The user's question as plain text.

    Yields:
        The full response generated so far. Gradio replaces the output
        component's value on every yield, so each yielded string must be the
        cumulative text rather than only the newest fragment.
    """
    reasoning_prompt = (
        "Answer the following question and explain your reasoning step by step.\n"
        f"Question: {prompt}\nReasoning:"
    )

    # Keep the whole encoding (input_ids and attention_mask) and place it on
    # the same device as the model.
    inputs = tokenizer(reasoning_prompt, return_tensors="pt").to("cpu")

    # skip_prompt=True keeps the instruction text out of the streamed reply;
    # skip_special_tokens is forwarded to the tokenizer's decode call.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        inputs,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        streamer=streamer,
    )

    # generate() blocks until it finishes, so it runs on a worker thread
    # while this generator consumes the streamer.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    # The streamer is exhausted once generation ends; reap the worker.
    thread.join()
# Build the UI components first, then wire them into a single Interface.
_prompt_box = gr.Textbox(lines=2, placeholder="Enter your prompt here...")
_response_box = gr.Textbox(label="Response")

demo = gr.Interface(
    fn=generate_response,
    inputs=_prompt_box,
    outputs=_response_box,
    title="LoRA Model Reasoning Inference",
    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
    allow_flagging="never",
)

# Start serving the app.
demo.launch(share=True)