Spaces:

ZennyKenny
/

GRPO_Qwen_3B_ZK_FineTune_LoRA_Demo

Running on Zero

App Files Files Community

GRPO_Qwen_3B_ZK_FineTune_LoRA_Demo / app.py

ZennyKenny

Update app.py

3f3d24b verified 12 days ago

raw

history blame contribute delete

2.21 kB

	import spaces
	import gradio as gr
	from peft import PeftModel
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
	import threading
	import torch

	# Hub repositories used by this demo. The tokenizer is loaded from the same
	# repository as the base model.
	BASE_MODEL_ID = "unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit"
	LORA_ADAPTER_ID = "ZennyKenny/GPRO_LoRA_Qwen_3B"

	# Base model, requested on CPU in float32 (the original author's stated goal
	# is to avoid bitsandbytes issues).
	# NOTE(review): the repository name ends in "bnb-4bit", which suggests a
	# pre-quantized export -- confirm that this load really bypasses bitsandbytes.
	base_model = AutoModelForCausalLM.from_pretrained(
	    BASE_MODEL_ID,
	    device_map="cpu",
	    torch_dtype=torch.float32,
	)

	# Wrap the base model with the fine-tuned LoRA adapter weights.
	model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_ID)

	# Explicit CPU placement is kept as a safeguard: per the original author,
	# PEFT does not always move the wrapped model automatically.
	model.to("cpu")
	# Switch to evaluation mode for inference.
	model.eval()

	# Tokenizer matching the base model checkpoint.
	tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

	@spaces.GPU
	def generate_response(prompt):
	    """Stream a step-by-step answer to *prompt* from the LoRA-adapted model.

	    The question is wrapped in a fixed reasoning template, tokenized, and
	    handed to ``model.generate`` on a background thread. Decoded text is
	    pulled from a ``TextIteratorStreamer`` while generation is running.

	    Args:
	        prompt: The user's question, as entered in the Gradio textbox.

	    Yields:
	        The text produced so far. Gradio replaces the output component's
	        value on every yield, so each yield carries the full accumulated
	        text rather than only the newest fragment.
	    """
	    reasoning_prompt = (
	        "Answer the following question and explain your reasoning step by step.\n"
	        f"Question: {prompt}\nReasoning:"
	    )

	    # Tokenize and keep the tensors on the CPU, where the model was placed.
	    inputs = tokenizer(reasoning_prompt, return_tensors="pt")
	    input_ids = inputs["input_ids"].to("cpu")
	    # Pass the tokenizer's mask explicitly instead of letting generate()
	    # infer one; for a single unpadded prompt the result is the same.
	    attention_mask = inputs["attention_mask"].to("cpu")

	    # The streamer exposes generated text as an iterator.
	    # NOTE(review): skip_prompt is left at its default, so the echoed prompt
	    # is expected to be part of the streamed text -- confirm that is wanted.
	    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

	    generation_kwargs = dict(
	        input_ids=input_ids,
	        attention_mask=attention_mask,
	        max_new_tokens=300,
	        do_sample=True,
	        temperature=0.8,
	        top_p=0.95,
	        streamer=streamer,
	    )

	    # generate() blocks until it finishes, so it runs on a worker thread
	    # while this generator consumes the streamer.
	    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
	    thread.start()

	    # Accumulate fragments: yielding each fragment on its own would leave
	    # only the most recent piece visible in the output textbox.
	    response = ""
	    for new_text in streamer:
	        response += new_text
	        yield response

	    # The stream is exhausted, so generation is done; reap the worker thread.
	    thread.join()

	# Gradio front end: one multi-line prompt box in, one response box out.
	prompt_box = gr.Textbox(lines=2, placeholder="Enter your prompt here...")
	response_box = gr.Textbox(label="Response")

	# generate_response is a generator, so the response box is updated as text
	# is yielded. Flagging is switched off for this demo.
	demo = gr.Interface(
	    fn=generate_response,
	    inputs=prompt_box,
	    outputs=response_box,
	    title="LoRA Model Reasoning Inference",
	    description="Demo your LoRA model with step-by-step reasoning in Hugging Face Gradio.",
	    allow_flagging="never",
	)

	# Start serving the app.
	# NOTE(review): share=True requests a public share link; confirm it is still
	# needed when the app is already hosted.
	demo.launch(share=True)