import os

import gradio as gr
import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Get the token from the environment (loaded automatically from the Space's secrets)
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-pt")

# Load the base model on CPU with memory optimizations
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-3-1b-pt",
    torch_dtype=torch.bfloat16,  # half the memory of float32
    low_cpu_mem_usage=True,
)

# Apply the fine-tuned PEFT adapter on top of the base model
model = PeftModel.from_pretrained(base_model, "hackergeek98/gemma-finetuned")
model = model.to("cpu")  # ensure inference runs on CPU
model.eval()  # inference mode: disables dropout
# Chatbot function: gr.ChatInterface calls fn(message, history, *additional_inputs)
# and expects the assistant's reply as a string; it tracks the history itself
def chat(message, history, system_message):
    # Prepend the system message (the extra Textbox input) to steer the reply
    prompt = f"{system_message}\n{message}" if system_message else message
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")
    with torch.no_grad():  # disable gradient tracking for efficiency
        output_ids = model.generate(input_ids, max_new_tokens=100)
    # Decode only the newly generated tokens, skipping the echoed prompt
    response = tokenizer.decode(
        output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True
    )
    return response
# Gradio UI: ChatInterface manages the conversation history on its own
demo = gr.ChatInterface(
    chat,
    chatbot=gr.Chatbot(height=400),
    additional_inputs=[
        gr.Textbox(value="Welcome to the chatbot!", label="System message")
    ],
    title="Fine-Tuned Gemma Chatbot",
    description="This chatbot is fine-tuned on Persian text using Gemma.",
)

if __name__ == "__main__":
    demo.launch()
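
# --- Client usage sketch (not part of the app) ---
# A minimal sketch of how this Space could be queried once deployed, using
# the gradio_client package. Assumptions: the Space is public, the Space ID
# below is a hypothetical placeholder, and "/chat" is ChatInterface's
# default API endpoint name.
#
#   from gradio_client import Client
#
#   client = Client("your-username/your-space")  # hypothetical Space ID
#   reply = client.predict(
#       "Hello!",                   # message
#       "Welcome to the chatbot!",  # system message (additional input)
#       api_name="/chat",
#   )
#   print(reply)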