import os

import gradio as gr
import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Read the token from the environment (loaded automatically from Space secrets)
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-pt")

# Load the base model on CPU with memory optimizations
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-3-1b-pt",
    torch_dtype=torch.bfloat16,  # halves memory use versus float32
    low_cpu_mem_usage=True,
)

# Attach the fine-tuned LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, "hackergeek98/gemma-finetuned")
model = model.to("cpu")  # ensure inference runs on CPU
model.eval()

# Chatbot function. gr.ChatInterface calls it as (message, history, *additional_inputs)
# and expects the assistant's reply as a string; the history is managed by Gradio.
def chat(message, history, system_message):
    # gemma-3-1b-pt is a pretrained (non-instruct) checkpoint with no chat
    # template, so the system message is simply prepended as plain text.
    prompt = f"{system_message}\n\n{message}" if system_message else message
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")
    with torch.no_grad():  # disable gradient tracking for efficiency
        output_ids = model.generate(input_ids, max_new_tokens=100)
    # Decode only the newly generated tokens, not the echoed prompt
    response = tokenizer.decode(
        output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True
    )
    return response

# Gradio UI
demo = gr.ChatInterface(
    chat,
    chatbot=gr.Chatbot(height=400),
    additional_inputs=[
        gr.Textbox(value="Welcome to the chatbot!", label="System message")
    ],
    title="Fine-Tuned Gemma Chatbot",
    description="This chatbot is fine-tuned on Persian text using Gemma.",
)

if __name__ == "__main__":
    demo.launch()
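
# Optional streaming variant: a minimal sketch, not part of the original app.
# It assumes the `model` and `tokenizer` objects defined above and streams
# tokens to the UI as they are generated; gr.ChatInterface treats a generator
# function as a streaming bot, so `chat_stream` could be passed in place of
# `chat` above to enable it.
from threading import Thread

from transformers import TextIteratorStreamer

def chat_stream(message, history, system_message):
    prompt = f"{system_message}\n\n{message}" if system_message else message
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")
    # The streamer yields decoded text chunks while generate() runs in a thread
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    Thread(
        target=model.generate,
        kwargs=dict(input_ids=input_ids, streamer=streamer, max_new_tokens=100),
    ).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # Gradio re-renders the reply with each partial string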