import spaces
import gradio as gr
from transformers import pipeline
import torch
import os

hf_token = os.environ["HF_TOKEN"]

# Load the Gemma 3 pipeline.
# Gemma 3 is a multimodal model that accepts text and image inputs.
pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    device="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None,
    token=hf_token,  # 'use_auth_token' is deprecated in recent transformers releases
)


@spaces.GPU
def generate_response(user_text, user_image, history):
    # Build the chat in the format the pipeline expects: a system turn followed
    # by a user turn whose content mixes image and text parts.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}],
        }
    ]
    user_content = []
    if user_image is not None:
        user_content.append({"type": "image", "image": user_image})
    if user_text:
        user_content.append({"type": "text", "text": user_text})
    messages.append({"role": "user", "content": user_content})

    output = pipe(text=messages, max_new_tokens=200)
    # For a single conversation the pipeline returns a one-element list;
    # the assistant's reply is the last message in "generated_text".
    response = output[0]["generated_text"][-1]["content"]

    # gr.Chatbot(type="messages") expects openai-style role/content dicts,
    # not (user, bot) tuples.
    history.append({"role": "user", "content": user_text})
    history.append({"role": "assistant", "content": response})
    return history, history


with gr.Blocks() as demo:
    gr.Markdown("# Gemma 3 Chat Interface")
    gr.Markdown(
        "This interface lets you chat with the Gemma 3 model. "
        "You can type a message and optionally attach an image."
    )

    chatbot = gr.Chatbot(type="messages")
    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Type your message here...",
            container=False,
        )
        # The 'source' and 'tool' parameters were removed in Gradio 4,
        # so they are omitted here to avoid the error.
        img = gr.Image(type="pil", label="Attach an image (optional)")
    state = gr.State([])

    txt.submit(generate_response, inputs=[txt, img, state], outputs=[chatbot, state])

demo.launch()