import spaces
import gradio as gr
from transformers import pipeline
import torch
import os

hf_token = os.environ["HF_TOKEN"]

# Load the Gemma 3 pipeline.
# Gemma 3 is a multimodal model that accepts text and image inputs.
pipe = pipeline(
    "image-text-to-text",
    model="google/gemma-3-4b-it",
    device="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else None,
    token=hf_token,  # 'use_auth_token' is deprecated in recent transformers releases
)


@spaces.GPU
def generate_response(user_text, user_image, history):
    # Build the chat in the format the pipeline expects: a system turn followed
    # by a user turn whose content mixes image and text parts.
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}],
        }
    ]
    user_content = []
    if user_image is not None:
        user_content.append({"type": "image", "image": user_image})
    if user_text:
        user_content.append({"type": "text", "text": user_text})
    messages.append({"role": "user", "content": user_content})

    output = pipe(text=messages, max_new_tokens=200)
    # For a single conversation the pipeline returns a one-element list;
    # the assistant's reply is the last message in "generated_text".
    response = output[0]["generated_text"][-1]["content"]

    # gr.Chatbot(type="messages") expects openai-style role/content dicts,
    # not (user, bot) tuples.
    history.append({"role": "user", "content": user_text})
    history.append({"role": "assistant", "content": response})
    return history, history


with gr.Blocks() as demo:
    gr.Markdown("# Gemma 3 Chat Interface")
    gr.Markdown(
        "This interface lets you chat with the Gemma 3 model. "
        "You can type a message and optionally attach an image."
    )

    chatbot = gr.Chatbot(type="messages")
    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Type your message here...",
            container=False,
        )
        # The 'source' and 'tool' parameters were removed in Gradio 4,
        # so they are omitted here to avoid the error.
        img = gr.Image(type="pil", label="Attach an image (optional)")
    state = gr.State([])

    txt.submit(generate_response, inputs=[txt, img, state], outputs=[chatbot, state])

demo.launch()