import os
import platform
import time
from threading import Thread

import gradio as gr
import spaces
import torch
from huggingface_hub import InferenceClient, login
from PIL import Image
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer

"""
For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

print(f"Is CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Python version: {platform.python_version()}")
print(f"Pytorch version: {torch.__version__}")
print(f"Gradio version: {gr.__version__}")

duration = None

login(token=os.getenv("gemma"))

ckpt = "google/gemma-3-4b-it"
# Move the model to the GPU so it lives on the same device as the processed inputs below.
model = Gemma3ForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)


@spaces.GPU
def bot_streaming(message, history, max_new_tokens=250):
    txt = message["text"]
    messages = []
    images = []

    # Rebuild the chat history in the chat-template format the processor expects.
    for i, msg in enumerate(history):
        if isinstance(msg[0], tuple):
            # Image turn: the image path comes from this entry, the text from the next entry.
            messages.append({"role": "user", "content": [{"type": "text", "text": history[i + 1][0]}, {"type": "image"}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i + 1][1]}]})
            images.append(Image.open(msg[0][0]).convert("RGB"))
        elif isinstance(history[i - 1], tuple) and isinstance(msg[0], str):
            # Text half of an image turn: already handled above.
            pass
        elif isinstance(history[i - 1][0], str) and isinstance(msg[0], str):
            # Text-only turn.
            messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
            messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})

    # Add the current message.
    if len(message["files"]) == 1:
        if isinstance(message["files"][0], str):  # examples
            image = Image.open(message["files"][0]).convert("RGB")
        else:  # regular input
            image = Image.open(message["files"][0]["path"]).convert("RGB")
        images.append(image)
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
    else:
        messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

    texts = processor.apply_chat_template(messages, add_generation_prompt=True)

    if images == []:
        inputs = processor(text=texts, return_tensors="pt").to("cuda")
    else:
        inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")

    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

    # Run generation in a background thread and stream partial output back to the UI.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer


demo = gr.ChatInterface(
    fn=bot_streaming,
    title="Multimodal Gemma 3 Model by Google",
    textbox=gr.MultimodalTextbox(),
    additional_inputs=[
        gr.Slider(
            minimum=10,
            maximum=500,
            value=250,
            step=10,
            label="Maximum number of new tokens to generate",
        )
    ],
    cache_examples=False,
    description="Upload an image and start chatting about it, or just enter any text into the prompt to start.",
    stop_btn="Stop Generation",
    fill_height=True,
    multimodal=True,
)

demo.launch(debug=True)