"""Gradio chat demo for the SpaceThinker-Qwen2.5VL-3B vision-language model.

The UI collects an image plus a text question, forwards the most recent of
each to the model, and appends the model's reply to the conversation.
"""

from functools import lru_cache

# NOTE: `spaces` stays ahead of `torch`, as in the original file; ZeroGPU
# expects it to be imported before anything initialises CUDA.
import spaces
import torch
import gradio as gr
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

MODEL_ID = "remyxai/SpaceThinker-Qwen2.5VL-3B"

# Images wider than this are downscaled (aspect ratio preserved) before
# being handed to the processor.
MAX_IMAGE_WIDTH = 512

# Upper bound on the number of tokens generated per reply.
MAX_NEW_TOKENS = 1024

# The tag names must be spelled out literally: without them the instruction
# "enclosed within ... tags" gives the model nothing to follow.
SYSTEM_PROMPT = (
    "You are VL-Thinking \U0001F914, a helpful assistant with excellent reasoning ability.\n"
    "A user asks you a question, and you should try to solve it. "
    "You should first think about the reasoning process in the mind and then provides the user with the answer.\n"
    "The reasoning process and answer are enclosed within <think> </think> and "
    "<answer> </answer> tags, respectively, i.e., "
    "<think> reasoning process here </think> <answer> answer here </answer>."
)


@lru_cache(maxsize=1)
def _load_model():
    """Load the model and processor once and reuse them on later calls.

    Returns:
        A ``(model, processor)`` tuple. The model is loaded in bfloat16 and
        moved to the ``"cuda"`` device.
    """
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID, torch_dtype=torch.bfloat16
    ).to("cuda")
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    return model, processor


def _load_image(image_path: str) -> Image.Image:
    """Open *image_path* as RGB and cap its width at ``MAX_IMAGE_WIDTH``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        An RGB image whose width is at most ``MAX_IMAGE_WIDTH`` pixels.
    """
    # The context manager releases the file handle; convert() has already
    # read the pixel data into an independent image by then.
    with Image.open(image_path) as raw:
        image = raw.convert("RGB")

    if image.width > MAX_IMAGE_WIDTH:
        ratio = image.height / image.width
        # max(1, ...) keeps extremely wide images from collapsing to a
        # zero-pixel height, which resize() rejects.
        new_height = max(1, int(MAX_IMAGE_WIDTH * ratio))
        image = image.resize(
            (MAX_IMAGE_WIDTH, new_height), Image.Resampling.LANCZOS
        )
    return image


@spaces.GPU
def gpu_inference(image_path: str, prompt: str) -> str:
    """Answer *prompt* about the image stored at *image_path*.

    Args:
        image_path: Path of the image the question refers to.
        prompt: The user's question.

    Returns:
        The text generated by the model, stripped of surrounding whitespace.
    """
    model, processor = _load_model()
    image = _load_image(image_path)

    conversation = [
        {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        },
    ]

    chat_input = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_input], images=[image], return_tensors="pt"
    ).to("cuda")
    output_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # Decode only what was generated after the prompt. Splitting the full
    # transcript on the word "assistant" is unreliable because the system
    # prompt itself contains "a helpful assistant", so the first match lands
    # inside the prompt rather than at the start of the reply.
    prompt_length = inputs["input_ids"].shape[1]
    reply_ids = output_ids[:, prompt_length:]
    reply = processor.batch_decode(
        reply_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return reply.strip()


def _cleared_input():
    """Return a ``MultimodalTextbox`` value that empties the input box."""
    return gr.MultimodalTextbox(value=None)


def _image_path_of(content):
    """Return the file path carried by a file-style message, else ``None``.

    ``add_message`` stores uploads as ``(path,)``; a list of the same shape
    is accepted as well. Empty sequences and non-string first items yield
    ``None`` instead of raising.
    """
    if isinstance(content, (tuple, list)) and content and isinstance(content[0], str):
        return content[0]
    return None


# Message handling
def add_message(history, user_input):
    """Append the submitted files and text to the chat history.

    Args:
        history: Current list of message dicts, or ``None`` for a new chat.
        user_input: Textbox payload with optional ``"files"`` and ``"text"``
            entries. ``None`` is treated as an empty submission.

    Returns:
        ``(history, textbox)`` where ``textbox`` clears the input box.
    """
    if history is None:
        history = []
    user_input = user_input or {}

    # Each uploaded file becomes its own user message holding a 1-tuple.
    for file_path in user_input.get("files") or []:
        history.append({"role": "user", "content": (file_path,)})

    text = user_input.get("text") or ""
    if text:
        history.append({"role": "user", "content": text})
    return history, _cleared_input()


def inference_interface(history):
    """Run the model on the latest user text and image, then append its reply.

    Nothing is generated when the history is empty, when the last message is
    not from the user (for example after an empty submission), or when no
    user text or no user image exists yet.

    Args:
        history: List of message dicts as built by ``add_message``.

    Returns:
        ``(history, textbox)`` where ``textbox`` clears the input box.
    """
    if not history:
        return history, _cleared_input()

    # An empty submit adds no message; without this guard the previous
    # question would be answered again and a duplicate reply appended.
    if history[-1].get("role") != "user":
        return history, _cleared_input()

    user_text = None
    image_path = None
    for message in reversed(history):
        if message.get("role") != "user":
            continue
        content = message.get("content")
        if user_text is None and isinstance(content, str):
            user_text = content
        elif image_path is None:
            image_path = _image_path_of(content)
        if user_text is not None and image_path is not None:
            break

    if user_text is None or image_path is None:
        return history, _cleared_input()

    reply = gpu_inference(image_path, user_text)
    history.append({"role": "assistant", "content": reply})
    return history, _cleared_input()


def build_demo():
    """Build the Gradio Blocks UI and wire its events.

    Returns:
        The ``gr.Blocks`` application, not yet launched.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# SpaceThinker-Qwen2.5VL-3B")
        chatbot = gr.Chatbot([], type="messages", label="Conversation")
        chat_input = gr.MultimodalTextbox(
            interactive=True,
            file_types=["image"],
            placeholder="Enter text and upload an image.",
            show_label=True,
        )
        with gr.Row():
            send_btn = gr.Button("Send")
            gr.ClearButton([chatbot, chat_input])

        # Pressing Enter and clicking "Send" share one pipeline: record the
        # user's message first, then generate the reply.
        for trigger in (chat_input.submit, send_btn.click):
            trigger(
                add_message, [chatbot, chat_input], [chatbot, chat_input]
            ).then(
                inference_interface, [chatbot], [chatbot, chat_input]
            )
    return demo


if __name__ == "__main__":
    demo = build_demo()
    demo.launch(share=True)