salma-remyx committed
Commit e19b349
1 Parent(s): 476e594

update app

Files changed (1)
  1. app.py +69 -118
app.py CHANGED
@@ -1,176 +1,127 @@
 import spaces
 import torch
-import time
 import gradio as gr
 from PIL import Image
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from typing import List
 from functools import lru_cache
 
 MODEL_ID = "remyxai/SpaceThinker-Qwen2.5VL-3B"
 
-@spaces.GPU
 @lru_cache(maxsize=1)
-def load_model():
-    print("Loading model and processor...")
-    device = "cuda" if torch.cuda.is_available() else "cpu"
+def _load_model():
+    """Load and cache the model and processor inside GPU worker."""
     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_ID,
-        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
-    ).to(device)
+        torch_dtype=torch.bfloat16
+    ).to("cuda")
     processor = AutoProcessor.from_pretrained(MODEL_ID)
     return model, processor
 
-def process_image(image_path_or_obj):
-    if isinstance(image_path_or_obj, str):
-        image = Image.open(image_path_or_obj).convert("RGB")
-    elif isinstance(image_path_or_obj, Image.Image):
-        image = image_path_or_obj.convert("RGB")
-    else:
-        raise ValueError("process_image expects a file path (str) or PIL.Image")
-
-    max_width = 512
-    if image.width > max_width:
-        aspect_ratio = image.height / image.width
-        new_height = int(max_width * aspect_ratio)
-        image = image.resize((max_width, new_height), Image.Resampling.LANCZOS)
-    return image
-
-def get_latest_image(history):
-    for item in reversed(history):
-        if item["role"] == "user" and isinstance(item["content"], tuple):
-            return item["content"][0]
-    return None
-
-def only_assistant_text(full_text: str) -> str:
-    if "assistant" in full_text:
-        parts = full_text.split("assistant", 1)
-        result = parts[-1].strip()
-        result = result.lstrip(":").strip()
-        return result
-    return full_text.strip()
-
-def run_inference(image, prompt):
-    model, processor = load_model()
+@spaces.GPU
+def gpu_inference(image_path: str, prompt: str) -> str:
+    """Perform inference entirely in GPU subprocess."""
+    model, processor = _load_model()
+
+    # Load and preprocess image
+    image = Image.open(image_path).convert("RGB")
+    if image.width > 512:
+        ratio = image.height / image.width
+        image = image.resize((512, int(512 * ratio)), Image.Resampling.LANCZOS)
+
+    # Build conversation
     system_msg = (
-        "You are VL-Thinking 🤔, a helpful assistant with excellent reasoning ability. "
-        "You should first think about the reasoning process and then provide the answer. "
-        "Use <think>...</think> and <answer>...</answer> tags."
+        "You are VL-Thinking 🤔, a helpful assistant. "
+        "Think through your reasoning then provide the answer. "
+        "Wrap reasoning in <think>...</think> and final in <answer>...</answer>."
    )
     conversation = [
-        {
-            "role": "system",
-            "content": [{"type": "text", "text": system_msg}],
-        },
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": prompt},
-            ],
-        },
+        {"role": "system", "content": [{"type": "text", "text": system_msg}]},
+        {"role": "user", "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": prompt}
+        ]}
     ]
-    text_input = processor.apply_chat_template(
+
+    # Tokenize, generate, decode
+    chat_input = processor.apply_chat_template(
         conversation, tokenize=False, add_generation_prompt=True
     )
-
-    inputs = processor(text=[text_input], images=[image], return_tensors="pt").to(model.device)
-    generated_ids = model.generate(**inputs, max_new_tokens=1024)
-    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    return only_assistant_text(output_text)
+    inputs = processor(text=[chat_input], images=[image], return_tensors="pt").to("cuda")
+    output_ids = model.generate(**inputs, max_new_tokens=1024)
+    decoded = processor.batch_decode(
+        output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+
+    # Extract assistant portion
+    return decoded.split("assistant", 1)[-1].strip().lstrip(":").strip()
 
+# Message handling
+
 def add_message(history, user_input):
-    if not isinstance(history, list):
+    if history is None:
         history = []
-
-    files = user_input.get("files", [])
-    text = user_input.get("text", "")
-
-    for f in files:
+    for f in user_input.get("files", []):
         history.append({"role": "user", "content": (f,)})
-
+    text = user_input.get("text", "")
     if text:
         history.append({"role": "user", "content": text})
-
     return history, gr.MultimodalTextbox(value=None)
 
+
 def inference_interface(history):
     if not history:
         return history, gr.MultimodalTextbox(value=None)
-
-    user_text = ""
-    user_idx = -1
-    for idx in range(len(history) - 1, -1, -1):
-        msg = history[idx]
-        if msg["role"] == "user" and isinstance(msg["content"], str):
-            user_text = msg["content"]
-            user_idx = idx
-            break
-
-    if user_idx == -1:
+    # Last user text
+    user_text = next(
+        (m["content"] for m in reversed(history)
+         if m["role"] == "user" and isinstance(m["content"], str)),
+        None
+    )
+    if user_text is None:
         return history, gr.MultimodalTextbox(value=None)
-
-    latest_image = get_latest_image(history)
-    if not latest_image:
+    # Last user image
+    image_path = next(
+        (m["content"][0] for m in reversed(history)
+         if m["role"] == "user" and isinstance(m["content"], tuple)),
+        None
+    )
+    if image_path is None:
         return history, gr.MultimodalTextbox(value=None)
 
-    pil_image = process_image(latest_image)
-    assistant_reply = run_inference(pil_image, user_text)
-
-    history.append({"role": "assistant", "content": assistant_reply})
+    # GPU inference
+    reply = gpu_inference(image_path, user_text)
+    history.append({"role": "assistant", "content": reply})
     return history, gr.MultimodalTextbox(value=None)
 
+
 def build_demo():
     with gr.Blocks() as demo:
         gr.Markdown("# SpaceThinker-Qwen2.5VL-3B Image Prompt Chatbot")
-
-        chatbot = gr.Chatbot([], type="messages", line_breaks=True)
-
+        chatbot = gr.Chatbot([], type="messages", label="Conversation")
         chat_input = gr.MultimodalTextbox(
             interactive=True,
             file_types=["image"],
             placeholder="Enter text and upload an image.",
             show_label=True
         )
-
-        submit_event = chat_input.submit(
-            fn=add_message,
-            inputs=[chatbot, chat_input],
-            outputs=[chatbot, chat_input]
+        submit_evt = chat_input.submit(
+            add_message, [chatbot, chat_input], [chatbot, chat_input]
         )
-        submit_event.then(
-            fn=inference_interface,
-            inputs=[chatbot],
-            outputs=[chatbot, chat_input]
+        submit_evt.then(
+            inference_interface, [chatbot], [chatbot, chat_input]
         )
-
         with gr.Row():
-            send_button = gr.Button("Send")
-            clear_button = gr.ClearButton([chatbot, chat_input])
-
-        send_click = send_button.click(
-            fn=add_message,
-            inputs=[chatbot, chat_input],
-            outputs=[chatbot, chat_input]
+            send_btn = gr.Button("Send")
+            clear_btn = gr.ClearButton([chatbot, chat_input])
+        send_click = send_btn.click(
+            add_message, [chatbot, chat_input], [chatbot, chat_input]
         )
         send_click.then(
-            fn=inference_interface,
-            inputs=[chatbot],
-            outputs=[chatbot, chat_input]
+            inference_interface, [chatbot], [chatbot, chat_input]
        )
-
-        gr.Examples(
-            examples=[
-                {
-                    "text": "Give me the height of the man in the red hat in feet.",
-                    "files": ["./examples/warehouse_rgb.jpg"]
-                }
-            ],
-            inputs=[chat_input],
-        )
-
+
     return demo
 
+
 if __name__ == "__main__":
     demo = build_demo()
     demo.launch(share=True)
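
Because the refactor collapses model loading, image preprocessing, and generation into the single gpu_inference(image_path, prompt) entry point, it can be smoke-tested without launching the Gradio UI. The sketch below is not part of this commit: the smoke_test.py file name is hypothetical, it assumes a CUDA device is available (the new loader moves the model to "cuda" unconditionally) and that the spaces.GPU decorator falls back to a no-op outside a Space, and it reuses the image path and prompt that the previous revision shipped in its gr.Examples block.

# smoke_test.py (hypothetical helper, not included in this commit)
from app import gpu_inference

if __name__ == "__main__":
    # Image path and prompt are borrowed from the gr.Examples block of the
    # previous revision; substitute any local image and question.
    reply = gpu_inference(
        "./examples/warehouse_rgb.jpg",
        "Give me the height of the man in the red hat in feet.",
    )
    print(reply)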