Spaces: Running on Zero
Commit 80b7578 · 1 Parent(s): 0c890d5
add SpaceThinker

- README.md +5 -8
- app.py +187 -157
- checkpoints/depth_pro.pt +0 -3
- examples/bee_and_flower.jpg +0 -0
- examples/gears.png +0 -0
- examples/road-through-dense-forest.jpg +0 -0
- examples/spooky_doggy.png +0 -0
- requirements.txt +9 -36
README.md
CHANGED
@@ -1,15 +1,12 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: SpaceThinker-Qwen2.5VL-3B
+emoji: ๐
+colorFrom: indigo
+colorTo: pink
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.15.0
 app_file: app.py
 pinned: false
-license: apache-2.0
-short_description: VQASynth Scene Reconstruction Pipeline
-startup_duration_timeout: 4h
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,196 +1,226 @@
 import spaces
-import subprocess
-import sys
-
-# Ensure the package is installed from the Git repository
-package_name = "vqasynth"  # Replace with the actual package name if different
-git_repo_url = "git+https://github.com/remyxai/VQASynth.git"
-
-try:
-    __import__(package_name)
-except ImportError:
-    print(f"{package_name} not found. Installing from {git_repo_url}...")
-    subprocess.check_call([sys.executable, "-m", "pip", "install", git_repo_url])
-
-import os
-import uuid
-import tempfile
-
-import cv2
-import open3d as o3d
-import PIL
-from PIL import Image
-
-from vqasynth.depth import DepthEstimator
-from vqasynth.localize import Localizer
-from vqasynth.scene_fusion import SpatialSceneConstructor
-from vqasynth.prompts import PromptGenerator
-
-import numpy as np
 import gradio as gr

-try:
-    nlp = spacy.load("en_core_web_sm")
-except OSError:
-    # Download the model if it's not already available
-    from spacy.cli import download
-
-    download("en_core_web_sm")
-    nlp = spacy.load("en_core_web_sm")
-
-depth = DepthEstimator(from_onnx=False)
-localizer = Localizer()
-spatial_scene_constructor = SpatialSceneConstructor()
-prompt_generator = PromptGenerator()
-
-    )
     """
-
-    Args:
-        pointcloud_ply_files (list): List of file paths to `.pcd` files representing segmented point clouds.
-        captions (list): List of captions corresponding to the segmented point clouds.
-        prompts (list): List of prompts containing questions and answers about the captions.
-        cache_dir (str): Directory to save the final `.ply` and `.obj` files.
-
-    Returns:
-        tuple: The path to the generated `.obj` file and the identified prompt text.
     """
-            if i != j:
-                for prompt in prompts:
-                    if caption1 in prompt and caption2 in prompt:
-                        selected_prompt = prompt
-                        selected_indices = (i, j)
-                        break
-                if selected_prompt:
-                    break
-        if selected_prompt:
-            break
-
-    if not selected_prompt or not selected_indices:
-        raise ValueError("No prompt found containing two captions.")
-
-    idx1, idx2 = selected_indices
-    pointcloud_files = [pointcloud_ply_files[idx1], pointcloud_ply_files[idx2]]
-    captions = [captions[idx1], captions[idx2]]
-
-    obj_file = os.path.join(cache_dir, f"combined_output_{uuid_out}.obj")
-
-    return
-
-    )
-    prompts = prompt_generator.run(captions, pointcloud_data, cannonicalized)
-    obj_file, selected_prompt = combine_segmented_pointclouds(
-        pointcloud_data, captions, prompts, cache_dir
-    )
-    return obj_file, selected_prompt
-
-    temp_dir = tempfile.mkdtemp()
-    image = Image.open(image).convert("RGB")
-    obj_file, prompt = run_vqasynth_pipeline(image, temp_dir)
-    return obj_file, prompt

 def build_demo():
     with gr.Blocks() as demo:
-        gr.Markdown(
         )
         )
         with gr.Row():
-            generate_button = gr.Button("Generate")
         )
         gr.Examples(
             examples=[
-                ["./examples/road-through-dense-forest.jpg"],
             ],
-            inputs=
-            label="Example Images",
-            examples_per_page=5,
-        )
-
-        gr.Markdown(
-            """
-            ## Citation
-            ```
-            @article{chen2024spatialvlm,
-              title = {SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning Capabilities},
-              author = {Chen, Boyuan and Xu, Zhuo and Kirmani, Sean and Ichter, Brian and Driess, Danny and Florence, Pete and Sadigh, Dorsa and Guibas, Leonidas and Xia, Fei},
-              journal = {arXiv preprint arXiv:2401.12168},
-              year = {2024},
-              url = {https://arxiv.org/abs/2401.12168},
-            }
-            ```
-            """
         )

     return demo

 if __name__ == "__main__":
     demo = build_demo()
     demo.launch(share=True)
 import spaces
+import torch
+import time
 import gradio as gr
+from PIL import Image
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from typing import List

+MODEL_ID = "remyxai/SpaceThinker-Qwen2.5VL-3B"

+@spaces.GPU
+def load_model():
+    print("Loading model and processor...")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+    ).to(device)
+    processor = AutoProcessor.from_pretrained(MODEL_ID)
+    return model, processor
+
+model, processor = load_model()
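The new app.py drops the VQASynth pipeline and wraps the SpaceThinker checkpoint in a chat UI; the model is loaded eagerly at import time under `@spaces.GPU`. For reference, a minimal standalone sketch of the same load path outside a Space (not part of the commit; assumes the `remyxai/SpaceThinker-Qwen2.5VL-3B` checkpoint is downloadable and fits in local memory):

```python
# Standalone sketch of the loading pattern above (illustrative, not from the commit).
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

MODEL_ID = "remyxai/SpaceThinker-Qwen2.5VL-3B"
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype=dtype)
processor = AutoProcessor.from_pretrained(MODEL_ID)
print(type(model).__name__, next(model.parameters()).dtype)  # quick sanity check
```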
+
+def process_image(image_path_or_obj):
+    """Loads, resizes, and preprocesses an image path or Pillow Image."""
+    if isinstance(image_path_or_obj, str):
+        # Path on disk or from history
+        image = Image.open(image_path_or_obj).convert("RGB")
+    elif isinstance(image_path_or_obj, Image.Image):
+        image = image_path_or_obj.convert("RGB")
+    else:
+        raise ValueError("process_image expects a file path (str) or PIL.Image")
+
+    max_width = 512
+    if image.width > max_width:
+        aspect_ratio = image.height / image.width
+        new_height = int(max_width * aspect_ratio)
+        image = image.resize((max_width, new_height), Image.Resampling.LANCZOS)
+        print(f"Resized image to: {max_width}x{new_height}")
+    return image
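`process_image` caps width at 512 px and scales height to preserve aspect ratio; smaller images pass through untouched. A quick illustration with synthetic Pillow images (illustrative only):

```python
# Synthetic images exercising the resize rule in process_image.
from PIL import Image

big = process_image(Image.new("RGB", (1024, 768)))
assert big.size == (512, 384)   # 1024x768 scaled down, aspect ratio kept
small = process_image(Image.new("RGB", (400, 300)))
assert small.size == (400, 300)  # under the 512 px cap, left unchanged
```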
+
+def get_latest_image(history):
     """
+    Look from the end to find the last user-uploaded image (stored as (file_path,) ).
+    Return None if not found.
     """
+    for user_msg, _assistant_msg in reversed(history):
+        if isinstance(user_msg, tuple) and len(user_msg) > 0:
+            return user_msg[0]
+    return None
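Chat history rows are `[user_part, assistant_part]` pairs, and an uploaded image is stored as a one-element tuple holding its file path, so `get_latest_image` just scans backwards for the last tuple. A toy example (file names are placeholders):

```python
# Toy history showing the backwards scan; file names are placeholders.
history = [
    [("cat.jpg",), None],                # image upload
    ["How far apart are they?", "..."],  # text turn with a reply
    [("dog.jpg",), None],                # a later image upload
]
assert get_latest_image(history) == "dog.jpg"
assert get_latest_image([["hi", None]]) is None  # no image in history
```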

+def only_assistant_text(full_text: str) -> str:
+    """
+    Helper to strip out any lines containing 'system', 'user', etc.,
+    and return only the final assistant answer.
+    Adjust this parsing if your model's output format differs.
+    """
+    # Example output might look like:
+    # system
+    # ...
+    # user
+    # ...
+    # assistant
+    # The final answer
+    #
+    # We'll just split on 'assistant' and return everything after it.
+    if "assistant" in full_text:
+        parts = full_text.split("assistant", 1)
+        result = parts[-1].strip()
+        # Remove any leading punctuation (like a colon)
+        result = result.lstrip(":").strip()
+        return result
+    return full_text.strip()
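The parser splits the decoded transcript on the first literal occurrence of `assistant` and strips a leading colon. A made-up transcript showing what it extracts:

```python
# Made-up transcript; only the text after the "assistant" marker survives.
decoded = (
    "system\nYou are VL-Thinking...\n"
    "user\nHow tall is he?\n"
    "assistant: <think>...</think> <answer>about 6 ft</answer>"
)
print(only_assistant_text(decoded))
# -> "<think>...</think> <answer>about 6 ft</answer>"
```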
+
+def run_inference(image, prompt):
+    """Runs Qwen2.5-VL inference on a single image and text prompt."""
+    system_msg = (
+        "You are VL-Thinking ๐ค, a helpful assistant with excellent reasoning ability. "
+        "You should first think about the reasoning process and then provide the answer. "
+        "Use <think>...</think> and <answer>...</answer> tags."
+    )
+    conversation = [
+        {
+            "role": "system",
+            "content": [{"type": "text", "text": system_msg}],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt},
+            ],
+        },
+    ]
+    text_input = processor.apply_chat_template(
+        conversation, tokenize=False, add_generation_prompt=True
+    )

+    inputs = processor(text=[text_input], images=[image], return_tensors="pt").to(model.device)
+    generated_ids = model.generate(**inputs, max_new_tokens=1024)
+    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    # Parse out only the final assistant text
+    return only_assistant_text(output_text)
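End to end, the request path is `process_image` → `run_inference` → `only_assistant_text`. A minimal direct invocation sketch (assumes the model and processor are loaded; the example image ships with the Space):

```python
# Direct invocation sketch, bypassing the Gradio UI (illustrative).
img = process_image("./examples/warehouse_rgb.jpg")
reply = run_inference(img, "Give me the height of the man in the red hat in feet.")
print(reply)  # expected to contain <think>...</think> and <answer>...</answer> tags
```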

+def add_message(history, user_input):
+    """
+    Step 1 (triggered by user's 'Submit' or 'Send'):
+      - Save new text or images into `history`.
+      - The Chatbot display uses pairs: [user_text_or_image, assistant_reply].
+    """
+    if not isinstance(history, list):
+        history = []

+    files = user_input.get("files", [])
+    text = user_input.get("text", "")

+    # Store images
+    for f in files:
+        # Each image is stored as `[(file_path,), None]`
+        history.append([(f,), None])

+    # Store text
+    if text:
+        history.append([text, None])

+    return history, gr.MultimodalTextbox(value=None)
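`gr.MultimodalTextbox` submits a dict with `text` and `files` keys, which `add_message` fans out into separate history rows. For example (placeholder path):

```python
# The textbox value is a dict; images and text become separate history rows.
history, _cleared = add_message(
    [], {"text": "How tall is the pallet?", "files": ["/tmp/photo.jpg"]}
)
assert history == [[("/tmp/photo.jpg",), None], ["How tall is the pallet?", None]]
```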

+def inference_interface(history):
+    """
+    Step 2: Use the most recent text + the most recent image to run Qwen2.5-VL.
+    Instead of adding another entry, we fill the assistant's answer into
+    the last user text entry.
+    """
+    if not history:
+        return history, gr.MultimodalTextbox(value=None)
+
+    # 1) Get the user's most recent text
+    user_text = ""
+    # We'll search from the end for the first str we find
+    for idx in range(len(history) - 1, -1, -1):
+        user_msg, assistant_msg = history[idx]
+        if isinstance(user_msg, str):
+            user_text = user_msg
+            # We'll also keep track of this index so we can fill in the assistant reply
+            user_idx = idx
+            break
+    else:
+        # No user text found
+        print("No user text found in history. Skipping inference.")
+        return history, gr.MultimodalTextbox(value=None)

+    # 2) Get the latest image from the entire conversation
+    latest_image = get_latest_image(history)
+    if not latest_image:
+        # No image found => can't run the model
+        print("No image found in history. Skipping inference.")
+        return history, gr.MultimodalTextbox(value=None)

+    # 3) Process the image
+    pil_image = process_image(latest_image)

+    # 4) Run inference
+    assistant_reply = run_inference(pil_image, user_text)

+    # 5) Fill that assistant reply back into the last user text entry
+    history[user_idx][1] = assistant_reply
+    return history, gr.MultimodalTextbox(value=None)
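Note the `for ... else`: the `else` branch runs only when the loop finishes without `break`, i.e. when the history holds no text turn. A tiny illustration of the idiom:

```python
# for/else as used above: else fires only if the loop never breaks.
history = [[("a.jpg",), None]]  # image-only history, no text turn
for idx in range(len(history) - 1, -1, -1):
    if isinstance(history[idx][0], str):
        break
else:
    print("No user text found in history. Skipping inference.")  # this runs
```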

 def build_demo():
     with gr.Blocks() as demo:
+        gr.Markdown("# SpaceThinker-Qwen2.5VL-3B Image Prompt Chatbot")
+
+        chatbot = gr.Chatbot([], line_breaks=True)
+        chat_input = gr.MultimodalTextbox(
+            interactive=True,
+            file_types=["image"],
+            placeholder="Enter text and upload an image.",
+            show_label=True
         )

+        # When the user presses Enter in the MultimodalTextbox:
+        submit_event = chat_input.submit(
+            fn=add_message,  # Step 1: store user data
+            inputs=[chatbot, chat_input],
+            outputs=[chatbot, chat_input]
+        )
+        # After storing, run inference
+        submit_event.then(
+            fn=inference_interface,  # Step 2: run Qwen2.5-VL
+            inputs=[chatbot],
+            outputs=[chatbot, chat_input]
         )

+        # Same logic for a "Send" button
         with gr.Row():
+            send_button = gr.Button("Send")
+            clear_button = gr.ClearButton([chatbot, chat_input])

+        send_click = send_button.click(
+            fn=add_message,
+            inputs=[chatbot, chat_input],
+            outputs=[chatbot, chat_input]
+        )
+        send_click.then(
+            fn=inference_interface,
+            inputs=[chatbot],
+            outputs=[chatbot, chat_input]
         )

+        # Example
         gr.Examples(
             examples=[
+                {
+                    "text": "Give me the height of the man in the red hat in feet.",
+                    "files": ["./examples/warehouse_rgb.jpg"]
+                }
             ],
+            inputs=[chat_input],
         )

     return demo

 if __name__ == "__main__":
     demo = build_demo()
     demo.launch(share=True)
+
checkpoints/depth_pro.pt
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3eb35ca68168ad3d14cb150f8947a4edf85589941661fdb2686259c80685c0ce
-size 1904446787
examples/bee_and_flower.jpg
DELETED
Binary file (18.2 kB)

examples/gears.png
DELETED
Binary file (525 kB)

examples/road-through-dense-forest.jpg
DELETED
Binary file (292 kB)

examples/spooky_doggy.png
DELETED
Binary file (892 kB)
requirements.txt
CHANGED
@@ -1,36 +1,9 @@
-accelerate==0.34.2
-numpy==1.26.4
-timm==1.0.9
-einops==0.7.0
-open3d==0.18.0
-opencv-python==4.7.0.72
-tqdm==4.66.3
-torchprofile==0.0.4
-matplotlib==3.6.2
-huggingface-hub==0.25.1
-onnx==1.13.1
-onnxruntime==1.14.1
-onnxsim==0.4.35
-scipy==1.12.0
-litellm==1.25.2
-pycocotools==2.0.6
-datasets==3.1.0
-spacy==3.7.5
-pandas==2.2.3
-html5lib==1.1
-spaces==0.30.4
-
-#git+https://github.com/remyxai/VQASynth.git
-git+https://github.com/apple/ml-depth-pro.git
-git+https://github.com/facebookresearch/sam2.git
-git+https://github.com/openai/CLIP.git
-flash-attn @ https://remyx.ai/assets/spatialvlm/flash_attn-2.7.0.post2-cp310-cp310-linux_x86_64.whl
+torch
+transformers>=4.41.0
+Pillow
+gradio==5.15.0
+spaces
+multiprocess
+requests
+accelerate>=0.26.0
+git+https://github.com/huggingface/transformers.git