Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -8,7 +8,6 @@ from threading import Thread
 import base64
 from io import BytesIO
 import re
-from typing import Literal
 
 import gradio as gr
 import spaces
@@ -23,6 +22,7 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForCausalLM,
+    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -71,15 +71,15 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
 # Load moondream2
 MODEL_ID_MD = "vikhyatk/moondream2"
+tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD)
 model_md = AutoModelForCausalLM.from_pretrained(
     MODEL_ID_MD,
     revision="2025-06-21",
     trust_remote_code=True,
-    torch_dtype=torch.float16
+    torch_dtype=torch.float16
 ).to(device).eval()
 
-
-# Helper functions for object detection and drawing
+# Helper functions for object detection
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
     buffered = BytesIO()
@@ -95,14 +95,6 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
-def draw_points(image, points, color="lime", radius=10):
-    """Draw points (circles) on an image."""
-    draw = ImageDraw.Draw(image)
-    for point in points:
-        x, y = point
-        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color, outline=color)
-    return image
-
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
     """Rescale bounding boxes from normalized (1000x1000) to original image dimensions."""
     x_scale = original_width / scaled_width
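
The rescaling helper above maps boxes from the normalized 1000x1000 grid described in its docstring back to pixel coordinates, so the operation is plain proportional scaling. A quick illustrative sketch of that arithmetic (the image size and box values below are made up, not taken from the app):

    # Hypothetical values, for illustration only.
    original_width, original_height = 2000, 1500
    x_scale = original_width / 1000      # 2.0
    y_scale = original_height / 1000     # 1.5
    box = [100, 200, 300, 400]           # normalized [xmin, ymin, xmax, ymax]
    rescaled = [box[0] * x_scale, box[1] * y_scale,
                box[2] * x_scale, box[3] * y_scale]
    # -> [200.0, 300.0, 600.0, 600.0] in pixel coordinates
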
@@ -127,11 +119,11 @@ default_system_prompt = (
     "of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
 )
 
-# Function for
+# Function for object detection
 @spaces.GPU
 def run_example(image, text_input, system_prompt):
     """Detect objects in an image and return bounding box annotations."""
-    model = model_x
+    model = model_x
     processor = processor_x
 
     messages = [
@@ -172,41 +164,6 @@ def run_example(image, text_input, system_prompt):
     annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
     return output_text[0], str(parsed_boxes), annotated_image
 
-# Function for Moondream object pointing/detection
-@spaces.GPU
-def run_moondream(image: Image.Image, prompt: str, mode: Literal["point", "object_detection"]):
-    """
-    Open Vocabulary Detection/Pointing using moondream2.
-    """
-    if image is None:
-        return "Please upload an image.", None
-
-    original_width, original_height = image.size
-    annotated_image = image.copy()
-    json_output = {}
-
-    if mode == "point":
-        # FIX: Changed 'im' to 'image'
-        result = model_md.point(image=image, prompt=prompt)
-        points = result.get("points", [])
-        json_output = result
-        if points:
-            rescaled_points = [[p[0] * original_width, p[1] * original_height] for p in points]
-            annotated_image = draw_points(annotated_image, rescaled_points)
-
-    elif mode == "object_detection":
-        # FIX: Changed 'im' to 'image'
-        result = model_md.detect(image=image, prompt=prompt)
-        boxes = result.get("objects", [])
-        json_output = result
-        if boxes:
-            rescaled_boxes = [[b[0] * original_width, b[1] * original_height, b[2] * original_width, b[3] * original_height] for b in boxes]
-            annotated_image = draw_bounding_boxes(annotated_image, rescaled_boxes, outline_color="lime", line_width=3)
-    else:
-        return "Invalid mode selected.", None
-
-    return json_output, annotated_image
-
 def downsample_video(video_path):
     """
     Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
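
Per its docstring, downsample_video returns evenly spaced frames as (PIL image, timestamp) pairs, which is how the new moondream2 video branch below unpacks them. A minimal usage sketch under that assumption (the video path is borrowed from the examples further down):

    # Sketch only: iterate the (frame, timestamp) pairs produced by downsample_video.
    frames = downsample_video("videos/2.mp4")
    for frame_image, timestamp in frames:
        print(timestamp, frame_image.size)   # each frame is a PIL.Image
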
@@ -249,6 +206,25 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
+    elif model_name == "moondream2":
+        model = model_md
+        tokenizer = tokenizer_md
+        image_embeds = model.encode_image(image)
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        thread = Thread(target=model.answer_question, kwargs={
+            "image_embeds": image_embeds,
+            "question": text,
+            "tokenizer": tokenizer,
+            "max_new_tokens": max_new_tokens,
+            "streamer": streamer,
+        })
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer, buffer
+        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
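
Because generate_image is a generator that yields the growing buffer twice (once for the raw textbox, once for the Markdown view), the new moondream2 branch can also be smoke-tested outside the UI by draining the generator. A hedged sketch, passing arguments positionally in the same order as the click handler at the bottom of the diff; the prompt, image path, and sampling values are placeholders:

    from PIL import Image

    # Placeholder arguments: model name, prompt, image, then
    # max_new_tokens, temperature, top_p, top_k, repetition_penalty.
    stream = generate_image(
        "moondream2",
        "Describe this image.",
        Image.open("images/11.png"),
        512, 0.6, 0.9, 50, 1.2,
    )
    final_text = ""
    for raw_text, markdown_text in stream:
        final_text = raw_text
    print(final_text)
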
@@ -305,6 +281,31 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
+    elif model_name == "moondream2":
+        model = model_md
+        tokenizer = tokenizer_md
+        frames = downsample_video(video_path)
+        buffer = ""
+        for frame in frames:
+            image, timestamp = frame
+            image_embeds = model.encode_image(image)
+            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+            thread = Thread(target=model.answer_question, kwargs={
+                "image_embeds": image_embeds,
+                "question": text,
+                "tokenizer": tokenizer,
+                "max_new_tokens": max_new_tokens,
+                "streamer": streamer,
+            })
+            thread.start()
+            frame_buffer = f"Frame {timestamp}:\n"
+            for new_text in streamer:
+                frame_buffer += new_text
+                buffer += new_text
+                time.sleep(0.01)
+                yield buffer, buffer
+            buffer += "\n\n"
+        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -351,11 +352,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
 
-# Define examples
+# Define examples for image and video inference
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
-    ["explain the movie shot in detail.", "images/3.png"],
+    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 
@@ -364,8 +365,9 @@ video_examples = [
     ["explain the video in detail.", "videos/2.mp4"]
 ]
 
+# Define examples for object detection
 object_detection_examples = [
-    ["Detect Spider-Man T-shirt.", "images/22.png"],
+    ["Detect Spider-Man T-shirt.", "images/22.png"],
     ["Detect Green Car.", "images/11.png"]
 ]
 
@@ -428,25 +430,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     inputs=[input_img, text_input, system_prompt],
                     outputs=[model_output_text, parsed_boxes, annotated_image]
                 )
-            # NEW MOONDREAM TAB
-            with gr.TabItem("moondream-vision"):
-                gr.Markdown("## Moondream Vision: Object Pointing & Detection")
-                with gr.Row():
-                    with gr.Column():
-                        moondream_input_img = gr.Image(label="Input Image", type="pil")
-                        moondream_text_input = gr.Textbox(label="Object to Detect", placeholder="e.g., A red car")
-                        moondream_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="object_detection")
-                        moondream_submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
-                    with gr.Column():
-                        moondream_json_output = gr.JSON(label="Output JSON")
-                        moondream_annotated_image = gr.Image(label="Detection Result")
-
-            moondream_submit_btn.click(
-                fn=run_moondream,
-                inputs=[moondream_input_img, moondream_text_input, moondream_mode],
-                outputs=[moondream_json_output, moondream_annotated_image]
-            )
-
 
         with gr.Accordion("Advanced options", open=False):
             max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -460,22 +443,23 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             gr.Markdown("## Result.Md")
             output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
 
-            with gr.Accordion("Formatted Result (Result.Md)", open=False):
+            with gr.Accordion("Formatted Result (Result.Md)", open=False):
                 markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
             model_choice = gr.Radio(
-                choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B"],
+                choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B", "moondream2"],
                 label="Select Model",
                 value="Camel-Doc-OCR-062825"
             )
 
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs-v2-Localization/discussions)")
             gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) : camel-doc-ocr-062825 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
-            gr.Markdown("> [OCRFlux-3B](https://
+            gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
             gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
             gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-            gr.Markdown("
-
+            gr.Markdown("> [moondream2](https://huggingface.co/vikhyatk/moondream2) : A small vision language model that can be run on edge devices. Capable of captioning, visual querying, object detection, and more.")
+            gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+
     image_submit.click(
         fn=generate_image,
        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],