Update app.py
app.py CHANGED
@@ -21,6 +21,8 @@ from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
+    AutoModelForCausalLM,
+    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -67,6 +69,16 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
+# Load Moondream2
+@spaces.GPU
+def load_moondream_model():
+    return AutoModelForCausalLM.from_pretrained(
+        "vikhyatk/moondream2",
+        revision="2025-04-14",
+        trust_remote_code=True,
+        device_map={"": "cuda"},
+    )
+
 # Helper functions for object detection
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
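Review note: load_moondream_model() runs from_pretrained on every call, and detect_moondream (further down) invokes it per request. On persistent-GPU hardware the loader could be memoized; on ZeroGPU the device is detached between calls, so caching a CUDA-placed model may not be safe. A minimal sketch assuming persistent hardware; the cached variant is an editorial suggestion, not part of the commit:

from functools import lru_cache

@lru_cache(maxsize=1)  # hold one loaded model across requests
def load_moondream_model_cached():
    # Same arguments as the committed loader; only the caching is new.
    return AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2",
        revision="2025-04-14",
        trust_remote_code=True,
        device_map={"": "cuda"},
    )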
@@ -83,6 +95,14 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
     draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
+def draw_points(image, points, color="red", radius=5):
+    """Draw points on an image."""
+    draw = ImageDraw.Draw(image)
+    for point in points:
+        x, y = point
+        draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color)
+    return image
+
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
     """Rescale bounding boxes from normalized (1000x1000) to original image dimensions."""
     x_scale = original_width / scaled_width
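Review note: draw_points unpacks each point as a pixel-space (x, y) tuple, while the detect_moondream docstring below says moondream2 returns normalized coordinates (recent revisions return dicts such as {"x": 0.42, "y": 0.17}). If that holds for the pinned revision, a conversion step would be needed before drawing. points_to_pixels is a hypothetical helper, not part of the commit:

def points_to_pixels(points, width, height):
    """Convert normalized moondream-style point dicts to pixel (x, y) tuples."""
    return [(p["x"] * width, p["y"] * height) for p in points]

# Usage: draw_points(im.copy(), points_to_pixels(points, *im.size))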
@@ -296,6 +316,29 @@ def generate_video(model_name: str, text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
 
+# Moondream Vision Detection Function
+@spaces.GPU
+def detect_moondream(im: Image.Image, object_name: str, mode: str):
+    """
+    Open Vocabulary Detection using moondream2
+    Args:
+        im: Pillow Image
+        object_name: the object you would like to detect
+        mode: point or object_detection
+    Returns:
+        list: a list of bounding boxes (xyxy) or points (xy) coordinates that are normalized
+        annotated_image: Image with detections drawn
+    """
+    model = load_moondream_model()
+    if mode == "point":
+        points = model.point(im, object_name)["points"]
+        annotated_image = draw_points(im.copy(), points)
+        return points, annotated_image
+    elif mode == "object_detection":
+        boxes = model.detect(im, object_name)["objects"]
+        annotated_image = draw_bounding_boxes(im.copy(), boxes)
+        return boxes, annotated_image
+
 # Define examples for image and video inference
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
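For reference, the new function can be exercised outside the UI; a sketch using one of the example assets shipped with the Space (the path is taken from the examples list below):

from PIL import Image

im = Image.open("images/11.png")  # example asset from the repo
boxes, annotated = detect_moondream(im, "Green Car", mode="object_detection")
print(boxes)          # raw detections returned by moondream2
annotated.save("annotated.png")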
@@ -315,6 +358,12 @@ object_detection_examples = [
     ["Detect Green Car.", "images/11.png"]
 ]
 
+# Define examples for Moondream Vision
+moondream_examples = [
+    ["Spider-Man T-shirt", "images/22.png", "point"],
+    ["Green Car", "images/11.png", "object_detection"]
+]
+
 # Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
@@ -374,6 +423,26 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     inputs=[input_img, text_input, system_prompt],
                     outputs=[model_output_text, parsed_boxes, annotated_image]
                 )
+            with gr.TabItem("Moondream Vision"):
+                with gr.Row():
+                    with gr.Column():
+                        moon_image = gr.Image(label="Input Image", type="pil")
+                        moon_object = gr.Textbox(label="Object to Detect", placeholder="e.g., Spider-Man T-shirt")
+                        moon_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="point")
+                        moon_submit = gr.Button("Detect", elem_classes="submit-btn")
+                        gr.Examples(
+                            examples=moondream_examples,
+                            inputs=[moon_object, moon_image, moon_mode]
+                        )
+                    with gr.Column():
+                        moon_output_json = gr.JSON(label="Detection Results")
+                        moon_annotated_image = gr.Image(label="Annotated Image")
+
+                moon_submit.click(
+                    fn=detect_moondream,
+                    inputs=[moon_image, moon_object, moon_mode],
+                    outputs=[moon_output_json, moon_annotated_image]
+                )
 
         with gr.Accordion("Advanced options", open=False):
             max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
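Two small review notes on the wiring: gr.Examples uses its own input order ([moon_object, moon_image, moon_mode]), which matches the example rows even though the click handler takes [moon_image, moon_object, moon_mode]; and detect_moondream returns None for any mode outside its two branches, which would surface as an empty JSON panel. The Dropdown constrains UI choices, so this mainly matters for API callers. A guard sketch (editorial, not part of the commit):

def detect_moondream_checked(im, object_name, mode):
    """Wrapper that surfaces an unsupported mode as a visible Gradio error."""
    if mode not in ("point", "object_detection"):
        raise gr.Error(f"Unsupported mode: {mode!r}")
    return detect_moondream(im, object_name, mode)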