Update app.py
app.py CHANGED
@@ -21,6 +21,8 @@ from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
+    AutoModelForCausalLM,
+    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -67,6 +69,16 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
+# Load Moondream2
+@spaces.GPU
+def load_moondream_model():
+    return AutoModelForCausalLM.from_pretrained(
+        "vikhyatk/moondream2",
+        revision="2025-04-14",
+        trust_remote_code=True,
+        device_map={"": "cuda"},
+    )
+
 # Helper functions for object detection
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
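Review note: load_moondream_model() runs from_pretrained on every call, and detect_moondream (further down) invokes it per request. On persistent-GPU hardware the loader could be memoized; on ZeroGPU the device is detached between calls, so caching a CUDA-placed model may not be safe. A minimal sketch assuming persistent hardware; the cached variant is an editorial suggestion, not part of the commit:

from functools import lru_cache

@lru_cache(maxsize=1)  # hold one loaded model across requests
def load_moondream_model_cached():
    # Same arguments as the committed loader; only the caching is new.
    return AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2",
        revision="2025-04-14",
        trust_remote_code=True,
        device_map={"": "cuda"},
    )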
@@ -83,6 +95,14 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
     draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
+def draw_points(image, points, color="red", radius=5):
+    """Draw points on an image."""
+    draw = ImageDraw.Draw(image)
+    for point in points:
+        x, y = point
+        draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color)
+    return image
+
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
     """Rescale bounding boxes from normalized (1000x1000) to original image dimensions."""
     x_scale = original_width / scaled_width
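Review note: draw_points unpacks each point as a pixel-space (x, y) tuple, while the detect_moondream docstring below says moondream2 returns normalized coordinates (recent revisions return dicts such as {"x": 0.42, "y": 0.17}). If that holds for the pinned revision, a conversion step would be needed before drawing. points_to_pixels is a hypothetical helper, not part of the commit:

def points_to_pixels(points, width, height):
    """Convert normalized moondream-style point dicts to pixel (x, y) tuples."""
    return [(p["x"] * width, p["y"] * height) for p in points]

# Usage: draw_points(im.copy(), points_to_pixels(points, *im.size))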
@@ -296,6 +316,29 @@ def generate_video(model_name: str, text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
 
+# Moondream Vision Detection Function
+@spaces.GPU
+def detect_moondream(im: Image.Image, object_name: str, mode: str):
+    """
+    Open Vocabulary Detection using moondream2
+    Args:
+        im: Pillow Image
+        object_name: the object you would like to detect
+        mode: point or object_detection
+    Returns:
+        list: a list of bounding boxes (xyxy) or points (xy) coordinates that are normalized
+        annotated_image: Image with detections drawn
+    """
+    model = load_moondream_model()
+    if mode == "point":
+        points = model.point(im, object_name)["points"]
+        annotated_image = draw_points(im.copy(), points)
+        return points, annotated_image
+    elif mode == "object_detection":
+        boxes = model.detect(im, object_name)["objects"]
+        annotated_image = draw_bounding_boxes(im.copy(), boxes)
+        return boxes, annotated_image
+
 # Define examples for image and video inference
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
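For reference, the new function can be exercised outside the UI; a sketch using one of the example assets shipped with the Space (the path is taken from the examples list below):

from PIL import Image

im = Image.open("images/11.png")  # example asset from the repo
boxes, annotated = detect_moondream(im, "Green Car", mode="object_detection")
print(boxes)          # raw detections returned by moondream2
annotated.save("annotated.png")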
@@ -315,6 +358,12 @@ object_detection_examples = [
     ["Detect Green Car.", "images/11.png"]
 ]
 
+# Define examples for Moondream Vision
+moondream_examples = [
+    ["Spider-Man T-shirt", "images/22.png", "point"],
+    ["Green Car", "images/11.png", "object_detection"]
+]
+
 # Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
@@ -374,6 +423,26 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     inputs=[input_img, text_input, system_prompt],
                     outputs=[model_output_text, parsed_boxes, annotated_image]
                 )
+            with gr.TabItem("Moondream Vision"):
+                with gr.Row():
+                    with gr.Column():
+                        moon_image = gr.Image(label="Input Image", type="pil")
+                        moon_object = gr.Textbox(label="Object to Detect", placeholder="e.g., Spider-Man T-shirt")
+                        moon_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="point")
+                        moon_submit = gr.Button("Detect", elem_classes="submit-btn")
+                        gr.Examples(
+                            examples=moondream_examples,
+                            inputs=[moon_object, moon_image, moon_mode]
+                        )
+                    with gr.Column():
+                        moon_output_json = gr.JSON(label="Detection Results")
+                        moon_annotated_image = gr.Image(label="Annotated Image")
+
+                moon_submit.click(
+                    fn=detect_moondream,
+                    inputs=[moon_image, moon_object, moon_mode],
+                    outputs=[moon_output_json, moon_annotated_image]
+                )
 
         with gr.Accordion("Advanced options", open=False):
             max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
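Two small review notes on the wiring: gr.Examples uses its own input order ([moon_object, moon_image, moon_mode]), which matches the example rows even though the click handler takes [moon_image, moon_object, moon_mode]; and detect_moondream returns None for any mode outside its two branches, which would surface as an empty JSON panel. The Dropdown constrains UI choices, so this mainly matters for API callers. A guard sketch (editorial, not part of the commit):

def detect_moondream_checked(im, object_name, mode):
    """Wrapper that surfaces an unsupported mode as a visible Gradio error."""
    if mode not in ("point", "object_detection"):
        raise gr.Error(f"Unsupported mode: {mode!r}")
    return detect_moondream(im, object_name, mode)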