Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -8,6 +8,7 @@ from threading import Thread
 import base64
 from io import BytesIO
 import re
+from typing import Literal
 
 import gradio as gr
 import spaces
@@ -22,7 +23,6 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForCausalLM,
-    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -69,17 +69,17 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load
-
-
-
-
-
-
-
-)
+# Load moondream2
+MODEL_ID_MD = "vikhyatk/moondream2"
+model_md = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_MD,
+    revision="2025-06-21",
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+).to(device).eval()
 
-
+
+# Helper functions for object detection and drawing
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
     buffered = BytesIO()
@@ -95,12 +95,12 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
-def draw_points(image, points, color="
-    """Draw points on an image."""
+def draw_points(image, points, color="lime", radius=10):
+    """Draw points (circles) on an image."""
     draw = ImageDraw.Draw(image)
     for point in points:
         x, y = point
-        draw.ellipse(
+        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color, outline=color)
     return image
 
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
@@ -127,11 +127,11 @@ default_system_prompt = (
     "of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
 )
 
-# Function for object detection
+# Function for ViLaSR object detection
 @spaces.GPU
 def run_example(image, text_input, system_prompt):
     """Detect objects in an image and return bounding box annotations."""
-    model = model_x
+    model = model_x
     processor = processor_x
 
     messages = [
@@ -172,6 +172,39 @@ def run_example(image, text_input, system_prompt):
     annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
     return output_text[0], str(parsed_boxes), annotated_image
 
+# Function for Moondream object pointing/detection
+@spaces.GPU
+def run_moondream(image: Image.Image, prompt: str, mode: Literal["point", "object_detection"]):
+    """
+    Open Vocabulary Detection/Pointing using moondream2.
+    """
+    if image is None:
+        return "Please upload an image.", None
+
+    original_width, original_height = image.size
+    annotated_image = image.copy()
+    json_output = {}
+
+    if mode == "point":
+        result = model_md.point(im=image, prompt=prompt)
+        points = result.get("points", [])
+        json_output = result
+        if points:
+            rescaled_points = [[p[0] * original_width, p[1] * original_height] for p in points]
+            annotated_image = draw_points(annotated_image, rescaled_points)
+
+    elif mode == "object_detection":
+        result = model_md.detect(im=image, prompt=prompt)
+        boxes = result.get("objects", [])
+        json_output = result
+        if boxes:
+            rescaled_boxes = [[b[0] * original_width, b[1] * original_height, b[2] * original_width, b[3] * original_height] for b in boxes]
+            annotated_image = draw_bounding_boxes(annotated_image, rescaled_boxes, outline_color="lime", line_width=3)
+    else:
+        return "Invalid mode selected.", None
+
+    return json_output, annotated_image
+
 def downsample_video(video_path):
     """
     Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
@@ -316,34 +349,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
 
-#
-@spaces.GPU
-def detect_moondream(im: Image.Image, object_name: str, mode: str):
-    """
-    Open Vocabulary Detection using moondream2
-    Args:
-        im: Pillow Image
-        object_name: the object you would like to detect
-        mode: point or object_detection
-    Returns:
-        list: a list of bounding boxes (xyxy) or points (xy) coordinates that are normalized
-        annotated_image: Image with detections drawn
-    """
-    model = load_moondream_model()
-    if mode == "point":
-        points = model.point(im, object_name)["points"]
-        annotated_image = draw_points(im.copy(), points)
-        return points, annotated_image
-    elif mode == "object_detection":
-        boxes = model.detect(im, object_name)["objects"]
-        annotated_image = draw_bounding_boxes(im.copy(), boxes)
-        return boxes, annotated_image
-
-# Define examples for image and video inference
+# Define examples
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
-    ["explain the movie shot in detail.", "images/3.png"],
+    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 
@@ -352,18 +362,11 @@ video_examples = [
     ["explain the video in detail.", "videos/2.mp4"]
 ]
 
-# Define examples for object detection
 object_detection_examples = [
     ["Detect Spider-Man T-shirt.", "images/22.png"],
    ["Detect Green Car.", "images/11.png"]
 ]
 
-# Define examples for Moondream Vision
-moondream_examples = [
-    ["Spider-Man T-shirt", "images/22.png", "point"],
-    ["Green Car", "images/11.png", "object_detection"]
-]
-
 # Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
@@ -423,27 +426,26 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 inputs=[input_img, text_input, system_prompt],
                 outputs=[model_output_text, parsed_boxes, annotated_image]
             )
-
+        # NEW MOONDREAM TAB
+        with gr.TabItem("moondream-vision"):
+            gr.Markdown("## Moondream Vision: Object Pointing & Detection")
             with gr.Row():
                 with gr.Column():
-
-
-
-
-                    gr.Examples(
-                        examples=moondream_examples,
-                        inputs=[moon_object, moon_image, moon_mode]
-                    )
+                    moondream_input_img = gr.Image(label="Input Image", type="pil")
+                    moondream_text_input = gr.Textbox(label="Object to Detect", placeholder="e.g., A red car")
+                    moondream_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="object_detection")
+                    moondream_submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
                 with gr.Column():
-
-
+                    moondream_json_output = gr.JSON(label="Output JSON")
+                    moondream_annotated_image = gr.Image(label="Detection Result")
 
-
-                fn=
-                inputs=[
-                outputs=[
+            moondream_submit_btn.click(
+                fn=run_moondream,
+                inputs=[moondream_input_img, moondream_text_input, moondream_mode],
+                outputs=[moondream_json_output, moondream_annotated_image]
            )
 
+
     with gr.Accordion("Advanced options", open=False):
         max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
         temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -456,7 +458,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         gr.Markdown("## Result.Md")
         output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
 
-        with gr.Accordion("Formatted Result (Result.Md)", open=False):
+        with gr.Accordion("Formatted Result (Result.Md)", open=False):
             markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
         model_choice = gr.Radio(
@@ -470,8 +472,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
         gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
         gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-        gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
+        gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
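For reference, below is a minimal standalone sketch of the moondream2 path this commit adds, mirroring the pinned from_pretrained() call and the point()/detect() invocations used by the new run_moondream() handler. The image path and prompt are placeholders and a CUDA device is assumed; the commented return shapes are inferred only from how run_moondream() consumes the results, so verify them against the pinned revision before reusing the index-based rescaling.

# Minimal sketch of the moondream2 helpers added in this commit.
# Assumptions: CUDA is available; "sample.png" is a placeholder image path.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM

model_md = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    revision="2025-06-21",        # same pinned revision as the commit
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to("cuda").eval()

image = Image.open("sample.png")  # placeholder input image

# run_moondream() calls these two methods, then rescales the normalized
# coordinates to pixel space before drawing points or boxes.
print(model_md.point(im=image, prompt="red car"))   # expected: {"points": [...]}
print(model_md.detect(im=image, prompt="red car"))  # expected: {"objects": [...]}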
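Likewise, a hypothetical smoke test for the new run_moondream() handler itself, independent of the Gradio UI. It assumes the Space's app.py globals (run_moondream, model_md, and the drawing helpers) are already loaded in the session, and it reuses an image and prompt from object_detection_examples.

# Hypothetical direct call to the new handler, outside the Gradio event loop.
# Assumes run_moondream() and the moondream2 model from app.py are in scope.
from PIL import Image

image = Image.open("images/11.png")   # example image bundled with the Space
json_out, annotated = run_moondream(image, "Green Car", "object_detection")

print(json_out)                       # raw moondream2 response passed to gr.JSON
if annotated is not None:
    annotated.save("annotated.png")   # copy of the image with lime boxes drawn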