Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -8,7 +8,6 @@ from threading import Thread
 import base64
 from io import BytesIO
 import re
-from typing import Literal
 
 import gradio as gr
 import spaces
@@ -23,6 +22,7 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForCausalLM,
+    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -71,15 +71,15 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
 # Load moondream2
 MODEL_ID_MD = "vikhyatk/moondream2"
+tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD)
 model_md = AutoModelForCausalLM.from_pretrained(
     MODEL_ID_MD,
     revision="2025-06-21",
     trust_remote_code=True,
-    torch_dtype=torch.float16
+    torch_dtype=torch.float16
 ).to(device).eval()
 
-
-# Helper functions for object detection and drawing
+# Helper functions for object detection
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
     buffered = BytesIO()
@@ -95,14 +95,6 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
-def draw_points(image, points, color="lime", radius=10):
-    """Draw points (circles) on an image."""
-    draw = ImageDraw.Draw(image)
-    for point in points:
-        x, y = point
-        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color, outline=color)
-    return image
-
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
     """Rescale bounding boxes from normalized (1000x1000) to original image dimensions."""
     x_scale = original_width / scaled_width
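
The rescaling helper above maps boxes from the normalized 1000x1000 grid described in its docstring back to pixel coordinates, so the operation is plain proportional scaling. A quick illustrative sketch of that arithmetic (the image size and box values below are made up, not taken from the app):

    # Hypothetical values, for illustration only.
    original_width, original_height = 2000, 1500
    x_scale = original_width / 1000      # 2.0
    y_scale = original_height / 1000     # 1.5
    box = [100, 200, 300, 400]           # normalized [xmin, ymin, xmax, ymax]
    rescaled = [box[0] * x_scale, box[1] * y_scale,
                box[2] * x_scale, box[3] * y_scale]
    # -> [200.0, 300.0, 600.0, 600.0] in pixel coordinates
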
@@ -127,11 +119,11 @@ default_system_prompt = (
     "of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
 )
 
-# Function for
+# Function for object detection
 @spaces.GPU
 def run_example(image, text_input, system_prompt):
     """Detect objects in an image and return bounding box annotations."""
-    model = model_x
+    model = model_x
     processor = processor_x
 
     messages = [
@@ -172,41 +164,6 @@ def run_example(image, text_input, system_prompt):
     annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
     return output_text[0], str(parsed_boxes), annotated_image
 
-# Function for Moondream object pointing/detection
-@spaces.GPU
-def run_moondream(image: Image.Image, prompt: str, mode: Literal["point", "object_detection"]):
-    """
-    Open Vocabulary Detection/Pointing using moondream2.
-    """
-    if image is None:
-        return "Please upload an image.", None
-
-    original_width, original_height = image.size
-    annotated_image = image.copy()
-    json_output = {}
-
-    if mode == "point":
-        # FIX: Changed 'im' to 'image'
-        result = model_md.point(image=image, prompt=prompt)
-        points = result.get("points", [])
-        json_output = result
-        if points:
-            rescaled_points = [[p[0] * original_width, p[1] * original_height] for p in points]
-            annotated_image = draw_points(annotated_image, rescaled_points)
-
-    elif mode == "object_detection":
-        # FIX: Changed 'im' to 'image'
-        result = model_md.detect(image=image, prompt=prompt)
-        boxes = result.get("objects", [])
-        json_output = result
-        if boxes:
-            rescaled_boxes = [[b[0] * original_width, b[1] * original_height, b[2] * original_width, b[3] * original_height] for b in boxes]
-            annotated_image = draw_bounding_boxes(annotated_image, rescaled_boxes, outline_color="lime", line_width=3)
-    else:
-        return "Invalid mode selected.", None
-
-    return json_output, annotated_image
-
 def downsample_video(video_path):
     """
     Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
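
Per its docstring, downsample_video returns evenly spaced frames as (PIL image, timestamp) pairs, which is how the new moondream2 video branch below unpacks them. A minimal usage sketch under that assumption (the video path is borrowed from the examples further down):

    # Sketch only: iterate the (frame, timestamp) pairs produced by downsample_video.
    frames = downsample_video("videos/2.mp4")
    for frame_image, timestamp in frames:
        print(timestamp, frame_image.size)   # each frame is a PIL.Image
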
@@ -249,6 +206,25 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
+    elif model_name == "moondream2":
+        model = model_md
+        tokenizer = tokenizer_md
+        image_embeds = model.encode_image(image)
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        thread = Thread(target=model.answer_question, kwargs={
+            "image_embeds": image_embeds,
+            "question": text,
+            "tokenizer": tokenizer,
+            "max_new_tokens": max_new_tokens,
+            "streamer": streamer,
+        })
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer, buffer
+        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
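
Because generate_image is a generator that yields the growing buffer twice (once for the raw textbox, once for the Markdown view), the new moondream2 branch can also be smoke-tested outside the UI by draining the generator. A hedged sketch, passing arguments positionally in the same order as the click handler at the bottom of the diff; the prompt, image path, and sampling values are placeholders:

    from PIL import Image

    # Placeholder arguments: model name, prompt, image, then
    # max_new_tokens, temperature, top_p, top_k, repetition_penalty.
    stream = generate_image(
        "moondream2",
        "Describe this image.",
        Image.open("images/11.png"),
        512, 0.6, 0.9, 50, 1.2,
    )
    final_text = ""
    for raw_text, markdown_text in stream:
        final_text = raw_text
    print(final_text)
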
@@ -305,6 +281,31 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
+    elif model_name == "moondream2":
+        model = model_md
+        tokenizer = tokenizer_md
+        frames = downsample_video(video_path)
+        buffer = ""
+        for frame in frames:
+            image, timestamp = frame
+            image_embeds = model.encode_image(image)
+            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+            thread = Thread(target=model.answer_question, kwargs={
+                "image_embeds": image_embeds,
+                "question": text,
+                "tokenizer": tokenizer,
+                "max_new_tokens": max_new_tokens,
+                "streamer": streamer,
+            })
+            thread.start()
+            frame_buffer = f"Frame {timestamp}:\n"
+            for new_text in streamer:
+                frame_buffer += new_text
+                buffer += new_text
+                time.sleep(0.01)
+                yield buffer, buffer
+            buffer += "\n\n"
+        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -351,11 +352,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
 
-# Define examples
+# Define examples for image and video inference
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
-    ["explain the movie shot in detail.", "images/3.png"],
+    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 
@@ -364,8 +365,9 @@ video_examples = [
     ["explain the video in detail.", "videos/2.mp4"]
 ]
 
+# Define examples for object detection
 object_detection_examples = [
-    ["Detect Spider-Man T-shirt.", "images/22.png"],
+    ["Detect Spider-Man T-shirt.", "images/22.png"],
     ["Detect Green Car.", "images/11.png"]
 ]
 
@@ -428,25 +430,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     inputs=[input_img, text_input, system_prompt],
                     outputs=[model_output_text, parsed_boxes, annotated_image]
                 )
-            # NEW MOONDREAM TAB
-            with gr.TabItem("moondream-vision"):
-                gr.Markdown("## Moondream Vision: Object Pointing & Detection")
-                with gr.Row():
-                    with gr.Column():
-                        moondream_input_img = gr.Image(label="Input Image", type="pil")
-                        moondream_text_input = gr.Textbox(label="Object to Detect", placeholder="e.g., A red car")
-                        moondream_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="object_detection")
-                        moondream_submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
-                    with gr.Column():
-                        moondream_json_output = gr.JSON(label="Output JSON")
-                        moondream_annotated_image = gr.Image(label="Detection Result")
-
-            moondream_submit_btn.click(
-                fn=run_moondream,
-                inputs=[moondream_input_img, moondream_text_input, moondream_mode],
-                outputs=[moondream_json_output, moondream_annotated_image]
-            )
-
 
         with gr.Accordion("Advanced options", open=False):
             max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -460,22 +443,23 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             gr.Markdown("## Result.Md")
             output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
 
-            with gr.Accordion("Formatted Result (Result.Md)", open=False):
+            with gr.Accordion("Formatted Result (Result.Md)", open=False):
                 markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
             model_choice = gr.Radio(
-                choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B"],
+                choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B", "moondream2"],
                 label="Select Model",
                 value="Camel-Doc-OCR-062825"
             )
 
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs-v2-Localization/discussions)")
             gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) : camel-doc-ocr-062825 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
-            gr.Markdown("> [OCRFlux-3B](https://
+            gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
             gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
             gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-            gr.Markdown("
-
+            gr.Markdown("> [moondream2](https://huggingface.co/vikhyatk/moondream2) : A small vision language model that can be run on edge devices. Capable of captioning, visual querying, object detection, and more.")
+            gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+
     image_submit.click(
         fn=generate_image,
        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],