prithivMLmods committed
Commit 74f76da · verified · Parent(s): fd7a385

Update app.py

Files changed (1): app.py (+68, -66)
app.py CHANGED
@@ -8,6 +8,7 @@ from threading import Thread
 import base64
 from io import BytesIO
 import re
+from typing import Literal
 
 import gradio as gr
 import spaces
@@ -22,7 +23,6 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForCausalLM,
-    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -69,17 +69,17 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
-# Load Moondream2
-@spaces.GPU
-def load_moondream_model():
-    return AutoModelForCausalLM.from_pretrained(
-        "vikhyatk/moondream2",
-        revision="2025-04-14",
-        trust_remote_code=True,
-        device_map={"": "cuda"},
-    )
+# Load moondream2
+MODEL_ID_MD = "vikhyatk/moondream2"
+model_md = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID_MD,
+    revision="2025-06-21",
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+).to(device).eval()
 
-# Helper functions for object detection
+
+# Helper functions for object detection and drawing
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
     buffered = BytesIO()
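Note: loading moondream2 eagerly at module scope (instead of lazily inside a @spaces.GPU function, as before) fetches the weights once at startup, and pinning revision="2025-06-21" keeps the trust_remote_code path reproducible. For reference, a minimal standalone sketch of the same load plus the two skills this diff relies on; the detect()/point() call shapes are assumptions taken from the model card, so verify them against the pinned remote code:

    # Minimal sketch: load the pinned moondream2 revision and try the two
    # skills used later in this diff. Call shapes are assumptions.
    import torch
    from PIL import Image
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2",
        revision="2025-06-21",
        trust_remote_code=True,
        torch_dtype=torch.float16,
    ).to("cuda").eval()

    img = Image.open("images/11.png")
    print(model.detect(img, "green car"))  # expected: {"objects": [...]} with normalized coords
    print(model.point(img, "green car"))   # expected: {"points": [...]} with normalized coords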
@@ -95,12 +95,12 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
-def draw_points(image, points, color="red", radius=5):
-    """Draw points on an image."""
+def draw_points(image, points, color="lime", radius=10):
+    """Draw points (circles) on an image."""
     draw = ImageDraw.Draw(image)
     for point in points:
         x, y = point
-        draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color)
+        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color, outline=color)
     return image
 
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
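Note: the reworked draw_points expects pixel-space coordinates and draws filled lime circles; the normalized-to-pixel conversion now happens in the caller (see run_moondream further down). A quick usage sketch under that assumption:

    # Hypothetical usage: convert normalized (0..1) points to pixels, then draw.
    from PIL import Image

    img = Image.new("RGB", (640, 480))
    normalized = [(0.25, 0.5), (0.75, 0.5)]
    pixels = [(x * img.width, y * img.height) for x, y in normalized]
    img = draw_points(img, pixels)  # lime circles of radius 10, per the new defaults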
@@ -127,11 +127,11 @@ default_system_prompt = (
     "of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
 )
 
-# Function for object detection
+# Function for ViLaSR object detection
 @spaces.GPU
 def run_example(image, text_input, system_prompt):
     """Detect objects in an image and return bounding box annotations."""
-    model = model_x
+    model = model_x
     processor = processor_x
 
     messages = [
@@ -172,6 +172,39 @@ def run_example(image, text_input, system_prompt):
     annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
     return output_text[0], str(parsed_boxes), annotated_image
 
+# Function for Moondream object pointing/detection
+@spaces.GPU
+def run_moondream(image: Image.Image, prompt: str, mode: Literal["point", "object_detection"]):
+    """
+    Open Vocabulary Detection/Pointing using moondream2.
+    """
+    if image is None:
+        return "Please upload an image.", None
+
+    original_width, original_height = image.size
+    annotated_image = image.copy()
+    json_output = {}
+
+    if mode == "point":
+        result = model_md.point(im=image, prompt=prompt)
+        points = result.get("points", [])
+        json_output = result
+        if points:
+            rescaled_points = [[p[0] * original_width, p[1] * original_height] for p in points]
+            annotated_image = draw_points(annotated_image, rescaled_points)
+
+    elif mode == "object_detection":
+        result = model_md.detect(im=image, prompt=prompt)
+        boxes = result.get("objects", [])
+        json_output = result
+        if boxes:
+            rescaled_boxes = [[b[0] * original_width, b[1] * original_height, b[2] * original_width, b[3] * original_height] for b in boxes]
+            annotated_image = draw_bounding_boxes(annotated_image, rescaled_boxes, outline_color="lime", line_width=3)
+    else:
+        return "Invalid mode selected.", None
+
+    return json_output, annotated_image
+
 def downsample_video(video_path):
     """
     Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
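Note: one caveat worth flagging on the hunk above. The moondream2 model card describes point() results as dicts like {"x": ..., "y": ...} and detect() objects as dicts like {"x_min": ...}, while the rescaling comprehensions index them as sequences (p[0], b[0]), which would raise on dicts. If the pinned revision does return dict-shaped results, a small adapter could sit between the model call and the drawing helpers; the helper below is hypothetical and accepts both shapes:

    # Hypothetical adapter: turn moondream point results into pixel-space tuples,
    # whether the pinned revision yields dicts ({"x": ..., "y": ...}) or pairs.
    def to_pixel_points(points, width, height):
        pixel_points = []
        for p in points:
            if isinstance(p, dict):
                x, y = p["x"], p["y"]   # dict shape suggested by the model card (assumption)
            else:
                x, y = p[0], p[1]       # sequence shape, as run_moondream assumes
            pixel_points.append((x * width, y * height))
        return pixel_points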
@@ -316,34 +349,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
 
-# Moondream Vision Detection Function
-@spaces.GPU
-def detect_moondream(im: Image.Image, object_name: str, mode: str):
-    """
-    Open Vocabulary Detection using moondream2
-    Args:
-        im: Pillow Image
-        object_name: the object you would like to detect
-        mode: point or object_detection
-    Returns:
-        list: a list of bounding boxes (xyxy) or points (xy) coordinates that are normalized
-        annotated_image: Image with detections drawn
-    """
-    model = load_moondream_model()
-    if mode == "point":
-        points = model.point(im, object_name)["points"]
-        annotated_image = draw_points(im.copy(), points)
-        return points, annotated_image
-    elif mode == "object_detection":
-        boxes = model.detect(im, object_name)["objects"]
-        annotated_image = draw_bounding_boxes(im.copy(), boxes)
-        return boxes, annotated_image
-
-# Define examples for image and video inference
+# Define examples
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
-    ["explain the movie shot in detail.", "images/3.png"],
+    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 
@@ -352,18 +362,11 @@ video_examples = [
     ["explain the video in detail.", "videos/2.mp4"]
 ]
 
-# Define examples for object detection
 object_detection_examples = [
     ["Detect Spider-Man T-shirt.", "images/22.png"],
     ["Detect Green Car.", "images/11.png"]
 ]
 
-# Define examples for Moondream Vision
-moondream_examples = [
-    ["Spider-Man T-shirt", "images/22.png", "point"],
-    ["Green Car", "images/11.png", "object_detection"]
-]
-
 # Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
@@ -423,27 +426,26 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 inputs=[input_img, text_input, system_prompt],
                 outputs=[model_output_text, parsed_boxes, annotated_image]
             )
-        with gr.TabItem("Moondream Vision"):
+        # NEW MOONDREAM TAB
+        with gr.TabItem("moondream-vision"):
+            gr.Markdown("## Moondream Vision: Object Pointing & Detection")
             with gr.Row():
                 with gr.Column():
-                    moon_image = gr.Image(label="Input Image", type="pil")
-                    moon_object = gr.Textbox(label="Object to Detect", placeholder="e.g., Spider-Man T-shirt")
-                    moon_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="point")
-                    moon_submit = gr.Button("Detect", elem_classes="submit-btn")
-                    gr.Examples(
-                        examples=moondream_examples,
-                        inputs=[moon_object, moon_image, moon_mode]
-                    )
+                    moondream_input_img = gr.Image(label="Input Image", type="pil")
+                    moondream_text_input = gr.Textbox(label="Object to Detect", placeholder="e.g., A red car")
+                    moondream_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="object_detection")
+                    moondream_submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
                 with gr.Column():
-                    moon_output_json = gr.JSON(label="Detection Results")
-                    moon_annotated_image = gr.Image(label="Annotated Image")
+                    moondream_json_output = gr.JSON(label="Output JSON")
+                    moondream_annotated_image = gr.Image(label="Detection Result")
 
-            moon_submit.click(
-                fn=detect_moondream,
-                inputs=[moon_image, moon_object, moon_mode],
-                outputs=[moon_output_json, moon_annotated_image]
+            moondream_submit_btn.click(
+                fn=run_moondream,
+                inputs=[moondream_input_img, moondream_text_input, moondream_mode],
+                outputs=[moondream_json_output, moondream_annotated_image]
             )
 
+
     with gr.Accordion("Advanced options", open=False):
         max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
         temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
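Note: the rewritten tab drops the old gr.Examples block. If click-to-fill examples are still wanted, they can be restored against the renamed components; a sketch, reusing image paths already referenced in this file and matching run_moondream's input order:

    # Optional sketch: restore examples for the moondream tab, wired to the
    # renamed components (order matches run_moondream's inputs).
    gr.Examples(
        examples=[
            ["images/22.png", "Spider-Man T-shirt", "point"],
            ["images/11.png", "Green Car", "object_detection"],
        ],
        inputs=[moondream_input_img, moondream_text_input, moondream_mode],
    )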
@@ -456,7 +458,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         gr.Markdown("## Result.Md")
         output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
 
-        with gr.Accordion("Formatted Result (Result.Md)", open=False):
+        with gr.Accordion("Formatted Result (Result.Md)", open=False):
             markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
         model_choice = gr.Radio(
@@ -470,8 +472,8 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
     gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
     gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-    gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
+    gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],