prithivMLmods committed
Commit 2e943bd · verified · 1 Parent(s): b7f1b34

Update app.py

Files changed (1)
  1. app.py +60 -76
app.py CHANGED
@@ -8,7 +8,6 @@ from threading import Thread
 import base64
 from io import BytesIO
 import re
-from typing import Literal
 
 import gradio as gr
 import spaces
@@ -23,6 +22,7 @@ from transformers import (
     AutoProcessor,
     TextIteratorStreamer,
     AutoModelForCausalLM,
+    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -71,15 +71,15 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
 # Load moondream2
 MODEL_ID_MD = "vikhyatk/moondream2"
+tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD)
 model_md = AutoModelForCausalLM.from_pretrained(
     MODEL_ID_MD,
     revision="2025-06-21",
     trust_remote_code=True,
-    torch_dtype=torch.float16,
+    torch_dtype=torch.float16
 ).to(device).eval()
 
-
-# Helper functions for object detection and drawing
+# Helper functions for object detection
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
     buffered = BytesIO()
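Note: the hunk above pins moondream2 to a fixed revision and adds a standalone tokenizer so the model can reuse the app's token-streaming path. As a minimal sanity check of the loaded pair (a hedged sketch, not part of the commit: `encode_image` and `answer_question` are the remote-code methods this diff itself calls later, and the image path is hypothetical):

```python
from PIL import Image

# Hedged sketch: exercise the model/tokenizer pair loaded above.
# "images/1.png" is a hypothetical path.
img = Image.open("images/1.png")
embeds = model_md.encode_image(img)  # remote-code method used later in this diff
answer = model_md.answer_question(
    image_embeds=embeds,
    question="Describe this image.",
    tokenizer=tokenizer_md,
)
print(answer)
```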
@@ -95,14 +95,6 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
     draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
-def draw_points(image, points, color="lime", radius=10):
-    """Draw points (circles) on an image."""
-    draw = ImageDraw.Draw(image)
-    for point in points:
-        x, y = point
-        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color, outline=color)
-    return image
-
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
     """Rescale bounding boxes from normalized (1000x1000) to original image dimensions."""
     x_scale = original_width / scaled_width
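Note: only the first line of `rescale_bounding_boxes` is visible in this hunk. Given its docstring, a plausible completion (an assumption, mirroring `x_scale` on the y axis) would be:

```python
# Hedged sketch of the likely full function; only the x_scale line appears in the hunk.
def rescale_bounding_boxes(bounding_boxes, original_width, original_height,
                           scaled_width=1000, scaled_height=1000):
    x_scale = original_width / scaled_width
    y_scale = original_height / scaled_height
    return [
        [xmin * x_scale, ymin * y_scale, xmax * x_scale, ymax * y_scale]
        for xmin, ymin, xmax, ymax in bounding_boxes
    ]
```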
@@ -127,11 +119,11 @@ default_system_prompt = (
     "of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]."
 )
 
-# Function for ViLaSR object detection
+# Function for object detection
 @spaces.GPU
 def run_example(image, text_input, system_prompt):
     """Detect objects in an image and return bounding box annotations."""
-    model = model_x
+    model = model_x
     processor = processor_x
 
     messages = [
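Note: `default_system_prompt` asks the model to answer with bare `[[xmin, ymin, xmax, ymax], ...]` lists, which `run_example` later turns into `parsed_boxes`. One way such output could be parsed (a hedged sketch; the actual parsing code in app.py is outside this diff):

```python
import ast
import re

# Hedged sketch: extract the first "[[...], ...]" literal from the model's text output.
def parse_boxes(output_text: str):
    match = re.search(r"\[\s*\[.*?\]\s*\]", output_text, re.DOTALL)
    return ast.literal_eval(match.group(0)) if match else []

parse_boxes("Here are the boxes: [[10, 20, 110, 220], [5, 5, 50, 50]]")
# -> [[10, 20, 110, 220], [5, 5, 50, 50]]
```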
@@ -172,41 +164,6 @@ def run_example(image, text_input, system_prompt):
     annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
     return output_text[0], str(parsed_boxes), annotated_image
 
-# Function for Moondream object pointing/detection
-@spaces.GPU
-def run_moondream(image: Image.Image, prompt: str, mode: Literal["point", "object_detection"]):
-    """
-    Open Vocabulary Detection/Pointing using moondream2.
-    """
-    if image is None:
-        return "Please upload an image.", None
-
-    original_width, original_height = image.size
-    annotated_image = image.copy()
-    json_output = {}
-
-    if mode == "point":
-        # FIX: Changed 'im' to 'image'
-        result = model_md.point(image=image, prompt=prompt)
-        points = result.get("points", [])
-        json_output = result
-        if points:
-            rescaled_points = [[p[0] * original_width, p[1] * original_height] for p in points]
-            annotated_image = draw_points(annotated_image, rescaled_points)
-
-    elif mode == "object_detection":
-        # FIX: Changed 'im' to 'image'
-        result = model_md.detect(image=image, prompt=prompt)
-        boxes = result.get("objects", [])
-        json_output = result
-        if boxes:
-            rescaled_boxes = [[b[0] * original_width, b[1] * original_height, b[2] * original_width, b[3] * original_height] for b in boxes]
-            annotated_image = draw_bounding_boxes(annotated_image, rescaled_boxes, outline_color="lime", line_width=3)
-    else:
-        return "Invalid mode selected.", None
-
-    return json_output, annotated_image
-
 def downsample_video(video_path):
     """
     Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
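Note: `downsample_video` is kept and becomes the entry point for moondream2's new per-frame video path below. Its body is outside this diff; based on the docstring, an implementation along these lines (an assumption, using OpenCV; `num_frames=10` is also an assumption) would satisfy the callers:

```python
import cv2
from PIL import Image

# Hedged sketch consistent with the docstring: evenly spaced frames returned as
# (PIL image, timestamp-in-seconds) pairs.
def downsample_video(video_path, num_frames=10):
    vidcap = cv2.VideoCapture(video_path)
    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS) or 30.0
    frames = []
    indices = [int(i * (total - 1) / max(num_frames - 1, 1)) for i in range(num_frames)]
    for idx in indices:
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = vidcap.read()
        if ok:
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append((Image.fromarray(rgb), round(idx / fps, 2)))
    vidcap.release()
    return frames
```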
@@ -249,6 +206,25 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
+    elif model_name == "moondream2":
+        model = model_md
+        tokenizer = tokenizer_md
+        image_embeds = model.encode_image(image)
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        thread = Thread(target=model.answer_question, kwargs={
+            "image_embeds": image_embeds,
+            "question": text,
+            "tokenizer": tokenizer,
+            "max_new_tokens": max_new_tokens,
+            "streamer": streamer,
+        })
+        thread.start()
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer, buffer
+        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
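Note: the moondream2 branch above is the standard `TextIteratorStreamer` pattern: generation runs on a background thread while the Gradio generator drains the streamer. Distilled outside the UI (names taken from the diff; a sketch, not the committed code):

```python
from threading import Thread
from transformers import TextIteratorStreamer

def stream_answer(model, tokenizer, image, question, max_new_tokens=512):
    # Encode once, then let answer_question fill the streamer from a thread.
    image_embeds = model.encode_image(image)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.answer_question, kwargs={
        "image_embeds": image_embeds,
        "question": question,
        "tokenizer": tokenizer,
        "max_new_tokens": max_new_tokens,
        "streamer": streamer,
    }).start()
    for piece in streamer:   # blocks until the worker thread produces tokens
        yield piece          # callers accumulate into a buffer, as generate_image does
```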
@@ -305,6 +281,31 @@ def generate_video(model_name: str, text: str, video_path: str,
     elif model_name == "ShotVL-7B":
         processor = processor_s
         model = model_s
+    elif model_name == "moondream2":
+        model = model_md
+        tokenizer = tokenizer_md
+        frames = downsample_video(video_path)
+        buffer = ""
+        for frame in frames:
+            image, timestamp = frame
+            image_embeds = model.encode_image(image)
+            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+            thread = Thread(target=model.answer_question, kwargs={
+                "image_embeds": image_embeds,
+                "question": text,
+                "tokenizer": tokenizer,
+                "max_new_tokens": max_new_tokens,
+                "streamer": streamer,
+            })
+            thread.start()
+            frame_buffer = f"Frame {timestamp}:\n"
+            for new_text in streamer:
+                frame_buffer += new_text
+                buffer += new_text
+                time.sleep(0.01)
+                yield buffer, buffer
+            buffer += "\n\n"
+        return
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -351,11 +352,11 @@ def generate_video(model_name: str, text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
 
-# Define examples
+# Define examples for image and video inference
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
-    ["explain the movie shot in detail.", "images/3.png"],
+    ["explain the movie shot in detail.", "images/3.png"],
     ["fill the correct numbers.", "images/4.png"]
 ]
 
@@ -364,8 +365,9 @@ video_examples = [
     ["explain the video in detail.", "videos/2.mp4"]
 ]
 
+# Define examples for object detection
 object_detection_examples = [
-    ["Detect Spider-Man T-shirt.", "images/22.png"],
+    ["Detect Spider-Man T-shirt.", "images/22.png"],
     ["Detect Green Car.", "images/11.png"]
 ]
 
@@ -428,25 +430,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         inputs=[input_img, text_input, system_prompt],
         outputs=[model_output_text, parsed_boxes, annotated_image]
     )
-    # NEW MOONDREAM TAB
-    with gr.TabItem("moondream-vision"):
-        gr.Markdown("## Moondream Vision: Object Pointing & Detection")
-        with gr.Row():
-            with gr.Column():
-                moondream_input_img = gr.Image(label="Input Image", type="pil")
-                moondream_text_input = gr.Textbox(label="Object to Detect", placeholder="e.g., A red car")
-                moondream_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="object_detection")
-                moondream_submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
-            with gr.Column():
-                moondream_json_output = gr.JSON(label="Output JSON")
-                moondream_annotated_image = gr.Image(label="Detection Result")
-
-        moondream_submit_btn.click(
-            fn=run_moondream,
-            inputs=[moondream_input_img, moondream_text_input, moondream_mode],
-            outputs=[moondream_json_output, moondream_annotated_image]
-        )
-
 
     with gr.Accordion("Advanced options", open=False):
         max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -460,22 +443,23 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("## Result.Md")
     output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
 
-    with gr.Accordion("Formatted Result (Result.Md)", open=False):
+    with gr.Accordion("Formatted Result (Result.Md)", open=False):
         markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
 
     model_choice = gr.Radio(
-        choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B"],
+        choices=["Camel-Doc-OCR-062825", "ViLaSR-7B", "OCRFlux-3B", "ShotVL-7B", "moondream2"],
         label="Select Model",
         value="Camel-Doc-OCR-062825"
    )
 
     gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs-v2-Localization/discussions)")
     gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) : camel-doc-ocr-062825 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
-    gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
+    gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
     gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
     gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-    gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
-
+    gr.Markdown("> [moondream2](https://huggingface.co/vikhyatk/moondream2) : A small vision language model that can be run on edge devices. Capable of captioning, visual querying, object detection, and more.")
+    gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
+
     image_submit.click(
         fn=generate_image,
         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
 
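Note: `generate_image` and `generate_video` are generator functions that `yield buffer, buffer`, so each partial result updates both wired outputs (the raw textbox and the markdown view) as it streams. The underlying Gradio pattern, reduced to a self-contained sketch with hypothetical names (`fake_stream` stands in for `generate_image`):

```python
import time
import gradio as gr

# Hedged sketch of the streaming wiring: a generator yielding a tuple
# updates two outputs on every step.
def fake_stream(prompt):
    buffer = ""
    for word in prompt.split():
        buffer += word + " "
        time.sleep(0.05)
        yield buffer, buffer  # (raw textbox, markdown view)

with gr.Blocks() as demo:
    query = gr.Textbox(label="Query")
    btn = gr.Button("Submit")
    raw = gr.Textbox(label="Raw Output Stream")
    md = gr.Markdown()
    btn.click(fn=fake_stream, inputs=[query], outputs=[raw, md])

if __name__ == "__main__":
    demo.launch()
```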