prithivMLmods committed on
Commit 760a8e5 · verified · 1 Parent(s): 035a499

Update app.py

Files changed (1)
  1. app.py +5 -143
app.py CHANGED
@@ -1,23 +1,13 @@
  import os
- import random
- import uuid
- import json
  import time
- import asyncio
- from threading import Thread
- import base64
- from io import BytesIO
- import re
-
+ import threading
  import gradio as gr
  import spaces
  import torch
  import numpy as np
- from PIL import Image, ImageDraw
+ from PIL import Image
  import cv2
-
  from transformers import (
-     Qwen2VLForConditionalGeneration,
      Qwen2_5_VLForConditionalGeneration,
      AutoProcessor,
      TextIteratorStreamer,
@@ -67,91 +57,6 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
      torch_dtype=torch.float16
  ).to(device).eval()
  
- # Helper functions for object detection
- def image_to_base64(image):
-     """Convert a PIL image to a base64-encoded string."""
-     buffered = BytesIO()
-     image.save(buffered, format="PNG")
-     img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
-     return img_str
- 
- def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2):
-     """Draw bounding boxes on an image."""
-     draw = ImageDraw.Draw(image)
-     for box in bounding_boxes:
-         xmin, ymin, xmax, ymax = box
-         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
-     return image
- 
- def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
-     """Rescale bounding boxes from normalized (1000x1000) to original image dimensions."""
-     x_scale = original_width / scaled_width
-     y_scale = original_height / scaled_height
-     rescaled_boxes = []
-     for box in bounding_boxes:
-         xmin, ymin, xmax, ymax = box
-         rescaled_box = [
-             xmin * x_scale,
-             ymin * y_scale,
-             xmax * x_scale,
-             ymax * y_scale
-         ]
-         rescaled_boxes.append(rescaled_box)
-     return rescaled_boxes
- 
- # Default system prompt for object detection
- default_system_prompt = (
-     "You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, "
-     "you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] with the values being scaled "
-     "to 512 by 512 pixels. When there are more than one result, answer with a list of bounding boxes in the form "
-     "of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]. Parse only the boxes; don't write unnecessary content."
- )
- 
- # Function for object detection
- @spaces.GPU
- def run_example(image, text_input, system_prompt):
-     """Detect objects in an image and return bounding box annotations."""
-     model = model_x
-     processor = processor_x
- 
-     messages = [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "image": f"data:image;base64,{image_to_base64(image)}"},
-                 {"type": "text", "text": system_prompt},
-                 {"type": "text", "text": text_input},
-             ],
-         }
-     ]
- 
-     text = processor.apply_chat_template(
-         messages, tokenize=False, add_generation_prompt=True
-     )
-     image_inputs, video_inputs = process_vision_info(messages)
-     inputs = processor(
-         text=[text],
-         images=image_inputs,
-         videos=video_inputs,
-         padding=True,
-         return_tensors="pt",
-     )
-     inputs = inputs.to("cuda")
- 
-     generated_ids = model.generate(**inputs, max_new_tokens=256)
-     generated_ids_trimmed = [
-         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-     ]
-     output_text = processor.batch_decode(
-         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-     )
-     pattern = r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]'
-     matches = re.findall(pattern, str(output_text))
-     parsed_boxes = [[int(num) for num in match] for match in matches]
-     scaled_boxes = rescale_bounding_boxes(parsed_boxes, image.width, image.height)
-     annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
-     return output_text[0], str(parsed_boxes), annotated_image
- 
  def downsample_video(video_path):
      """
      Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
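Note: the helpers removed above implement a common detect-parse-draw pattern: the model answers with [xmin, ymin, xmax, ymax] boxes on a normalized grid, which are regex-parsed and rescaled to the source image before drawing. A standalone sketch of that pattern, assuming the 1000x1000 grid default from the removed rescale_bounding_boxes helper and a made-up example output:

```python
# Standalone sketch (not part of app.py): parse normalized boxes from model text,
# rescale them to the source image size, and draw them with PIL.
import re

from PIL import Image, ImageDraw


def parse_and_draw(output_text: str, image: Image.Image, grid: int = 1000):
    pattern = r"\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]"
    boxes = [[int(n) for n in m] for m in re.findall(pattern, output_text)]
    x_scale, y_scale = image.width / grid, image.height / grid
    draw = ImageDraw.Draw(image)
    for xmin, ymin, xmax, ymax in boxes:
        draw.rectangle(
            [xmin * x_scale, ymin * y_scale, xmax * x_scale, ymax * y_scale],
            outline="red",
            width=2,
        )
    return boxes, image


# Hypothetical model output on a blank 1024x1024 canvas:
boxes, annotated = parse_and_draw("[[100, 120, 400, 560]]", Image.new("RGB", (1024, 1024)))
print(boxes)  # [[100, 120, 400, 560]]
```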
@@ -220,7 +125,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
      ).to(device)
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
      generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
      buffer = ""
      for new_text in streamer:
@@ -287,7 +192,7 @@ def generate_video(model_name: str, text: str, video_path: str,
          "top_k": top_k,
          "repetition_penalty": repetition_penalty,
      }
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
      buffer = ""
      for new_text in streamer:
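Note: both generate_image and generate_video use the same streaming setup touched by these two hunks; only the import style changes (Thread → threading.Thread). A minimal standalone sketch of the pattern, assuming a small placeholder checkpoint rather than the Space's actual models:

```python
# Minimal sketch of the streaming pattern: generate() blocks, so it runs on a
# background thread while TextIteratorStreamer yields decoded text chunks.
import threading

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder checkpoint, not one of the Space's models
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

inputs = tokenizer("Summarize this document:", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": 64}

# Run generation in the background; consume tokens on the main thread as they arrive.
thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

buffer = ""
for new_text in streamer:
    buffer += new_text
thread.join()
print(buffer)
```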
@@ -309,12 +214,6 @@ video_examples = [
      ["explain the video in detail.", "videos/2.mp4"]
  ]
  
- # Define examples for object detection
- object_detection_examples = [
-     ["Detect Spider-Man T-shirt.", "images/22.png"],
-     ["Detect Green Car.", "images/11.png"]
- ]
- 
  # Added CSS to style the output area as a "Canvas"
  css = """
  .submit-btn {
@@ -333,7 +232,7 @@ css = """
  
  # Create the Gradio Interface
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-     gr.Markdown("# **[Doc VLMs v2 [Localization]](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+     gr.Markdown("# **Doc VLMs v2**")
      with gr.Row():
          with gr.Column():
              with gr.Tabs():
@@ -353,27 +252,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                          examples=video_examples,
                          inputs=[video_query, video_upload]
                      )
-                 with gr.TabItem("Object Detection / Localization"):
-                     with gr.Row():
-                         with gr.Column():
-                             input_img = gr.Image(label="Input Image [ 1024x1024 ]", type="pil")
-                             system_prompt = gr.Textbox(label="System Prompt", value=default_system_prompt, visible=False)
-                             text_input = gr.Textbox(label="Query Input", placeholder="Enter query...")
-                             submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
-                             gr.Examples(
-                                 examples=object_detection_examples,
-                                 inputs=[text_input, input_img]
-                             )
-                         with gr.Column():
-                             model_output_text = gr.Textbox(label="Model Output Text")
-                             parsed_boxes = gr.Textbox(label="Parsed Boxes")
-                             annotated_image = gr.Image(label="Annotated Image")
- 
-                     submit_btn.click(
-                         fn=run_example,
-                         inputs=[input_img, text_input, system_prompt],
-                         outputs=[model_output_text, parsed_boxes, annotated_image]
-                     )
  
          with gr.Accordion("Advanced options", open=False):
              max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -397,22 +275,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
      )
  
      gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs-v2-Localization/discussions)")
-     gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) : camel-doc-ocr-062825 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
-     gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
-     gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
-     gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
- 
-     image_submit.click(
-         fn=generate_image,
-         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-         outputs=[output, markdown_output]
-     )
-     video_submit.click(
-         fn=generate_video,
-         inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-         outputs=[output, markdown_output]
-     )
  
  if __name__ == "__main__":
      demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
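Note: for reference, the Blocks layout in this file follows the usual Gradio pattern of a submit button streaming a generator's output into a textbox. A minimal standalone sketch, with illustrative component and function names that are not the Space's exact ones:

```python
# Standalone sketch of the Gradio wiring pattern: a click handler calls a generator
# function, and each yielded string updates the output textbox in place.
import time

import gradio as gr


def generate(query: str):
    buffer = ""
    for word in ["streamed", "token", "by", "token"]:
        buffer += word + " "
        time.sleep(0.1)
        yield buffer  # each yield refreshes the bound output component


with gr.Blocks() as demo:
    query = gr.Textbox(label="Query Input")
    submit = gr.Button("Submit")
    output = gr.Textbox(label="Output")
    submit.click(fn=generate, inputs=[query], outputs=[output])

if __name__ == "__main__":
    demo.queue(max_size=30).launch()
```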
 