prithivMLmods committed
Commit fd7a385 · verified · 1 Parent(s): 7d59d39

Update app.py

Files changed (1): app.py +69 -0
app.py CHANGED
@@ -21,6 +21,8 @@ from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
+    AutoModelForCausalLM,
+    AutoTokenizer
 )
 from qwen_vl_utils import process_vision_info
 
@@ -67,6 +69,16 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to(device).eval()
 
+# Load Moondream2
+@spaces.GPU
+def load_moondream_model():
+    return AutoModelForCausalLM.from_pretrained(
+        "vikhyatk/moondream2",
+        revision="2025-04-14",
+        trust_remote_code=True,
+        device_map={"": "cuda"},
+    )
+
 # Helper functions for object detection
 def image_to_base64(image):
     """Convert a PIL image to a base64-encoded string."""
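One note on this loader: detect_moondream (added in a later hunk) calls load_moondream_model on every request, so the checkpoint is re-instantiated per call. A minimal sketch of a memoized variant, assuming the same from_pretrained arguments as above (the helper name is hypothetical):

from transformers import AutoModelForCausalLM

_moondream = None

def get_moondream_model():
    # Hypothetical cached loader: build the model once, reuse it afterwards.
    global _moondream
    if _moondream is None:
        _moondream = AutoModelForCausalLM.from_pretrained(
            "vikhyatk/moondream2",
            revision="2025-04-14",
            trust_remote_code=True,
            device_map={"": "cuda"},
        )
    return _moondream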
@@ -83,6 +95,14 @@ def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2
         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
     return image
 
+def draw_points(image, points, color="red", radius=5):
+    """Draw points on an image."""
+    draw = ImageDraw.Draw(image)
+    for point in points:
+        x, y = point
+        draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color)
+    return image
+
 def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
     """Rescale bounding boxes from normalized (1000x1000) to original image dimensions."""
     x_scale = original_width / scaled_width
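As written, draw_points unpacks each point as a plain (x, y) pair in pixel coordinates. A quick standalone sanity check, with the function copied from the hunk above and a made-up test image and coordinates:

from PIL import Image, ImageDraw

# draw_points as defined in the diff above
def draw_points(image, points, color="red", radius=5):
    """Draw points on an image."""
    draw = ImageDraw.Draw(image)
    for point in points:
        x, y = point
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], fill=color)
    return image

img = Image.new("RGB", (200, 200), "white")    # blank test canvas
img = draw_points(img, [(50, 50), (150, 120)])  # pixel-space points
img.save("points_demo.png")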
@@ -296,6 +316,29 @@ def generate_video(model_name: str, text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
 
+# Moondream Vision Detection Function
+@spaces.GPU
+def detect_moondream(im: Image.Image, object_name: str, mode: str):
+    """
+    Open-vocabulary detection using moondream2.
+    Args:
+        im: Pillow Image
+        object_name: the object to detect
+        mode: "point" or "object_detection"
+    Returns:
+        list: a list of normalized bounding-box (xyxy) or point (xy) coordinates
+        annotated_image: the image with detections drawn
+    """
+    model = load_moondream_model()
+    if mode == "point":
+        points = model.point(im, object_name)["points"]
+        annotated_image = draw_points(im.copy(), points)
+        return points, annotated_image
+    elif mode == "object_detection":
+        boxes = model.detect(im, object_name)["objects"]
+        annotated_image = draw_bounding_boxes(im.copy(), boxes)
+        return boxes, annotated_image
+
 # Define examples for image and video inference
 image_examples = [
     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
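A caveat on the return values: in recent moondream2 revisions, model.point returns dicts like {"x": ..., "y": ...} and model.detect returns dicts like {"x_min": ..., "y_min": ..., "x_max": ..., "y_max": ...}, both normalized to the 0..1 range, whereas the drawing helpers above unpack plain pixel coordinates. If that format holds for revision 2025-04-14, a conversion step along these lines would be needed before drawing (helper names are hypothetical):

def to_pixel_points(points, width, height):
    # Assumes moondream2-style point dicts: {"x": 0..1, "y": 0..1}.
    return [(p["x"] * width, p["y"] * height) for p in points]

def to_pixel_boxes(objects, width, height):
    # Assumes moondream2-style box dicts with normalized x_min/y_min/x_max/y_max.
    return [
        (o["x_min"] * width, o["y_min"] * height,
         o["x_max"] * width, o["y_max"] * height)
        for o in objects
    ]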
@@ -315,6 +358,12 @@ object_detection_examples = [
     ["Detect Green Car.", "images/11.png"]
 ]
 
+# Define examples for Moondream Vision
+moondream_examples = [
+    ["Spider-Man T-shirt", "images/22.png", "point"],
+    ["Green Car", "images/11.png", "object_detection"]
+]
+
 # Added CSS to style the output area as a "Canvas"
 css = """
 .submit-btn {
@@ -374,6 +423,26 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 inputs=[input_img, text_input, system_prompt],
                 outputs=[model_output_text, parsed_boxes, annotated_image]
             )
+        with gr.TabItem("Moondream Vision"):
+            with gr.Row():
+                with gr.Column():
+                    moon_image = gr.Image(label="Input Image", type="pil")
+                    moon_object = gr.Textbox(label="Object to Detect", placeholder="e.g., Spider-Man T-shirt")
+                    moon_mode = gr.Dropdown(label="Mode", choices=["point", "object_detection"], value="point")
+                    moon_submit = gr.Button("Detect", elem_classes="submit-btn")
+                    gr.Examples(
+                        examples=moondream_examples,
+                        inputs=[moon_object, moon_image, moon_mode]
+                    )
+                with gr.Column():
+                    moon_output_json = gr.JSON(label="Detection Results")
+                    moon_annotated_image = gr.Image(label="Annotated Image")
+
+            moon_submit.click(
+                fn=detect_moondream,
+                inputs=[moon_image, moon_object, moon_mode],
+                outputs=[moon_output_json, moon_annotated_image]
+            )
 
     with gr.Accordion("Advanced options", open=False):
         max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
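Taken together, the new tab simply routes the image, prompt, and mode into detect_moondream, and the same function can be exercised without the UI. A minimal sketch against one of the bundled examples, assuming app.py's definitions are importable and images/11.png exists:

from PIL import Image
from app import detect_moondream  # assumes this file is importable as app

im = Image.open("images/11.png")
results, annotated = detect_moondream(im, "Green Car", "object_detection")
print(results)                        # raw boxes/points from moondream2
annotated.save("annotated_demo.png")  # image with detections drawn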