prithivMLmods committed on
Commit 760a8e5 · verified · 1 Parent(s): 035a499

Update app.py

Files changed (1)
  1. app.py +5 -143
app.py CHANGED
@@ -1,23 +1,13 @@
  import os
- import random
- import uuid
- import json
  import time
- import asyncio
- from threading import Thread
- import base64
- from io import BytesIO
- import re
-
+ import threading
  import gradio as gr
  import spaces
  import torch
  import numpy as np
- from PIL import Image, ImageDraw
+ from PIL import Image
  import cv2
-
  from transformers import (
-     Qwen2VLForConditionalGeneration,
      Qwen2_5_VLForConditionalGeneration,
      AutoProcessor,
      TextIteratorStreamer,
@@ -67,91 +57,6 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
      torch_dtype=torch.float16
  ).to(device).eval()
  
- # Helper functions for object detection
- def image_to_base64(image):
-     """Convert a PIL image to a base64-encoded string."""
-     buffered = BytesIO()
-     image.save(buffered, format="PNG")
-     img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
-     return img_str
- 
- def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2):
-     """Draw bounding boxes on an image."""
-     draw = ImageDraw.Draw(image)
-     for box in bounding_boxes:
-         xmin, ymin, xmax, ymax = box
-         draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
-     return image
- 
- def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
-     """Rescale bounding boxes from normalized (1000x1000) to original image dimensions."""
-     x_scale = original_width / scaled_width
-     y_scale = original_height / scaled_height
-     rescaled_boxes = []
-     for box in bounding_boxes:
-         xmin, ymin, xmax, ymax = box
-         rescaled_box = [
-             xmin * x_scale,
-             ymin * y_scale,
-             xmax * x_scale,
-             ymax * y_scale
-         ]
-         rescaled_boxes.append(rescaled_box)
-     return rescaled_boxes
- 
- # Default system prompt for object detection
- default_system_prompt = (
-     "You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, "
-     "you return bounding boxes for all elements in the form of [xmin, ymin, xmax, ymax] with the values being scaled "
-     "to 512 by 512 pixels. When there are more than one result, answer with a list of bounding boxes in the form "
-     "of [[xmin, ymin, xmax, ymax], [xmin, ymin, xmax, ymax], ...]. Parse only the boxes; don't write unnecessary content."
- )
- 
- # Function for object detection
- @spaces.GPU
- def run_example(image, text_input, system_prompt):
-     """Detect objects in an image and return bounding box annotations."""
-     model = model_x
-     processor = processor_x
- 
-     messages = [
-         {
-             "role": "user",
-             "content": [
-                 {"type": "image", "image": f"data:image;base64,{image_to_base64(image)}"},
-                 {"type": "text", "text": system_prompt},
-                 {"type": "text", "text": text_input},
-             ],
-         }
-     ]
- 
-     text = processor.apply_chat_template(
-         messages, tokenize=False, add_generation_prompt=True
-     )
-     image_inputs, video_inputs = process_vision_info(messages)
-     inputs = processor(
-         text=[text],
-         images=image_inputs,
-         videos=video_inputs,
-         padding=True,
-         return_tensors="pt",
-     )
-     inputs = inputs.to("cuda")
- 
-     generated_ids = model.generate(**inputs, max_new_tokens=256)
-     generated_ids_trimmed = [
-         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-     ]
-     output_text = processor.batch_decode(
-         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-     )
-     pattern = r'\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]'
-     matches = re.findall(pattern, str(output_text))
-     parsed_boxes = [[int(num) for num in match] for match in matches]
-     scaled_boxes = rescale_bounding_boxes(parsed_boxes, image.width, image.height)
-     annotated_image = draw_bounding_boxes(image.copy(), scaled_boxes)
-     return output_text[0], str(parsed_boxes), annotated_image
- 
  def downsample_video(video_path):
      """
      Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
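Note: the helpers removed above implement a common detect-parse-draw pattern: the model answers with [xmin, ymin, xmax, ymax] boxes on a normalized grid, which are regex-parsed and rescaled to the source image before drawing. A standalone sketch of that pattern, assuming the 1000x1000 grid default from the removed rescale_bounding_boxes helper and a made-up example output:

```python
# Standalone sketch (not part of app.py): parse normalized boxes from model text,
# rescale them to the source image size, and draw them with PIL.
import re

from PIL import Image, ImageDraw


def parse_and_draw(output_text: str, image: Image.Image, grid: int = 1000):
    pattern = r"\[\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\]"
    boxes = [[int(n) for n in m] for m in re.findall(pattern, output_text)]
    x_scale, y_scale = image.width / grid, image.height / grid
    draw = ImageDraw.Draw(image)
    for xmin, ymin, xmax, ymax in boxes:
        draw.rectangle(
            [xmin * x_scale, ymin * y_scale, xmax * x_scale, ymax * y_scale],
            outline="red",
            width=2,
        )
    return boxes, image


# Hypothetical model output on a blank 1024x1024 canvas:
boxes, annotated = parse_and_draw("[[100, 120, 400, 560]]", Image.new("RGB", (1024, 1024)))
print(boxes)  # [[100, 120, 400, 560]]
```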
@@ -220,7 +125,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
      ).to(device)
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
      generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
      buffer = ""
      for new_text in streamer:
@@ -287,7 +192,7 @@ def generate_video(model_name: str, text: str, video_path: str,
          "top_k": top_k,
          "repetition_penalty": repetition_penalty,
      }
-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
      buffer = ""
      for new_text in streamer:
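Note: both generate_image and generate_video use the same streaming setup touched by these two hunks; only the import style changes (Thread → threading.Thread). A minimal standalone sketch of the pattern, assuming a small placeholder checkpoint rather than the Space's actual models:

```python
# Minimal sketch of the streaming pattern: generate() blocks, so it runs on a
# background thread while TextIteratorStreamer yields decoded text chunks.
import threading

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder checkpoint, not one of the Space's models
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

inputs = tokenizer("Summarize this document:", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": 64}

# Run generation in the background; consume tokens on the main thread as they arrive.
thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

buffer = ""
for new_text in streamer:
    buffer += new_text
thread.join()
print(buffer)
```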
@@ -309,12 +214,6 @@ video_examples = [
      ["explain the video in detail.", "videos/2.mp4"]
  ]
  
- # Define examples for object detection
- object_detection_examples = [
-     ["Detect Spider-Man T-shirt.", "images/22.png"],
-     ["Detect Green Car.", "images/11.png"]
- ]
- 
  # Added CSS to style the output area as a "Canvas"
  css = """
  .submit-btn {
@@ -333,7 +232,7 @@ css = """
  
  # Create the Gradio Interface
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-     gr.Markdown("# **[Doc VLMs v2 [Localization]](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+     gr.Markdown("# **Doc VLMs v2**")
      with gr.Row():
          with gr.Column():
              with gr.Tabs():
@@ -353,27 +252,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                          examples=video_examples,
                          inputs=[video_query, video_upload]
                      )
-                 with gr.TabItem("Object Detection / Localization"):
-                     with gr.Row():
-                         with gr.Column():
-                             input_img = gr.Image(label="Input Image [ 1024x1024 ]", type="pil")
-                             system_prompt = gr.Textbox(label="System Prompt", value=default_system_prompt, visible=False)
-                             text_input = gr.Textbox(label="Query Input", placeholder="Enter query...")
-                             submit_btn = gr.Button(value="Submit", elem_classes="submit-btn")
-                             gr.Examples(
-                                 examples=object_detection_examples,
-                                 inputs=[text_input, input_img]
-                             )
-                         with gr.Column():
-                             model_output_text = gr.Textbox(label="Model Output Text")
-                             parsed_boxes = gr.Textbox(label="Parsed Boxes")
-                             annotated_image = gr.Image(label="Annotated Image")
- 
-                     submit_btn.click(
-                         fn=run_example,
-                         inputs=[input_img, text_input, system_prompt],
-                         outputs=[model_output_text, parsed_boxes, annotated_image]
-                     )
  
          with gr.Accordion("Advanced options", open=False):
              max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -397,22 +275,6 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
      )
  
      gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Doc-VLMs-v2-Localization/discussions)")
-     gr.Markdown("> [Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825) : camel-doc-ocr-062825 model is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for document retrieval, content extraction, and analysis recognition. built on top of the qwen2.5-vl architecture, this model enhances document comprehension capabilities.")
-     gr.Markdown("> [OCRFlux-3B](https://huggingface.co/ChatDOC/OCRFlux-3B) : ocrflux-3b model that's fine-tuned from qwen2.5-vl-3b-instruct using our private document datasets and some data from olmocr-mix-0225 dataset. optimized for document retrieval, content extraction, and analysis recognition. the best way to use this model is via the ocrflux toolkit.")
-     gr.Markdown("> [ViLaSR](https://huggingface.co/AntResearchNLP/ViLaSR) : vilasr-7b model as presented in reinforcing spatial reasoning in vision-language models with interwoven thinking and visual drawing. efficient reasoning capabilities.")
-     gr.Markdown("> [ShotVL-7B](https://huggingface.co/Vchitect/ShotVL-7B) : shotvl-7b is a fine-tuned version of qwen2.5-vl-7b-instruct, trained by supervised fine-tuning on the largest and high-quality dataset for cinematic language understanding to date. it currently achieves state-of-the-art performance on shotbench.")
-     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
- 
-     image_submit.click(
-         fn=generate_image,
-         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-         outputs=[output, markdown_output]
-     )
-     video_submit.click(
-         fn=generate_video,
-         inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-         outputs=[output, markdown_output]
-     )
  
  if __name__ == "__main__":
      demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
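Note: for reference, the Blocks layout in this file follows the usual Gradio pattern of a submit button streaming a generator's output into a textbox. A minimal standalone sketch, with illustrative component and function names that are not the Space's exact ones:

```python
# Standalone sketch of the Gradio wiring pattern: a click handler calls a generator
# function, and each yielded string updates the output textbox in place.
import time

import gradio as gr


def generate(query: str):
    buffer = ""
    for word in ["streamed", "token", "by", "token"]:
        buffer += word + " "
        time.sleep(0.1)
        yield buffer  # each yield refreshes the bound output component


with gr.Blocks() as demo:
    query = gr.Textbox(label="Query Input")
    submit = gr.Button("Submit")
    output = gr.Textbox(label="Output")
    submit.click(fn=generate, inputs=[query], outputs=[output])

if __name__ == "__main__":
    demo.queue(max_size=30).launch()
```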
 