# Spaces: vvmnnnkv / owlv2-visual-prompt
# Running on Zero
# File size: 10,566 Bytes

import sys

# Register empty placeholder objects under the audio module names so that a
# later `import audioop` / `import pyaudioop` is satisfied from sys.modules
# and the real packages do not have to be installed.
for _stub_name in ("audioop", "pyaudioop"):
    sys.modules[_stub_name] = type(_stub_name, (), {"__file__": ""})()
del _stub_name

import torch
import gradio as gr
import supervision as sv
import spaces
from transformers import AutoProcessor, Owlv2ForObjectDetection

# Device the model and its input tensors are moved to: "cuda" when torch
# reports CUDA as available at import time, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

@spaces.GPU
def init_model(model_id):
    """Load the processor and OWLv2 detection model for ``model_id``.

    The model is switched to eval mode and moved to ``DEVICE`` before being
    returned. Nothing is cached here: every call loads both objects again
    through ``from_pretrained``.

    Args:
        model_id: Checkpoint identifier passed to ``from_pretrained``.

    Returns:
        A ``(processor, model)`` tuple.
    """
    preprocessor = AutoProcessor.from_pretrained(model_id)
    # eval() and to() both return the module itself, so the chain yields the
    # same model object, now in inference mode and on the target device.
    detector = Owlv2ForObjectDetection.from_pretrained(model_id).eval().to(DEVICE)
    return preprocessor, detector

@spaces.GPU
def inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type):
    """Detect objects in ``target_image`` and return an annotated copy of it.

    Args:
        prompts: Dict holding the prompt payload. ``prompts["texts"]`` (list of
            query strings) is read when ``prompt_type`` is ``"Text"``;
            ``prompts["images"]`` (query image) when it is ``"Visual"``.
        target_image: Image to search. Its ``size`` and ``copy()`` are used,
            so a PIL image is expected (the UI passes ``type="pil"``).
        model_id: Checkpoint identifier forwarded to ``init_model``.
        conf_thresh: Score threshold forwarded to the post-processing step.
        iou_thresh: NMS threshold; only forwarded in ``"Visual"`` mode.
        prompt_type: Either ``"Text"`` or ``"Visual"``.

    Returns:
        A copy of ``target_image`` with boxes and ``"<class> <score>"`` labels
        drawn on it.

    Raises:
        ValueError: If ``prompt_type`` is not ``"Text"`` or ``"Visual"``.
        gr.Error: If the target image or the prompt required by the selected
            mode is missing.
    """
    # Validate before loading the model so bad requests fail fast and with a
    # clear message instead of crashing later on a ``None`` result/input.
    if prompt_type not in ("Text", "Visual"):
        raise ValueError(f"Unsupported prompt type: {prompt_type!r}")
    if target_image is None:
        raise gr.Error("A target image is required.")
    if prompt_type == "Text" and not prompts.get("texts"):
        raise gr.Error("At least one text prompt is required.")
    if prompt_type == "Visual" and prompts.get("images") is None:
        raise gr.Error("A prompt image is required for visual prompting.")

    processor, model = init_model(model_id)

    # PIL reports size as (width, height); it is reversed here to
    # (height, width) for the post-processing target sizes.
    target_sizes = torch.tensor([target_image.size[::-1]])

    if prompt_type == "Text":
        texts = prompts["texts"]
        inputs = processor(
            images=target_image,
            text=texts,
            return_tensors="pt"
        ).to(DEVICE)

        with torch.no_grad():
            outputs = model(**inputs)

        result = processor.post_process_grounded_object_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=conf_thresh
        )[0]
        # Map label index -> query text, in the order the queries were given.
        class_names = dict(enumerate(texts))
    else:  # "Visual"
        inputs = processor(
            images=target_image,
            query_images=prompts["images"],
            return_tensors="pt"
        ).to(DEVICE)

        with torch.no_grad():
            outputs = model.image_guided_detection(**inputs)

        result = processor.post_process_image_guided_detection(
            outputs=outputs,
            target_sizes=target_sizes,
            threshold=conf_thresh,
            nms_threshold=iou_thresh
        )[0]

        # prepare for supervision: add 0 label for all boxes
        result['labels'] = torch.zeros(len(result['boxes']), dtype=torch.int64)
        class_names = {0: "object"}

    detections = sv.Detections.from_transformers(result, class_names)

    # Scale line thickness and label text to the image resolution.
    resolution_wh = target_image.size
    thickness = sv.calculate_optimal_line_thickness(resolution_wh=resolution_wh)
    text_scale = sv.calculate_optimal_text_scale(resolution_wh=resolution_wh)

    labels = [
        f"{class_name} {confidence:.2f}"
        for class_name, confidence
        in zip(detections['class_name'], detections.confidence)
    ]

    # Draw on a copy so the caller's image is left untouched.
    annotated_image = target_image.copy()
    annotated_image = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=thickness).annotate(
        scene=annotated_image, detections=detections)
    annotated_image = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX, text_scale=text_scale, smart_position=True).annotate(
        scene=annotated_image, detections=detections, labels=labels)

    return annotated_image


def app():
    """Build the detection UI and wire its event handlers.

    Creates the input column (target image, detect button, "Visual"/"Text"
    prompt tabs, model dropdown, threshold sliders), the output image, the two
    example galleries, and the callbacks that switch between prompt modes.

    Returns:
        Tuple ``(target_image, prompt_image, model_id, conf_thresh,
        iou_thresh, image_examples_list)``: the input components plus the list
        of visual-prompt example rows.
    """
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    target_image = gr.Image(type="pil", label="Target Image", visible=True, interactive=True)

                detect_button = gr.Button(value="Detect Objects")
                # Hidden textbox used as state: holds the active prompt mode
                # and is passed to run_inference as an input.
                prompt_type = gr.Textbox(value='Visual', visible=False)  # Default prompt type

                with gr.Tab("Visual") as visual_tab:
                    with gr.Row():
                        prompt_image = gr.Image(type="pil", label="Prompt Image", visible=True, interactive=True)

                with gr.Tab("Text") as text_tab:
                    texts = gr.Textbox(label="Input Texts", value='', placeholder='person,bus', visible=True, interactive=True)

                # Selecting a tab records the mode; switching to "Text" also
                # clears and hides the prompt image.
                visual_tab.select(
                    fn=lambda: ("Visual", gr.update(visible=True)),
                    inputs=None,
                    outputs=[prompt_type, prompt_image]
                )

                text_tab.select(
                    fn=lambda: ("Text", gr.update(value=None, visible=False)),
                    inputs=None,
                    outputs=[prompt_type, prompt_image]
                )

                model_id = gr.Dropdown(
                    label="Model",
                    choices=[
                        "google/owlv2-base-patch16-ensemble",
                        "google/owlv2-large-patch14"
                    ],
                    value="google/owlv2-base-patch16-ensemble",
                )
                conf_thresh = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.25,
                )
                iou_thresh = gr.Slider(
                    label="IoU Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.70,
                )

            with gr.Column():
                output_image = gr.Image(type="numpy", label="Annotated Image", visible=True)


        def run_inference(prompt_image, target_image, texts, model_id, conf_thresh, iou_thresh, prompt_type):
            """Validate the UI inputs, build the prompt dict and run detection.

            Raises:
                gr.Error: If the target image or the prompt required by the
                    active mode is missing, or the mode is unknown.
            """
            if target_image is None:
                raise gr.Error("Please provide a target image.")

            # add text/built-in prompts
            if prompt_type == "Text":
                # Skip empty entries so a blank box or a stray comma does not
                # turn into an empty query string.
                queries = [text.strip() for text in (texts or "").split(',') if text.strip()]
                if not queries:
                    raise gr.Error("Please enter at least one text prompt.")
                prompts = {
                    "texts": queries
                }
            # add visual prompt
            elif prompt_type == "Visual":
                if prompt_image is None:
                    raise gr.Error("Please provide a prompt image.")
                prompts = {
                    "images": prompt_image,
                }
            else:
                # Previously this left `prompts` unbound (UnboundLocalError).
                raise gr.Error(f"Unsupported prompt type: {prompt_type}")

            return inference(prompts, target_image, model_id, conf_thresh, iou_thresh, prompt_type)

        detect_button.click(
            fn=run_inference,
            inputs=[prompt_image, target_image, texts, model_id, conf_thresh, iou_thresh, prompt_type],
            outputs=[output_image],
        )

        ###################### Examples ##########################
        # Rows match the `inputs` of image_examples below:
        # target image, prompt image, model id, confidence, IoU.
        image_examples_list = [[
                "test-data/target1.jpg",
                "test-data/prompt1.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
            [
                "test-data/target2.jpg",
                "test-data/prompt2.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
            [
                "test-data/target3.jpg",
                "test-data/prompt3.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ],
            [
                "test-data/target4.jpg",
                "test-data/prompt4.jpg",
                "google/owlv2-base-patch16-ensemble",
                0.9,
                0.3,
            ]
            ]

        # Rows: target image, comma-separated queries, model id, confidence.
        text_examples = gr.Examples(
            examples=[[
                "test-data/target1.jpg",
                "logo",
                "google/owlv2-base-patch16-ensemble",
                0.3],
                [
                "test-data/target2.jpg",
                "cat,remote",
                "google/owlv2-base-patch16-ensemble",
                0.3],
                [
                "test-data/target3.jpg",
                "frog,spider,lizard",
                "google/owlv2-base-patch16-ensemble",
                0.3],
                [
                "test-data/target4.jpg",
                "cat",
                "google/owlv2-base-patch16-ensemble",
                0.3]
            ],
            inputs=[target_image, texts, model_id, conf_thresh],
            visible=False, cache_examples=False, label="Text Prompt Examples")

        image_examples = gr.Examples(
            examples=image_examples_list,
            inputs=[target_image, prompt_image, model_id, conf_thresh, iou_thresh],
            visible=True, cache_examples=False, label="Box Visual Prompt Examples")

        # Examples update: show only the gallery matching the active tab, and
        # hide the IoU slider in text mode (it is only forwarded to
        # post-processing in visual mode).
        def update_text_examples():
            return gr.Dataset(visible=True), gr.Dataset(visible=False), gr.update(visible=False)

        def update_visual_examples():
            return gr.Dataset(visible=False), gr.Dataset(visible=True), gr.update(visible=True)

        text_tab.select(
            fn=update_text_examples,
            inputs=None,
            outputs=[text_examples.dataset, image_examples.dataset, iou_thresh]
        )

        visual_tab.select(
            fn=update_visual_examples,
            inputs=None,
            outputs=[text_examples.dataset, image_examples.dataset, iou_thresh]
        )

        return target_image, prompt_image, model_id, conf_thresh, iou_thresh, image_examples_list

# Top-level page: title, description, then the detection UI built by app().
gradio_app = gr.Blocks()
with gradio_app:
    gr.HTML(
        """
    <h1 style='text-align: center'>OWLv2: Zero-shot detection with visual prompt 👀</h1>
    """)
    # The snippet below is display-only. Its target size now uses
    # `target_image`, the name the snippet itself defines (it previously
    # referenced an undefined `image`).
    gr.Markdown("""
    This demo showcases the OWLv2 model's ability to perform zero-shot object detection using visual and text prompts. 

    You can either provide a text prompt or an image as a visual prompt to detect objects in the target image.

    For visual prompting, following sample code is used, taken from the HF documentation:
    ```python
       processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
       model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

       target_image = Image.open(...)
       prompt_image = Image.open(...)
       inputs = processor(images=target_image, query_images=prompt_image, return_tensors="pt")

       # forward pass
       with torch.no_grad():
           outputs = model.image_guided_detection(**inputs)

       target_sizes = torch.Tensor([target_image.size[::-1]])

       results = processor.post_process_image_guided_detection(outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes)
    ```

    For some reason, visual prompt works much worse than text, perhaps it's HF implementation issue.
    """)

    with gr.Row():
        with gr.Column():
            # Create a list of all UI components
            ui_components = app()
            # Unpack the components
            target_image, prompt_image, model_id, conf_thresh, iou_thresh, image_examples_list = ui_components

    # On page load, pre-fill the inputs with the second visual example
    # (index 1); its five values line up with the five output components.
    gradio_app.load(
        fn=lambda: image_examples_list[1],
        outputs=[target_image, prompt_image, model_id, conf_thresh, iou_thresh]
    )


# NOTE(review): only "figures" is whitelisted here, while the example images
# are referenced under "test-data/" — confirm this is the intended path.
gradio_app.launch(allowed_paths=["figures"])