Spaces:

SkalskiP
/

better-florence-2

Running on Zero

File size: 6,658 Bytes

1858b2a
 
9c79daa
1858b2a
9c79daa
 
1858b2a
 
9c79daa
 
 
5d15f06
c3f2745
1858b2a
 
 
9c79daa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b99a8a
 
 
 
 
 
 
 
 
9c79daa
1858b2a
 
 
 
 
 
 
 
3b99a8a
9c79daa
eba8e42
 
9c79daa
 
 
 
1858b2a
 
 
 
 
 
9c79daa
 
 
1858b2a
3b99a8a
9c79daa
 
 
 
1858b2a
3b99a8a
9c79daa
 
1858b2a
9c79daa
 
 
 
 
 
 
 
1858b2a
 
 
9c79daa
 
 
1858b2a
 
 
9c79daa
 
 
1858b2a
 
 
 
9c79daa
 
 
eba8e42
 
1858b2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eba8e42
1858b2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9c79daa

from typing import Tuple, Optional

import gradio as gr
import spaces
import supervision as sv
import torch
from gradio_image_prompter import ImagePrompter
from PIL import Image

from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference, CHECKPOINTS
from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
    CAPTION_TASK_NAMES, CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
    IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
    TEXTBOX_OUTPUT_TASK_NAMES

MARKDOWN = """
# Better Florence-2 Playground 🔥
<div>
    <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-florence-2-on-detection-dataset.ipynb">
        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
    </a>
    <a href="https://blog.roboflow.com/florence-2/">
        <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
    </a>
    <a href="https://arxiv.org/abs/2311.06242">
        <img src="https://img.shields.io/badge/arXiv-2311.06242-b31b1b.svg" alt="arXiv" style="display:inline-block;">
    </a>
    <a href="https://www.youtube.com/watch?v=i3KjYgxNH6w">
        <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
    </a>
</div>

Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the 
MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities 
across tasks such as captioning, object detection, grounding, and segmentation.

The model takes images and task prompts as input, generating the desired results in 
text format. It uses a DaViT vision encoder to convert images into visual token 
embeddings. These are then concatenated with BERT-generated text embeddings and 
processed by a transformer-based multi-modal encoder-decoder to generate the response.
"""
EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None]
]

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# DEVICE = "cuda"
MODELS, PROCESSORS = load_models(DEVICE)


@spaces.GPU
def process(
    checkpoint_dropdown,
    task_dropdown,
    image_input,
    image_prompter_input
) -> Tuple[Optional[Image.Image], Optional[str]]:
    model = MODELS[checkpoint_dropdown]
    processor = PROCESSORS[checkpoint_dropdown]
    task = TASKS[task_dropdown]

    if task_dropdown in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections), None
    elif task_dropdown in CAPTION_TASK_NAMES or task_dropdown == OCR_TASK_NAME:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        return None, response[task]


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        checkpoint_dropdown_component = gr.Dropdown(
            choices=CHECKPOINTS,
            value=CHECKPOINTS[0],
            label="Model", info="Select a Florence 2 model to use.",
            interactive=True
        )
        task_dropdown_component = gr.Dropdown(
            choices=TASK_NAMES,
            value=TASK_NAMES[0],
            label="Task", info="Select a task to perform with the model.",
            interactive=True
        )

    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(
                type='pil', label='Upload image')
            image_prompter_input_component = ImagePrompter(
                type='pil', label='Upload image and draw box prompt', visible=False)
            submit_button_component = gr.Button(value='Submit', variant='primary')

        with gr.Column():
            image_output_component = gr.Image(type='pil', label='Image Output')
            text_output_component = gr.Textbox(label='Caption Output', visible=False)
    with gr.Row():
        gr.Examples(
            fn=process,
            examples=EXAMPLES,
            inputs=[
                checkpoint_dropdown_component,
                task_dropdown_component,
                image_input_component,
                image_prompter_input_component
            ],
            outputs=[
                image_output_component,
                text_output_component
            ],
            run_on_click=True
        )

    def on_dropdown_change(text):
        return [
            gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
            ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
            gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
            gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
        ]

    task_dropdown_component.change(
        on_dropdown_change,
        inputs=[task_dropdown_component],
        outputs=[
            image_input_component,
            image_prompter_input_component,
            image_output_component,
            text_output_component
        ]
    )
    submit_button_component.click(
        fn=process,
        inputs=[
            checkpoint_dropdown_component,
            task_dropdown_component,
            image_input_component,
            image_prompter_input_component
        ],
        outputs=[
            image_output_component,
            text_output_component
        ]
    )

demo.launch(debug=False, show_error=True, max_threads=1)