import gradio as gr
import supervision as sv
import torch
import spaces

from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference, CHECKPOINTS
from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
    CAPTION_TASK_NAMES, CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME

MARKDOWN = """
# Better Florence-2 Playground 🔥
<div>
    <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-florence-2-on-detection-dataset.ipynb">
        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
    </a>
    <a href="https://blog.roboflow.com/florence-2/">
        <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
    </a>
    <a href="https://arxiv.org/abs/2311.06242">
        <img src="https://img.shields.io/badge/arXiv-2311.06242-b31b1b.svg" alt="arXiv" style="display:inline-block;">
    </a>
    <a href="https://www.youtube.com/watch?v=i3KjYgxNH6w">
        <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
    </a>
</div>

Florence-2 is a lightweight vision-language model open-sourced by Microsoft under the 
MIT license. The model demonstrates strong zero-shot and fine-tuning capabilities 
across tasks such as captioning, object detection, grounding, and segmentation.

The model takes an image and a task prompt as input and generates the result as
text. It uses a DaViT vision encoder to convert the image into visual token
embeddings, which are then concatenated with BERT-generated text embeddings and
processed by a transformer-based multi-modal encoder-decoder to generate the response.
"""

OBJECT_DETECTION_EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
]
CAPTION_EXAMPLES = [
    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"],
    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"],
    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
]
OCR_EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg"],
]
OCR_WITH_REGION_EXAMPLES = [
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg"],
    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg"]
]

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
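# Load every Florence-2 checkpoint and its processor up front so switching
# models in the UI never triggers a reload.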
MODELS, PROCESSORS = load_models(DEVICE)
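

# For reference, a minimal sketch of the generate-and-parse loop that
# utils.models.run_inference is assumed to wrap, following the published
# Florence-2 usage pattern. It is illustrative only and is not called by the
# app; names and defaults here are assumptions, not the app's actual code.
def _example_inference(model, processor, device, image, task="<OD>"):
    # Assumes the model and inputs share a dtype (e.g. fp32).
    inputs = processor(text=task, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
        do_sample=False,
    )
    # Keep special tokens: the task and location tokens are needed for parsing.
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False)[0]
    # post_process_generation turns the raw text into a structured dict, e.g.
    # {"<OD>": {"bboxes": [...], "labels": [...]}} for detection tasks.
    return processor.post_process_generation(
        generated_text, task=task, image_size=image.size)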


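# Dispatch on the selected task: region tasks (object detection, OCR with
# region) return an annotated image, while caption and plain OCR tasks return
# text. @spaces.GPU requests a GPU for the duration of the call on ZeroGPU
# Spaces.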
@spaces.GPU
def process(checkpoint_dropdown, task_dropdown, image_input):
    model = MODELS[checkpoint_dropdown]
    processor = PROCESSORS[checkpoint_dropdown]
    task = TASKS[task_dropdown]
    if task_dropdown in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
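        # supervision parses the raw Florence-2 response into Detections
        # (boxes and labels) so they can be drawn onto the input image.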
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
        return annotate_with_boxes(image_input, detections)
    elif task_dropdown in CAPTION_TASK_NAMES or task_dropdown == OCR_TASK_NAME:
        _, response = run_inference(
            model, processor, DEVICE, image_input, task)
        return response[task]


image_output_component = None
text_output_component = None
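# Module-level placeholders; the gr.render callbacks below rebind them to the
# freshly created output widget so the Examples blocks can reference it.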


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        checkpoint_dropdown_component = gr.Dropdown(
            choices=CHECKPOINTS,
            value=CHECKPOINTS[0],
            label="Model", info="Select a Florence 2 model to use.")
        task_dropdown_component = gr.Dropdown(
            choices=TASK_NAMES,
            value=TASK_NAMES[0],
            label="Task", info="Select a task to perform with the model.")

    with gr.Row():
        with gr.Column():
            image_input_component = gr.Image(type='pil', label='Image Input')
            submit_button_component = gr.Button(value='Submit', variant='primary')

        with gr.Column():
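            # gr.render re-runs this block whenever the task dropdown changes,
            # swapping the output widget (image vs. textbox) and rewiring the
            # submit button to match.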
            @gr.render(inputs=task_dropdown_component)
            def show_output(text):
                global image_output_component
                global text_output_component
                if text in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
                    image_output_component = gr.Image(type='pil', label='Image Output')
                    submit_button_component.click(
                        fn=process,
                        inputs=[
                            checkpoint_dropdown_component,
                            task_dropdown_component,
                            image_input_component
                        ],
                        outputs=image_output_component
                    )
                elif text in CAPTION_TASK_NAMES or text == OCR_TASK_NAME:
                    text_output_component = gr.Textbox(label='Caption Output')
                    submit_button_component.click(
                        fn=process,
                        inputs=[
                            checkpoint_dropdown_component,
                            task_dropdown_component,
                            image_input_component
                        ],
                        outputs=text_output_component
                    )

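    # Rebuild the Examples gallery for the selected task, pointing it at
    # whichever output component the render block above created.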
    @gr.render(inputs=task_dropdown_component)
    def show_examples(text):
        if text == OBJECT_DETECTION_TASK_NAME:
            gr.Examples(
                fn=process,
                examples=OBJECT_DETECTION_EXAMPLES,
                inputs=[
                    checkpoint_dropdown_component,
                    task_dropdown_component,
                    image_input_component
                ],
                outputs=image_output_component
            )
        elif text in CAPTION_TASK_NAMES:
            gr.Examples(
                fn=process,
                examples=CAPTION_EXAMPLES,
                inputs=[
                    checkpoint_dropdown_component,
                    task_dropdown_component,
                    image_input_component
                ],
                outputs=text_output_component
            )
        elif text == OCR_TASK_NAME:
            gr.Examples(
                fn=process,
                examples=OCR_EXAMPLES,
                inputs=[
                    checkpoint_dropdown_component,
                    task_dropdown_component,
                    image_input_component
                ],
                outputs=text_output_component
            )
        elif text == OCR_WITH_REGION_TASK_NAME:
            gr.Examples(
                fn=process,
                examples=OCR_WITH_REGION_EXAMPLES,
                inputs=[
                    checkpoint_dropdown_component,
                    task_dropdown_component,
                    image_input_component
                ],
                outputs=image_output_component
            )

demo.launch(debug=False, show_error=True, max_threads=1)