import gradio as gr
import spaces
import torch
import base64
from PIL import Image, ImageDraw
from io import BytesIO
import re

from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl2.utils.io import load_pil_images


from transformers import AutoModelForCausalLM



models = {
    "deepseek-ai/deepseek-vl2-tiny": AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-vl2-tiny", trust_remote_code=True),
    #"deepseek-ai/deepseek-vl2-small": AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-vl2-small", trust_remote_code=True),
    #"deepseek-ai/deepseek-vl2": AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-vl2", trust_remote_code=True)
}

processors = {
    "deepseek-ai/deepseek-vl2-tiny": DeepseekVLV2Processor.from_pretrained("deepseek-ai/deepseek-vl2-tiny",),
    #"deepseek-ai/deepseek-vl2-small": DeepseekVLV2Processor.from_pretrained("deepseek-ai/deepseek-vl2-small",),
    #"deepseek-ai/deepseek-vl2": DeepseekVLV2Processor.from_pretrained("deepseek-ai/deepseek-vl2",),
}


def image_to_base64(image):
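    # Encode a PIL image as a base64 PNG string. Not referenced elsewhere in
    # this script; kept as a convenience helper.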
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str


def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2):
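    # Draw each [xmin, ymin, xmax, ymax] box directly onto the passed image
    # (ImageDraw mutates it in place) and return it for display.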
    draw = ImageDraw.Draw(image)
    for box in bounding_boxes:
        xmin, ymin, xmax, ymax = box
        draw.rectangle([xmin, ymin, xmax, ymax], outline=outline_color, width=line_width)
    return image


def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
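    # Map boxes from the default 1000x1000 reference grid used for the model's
    # detections back to pixel coordinates. E.g. [100, 200, 500, 600] on a
    # 2000x1000 image becomes [200.0, 200.0, 1000.0, 600.0]
    # (x_scale=2.0, y_scale=1.0).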
    x_scale = original_width / scaled_width
    y_scale = original_height / scaled_height
    rescaled_boxes = []
    for box in bounding_boxes:
        xmin, ymin, xmax, ymax = box
        rescaled_box = [
            xmin * x_scale,
            ymin * y_scale,
            xmax * x_scale,
            ymax * y_scale
        ]
        rescaled_boxes.append(rescaled_box)
    return rescaled_boxes


def deepseek(image, text_input, model_id):
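    # Visual grounding: wrapping the query in <|ref|>...<|/ref|> asks the model
    # to localize it; the reply is expected to embed
    # <|det|>[[x1, y1, x2, y2]]<|/det|> with coordinates on a 0-999 grid,
    # which rescale_bounding_boxes maps back to pixel space.
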
    # look up the preloaded processor and model for the selected checkpoint
    vl_chat_processor: DeepseekVLV2Processor = processors[model_id]
    tokenizer = vl_chat_processor.tokenizer

    vl_gpt: DeepseekVLV2ForCausalLM = models[model_id]
    # cast to bfloat16 and move to the GPU attached for this request
    vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

    ## single image conversation example
    conversation = [
        {
            "role": "<|User|>",
            "content": f"<image><|ref|>{text_input}<|/ref|>.",
            "images": ["./images/visual_grounding_1.jpeg"],
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    # load images and prepare for inputs
    #pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=[image],
        force_batchify=True,
        system_prompt=""
    ).to(vl_gpt.device)

    # run image encoder to get the image embeddings
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # run the model to get the response
    outputs = vl_gpt.language.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=False)

    print(f"{prepare_inputs['sft_format'][0]}", answer)
    det_pattern = r"<\|det\|>\[\[(.+)]]<\|\/det\|>"

    det_match = re.search(det_pattern, answer)
    if det_match is None:
        return text_input, [], image
    
    det_content = det_match.group(1)
    bbox = [int(v.strip()) for v in det_content.split(",")]

    scaled_boxes = rescale_bounding_boxes([bbox], image.width, image.height)
    return answer, scaled_boxes, draw_bounding_boxes(image, scaled_boxes)


# On ZeroGPU Spaces, @spaces.GPU attaches a GPU only while this function runs.
@spaces.GPU
def run_example(image, text_input, model_id="deepseek-ai/deepseek-vl2-tiny"):
    return deepseek(image, text_input, model_id)
    
css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""
with gr.Blocks(css=css) as demo:
    gr.Markdown(
    """
    # Demo for Deepseek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding
    """)
    with gr.Row():
        with gr.Column():
            input_img = gr.Image(label="Input Image", type="pil")
            model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="deepseek-ai/deepseek-vl2-tiny")
            text_input = gr.Textbox(label="User Prompt")
            submit_btn = gr.Button(value="Submit")
        with gr.Column():
            model_output_text = gr.Textbox(label="Model Output Text")
            model_output_box = gr.Textbox(label="Model Output Box")
            annotated_image = gr.Image(label="Annotated Image")

    gr.Examples(
        examples=[
            ["assets/web_6f93090a-81f6-489e-bb35-1a2838b18c01.png", "select search textfield"],
            ["assets/web_6f93090a-81f6-489e-bb35-1a2838b18c01.png", "switch to discussions"],
        ],
        inputs=[input_img, text_input],
        outputs=[model_output_text, model_output_box, annotated_image],
        fn=run_example,
        cache_examples=True,
        label="Try examples"
    )

    submit_btn.click(run_example, [input_img, text_input, model_selector], [model_output_text, model_output_box, annotated_image])

demo.launch(debug=True)