DeepSeek-Vl-UI / app.py
programmnix-askui's picture
Enable small models
5b056c1
raw
history blame
6.38 kB
import gradio as gr
import spaces
import torch
import base64
from PIL import Image, ImageDraw
from io import BytesIO
import re
from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from deepseek_vl2.utils.io import load_pil_images
from transformers import AutoModelForCausalLM
# Causal-LM checkpoints offered by the demo, keyed by Hugging Face repo id.
# Every repo id listed in the tuple is loaded once, when this module is imported;
# the larger variants stay commented out, so only the tiny model is available.
models = {
    repo_id: AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
    for repo_id in (
        "deepseek-ai/deepseek-vl2-tiny",
        # "deepseek-ai/deepseek-vl2-small",
        # "deepseek-ai/deepseek-vl2",
    )
}
# Chat processors matching the checkpoints above, keyed by the same repo ids.
# Each processor is created once at import time; `deepseek` looks it up by model id
# and also takes its tokenizer from it.
processors = {
    repo_id: DeepseekVLV2Processor.from_pretrained(repo_id)
    for repo_id in (
        "deepseek-ai/deepseek-vl2-tiny",
        # "deepseek-ai/deepseek-vl2-small",
        # "deepseek-ai/deepseek-vl2",
    )
}
def image_to_base64(image):
    """Serialize *image* as PNG and return the data as a base64 string.

    Args:
        image: Object exposing ``save(fp, format=...)``; the app works with
            PIL images.

    Returns:
        The base64 encoding of the written PNG bytes, as a UTF-8 decoded ``str``.
    """
    with BytesIO() as png_buffer:
        # Write the PNG into memory instead of touching the file system.
        image.save(png_buffer, format="PNG")
        raw_png = png_buffer.getvalue()
    return base64.b64encode(raw_png).decode("utf-8")
def draw_bounding_boxes(image, bounding_boxes, outline_color="red", line_width=2):
    """Outline every box on *image* and return that same image object.

    The drawing happens in place: the image passed in is modified, not copied.

    Args:
        image: Image handed to ``ImageDraw.Draw``.
        bounding_boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` sequences.
        outline_color: Outline colour forwarded to ``rectangle``.
        line_width: Outline width forwarded to ``rectangle``.

    Returns:
        The input ``image`` with the rectangles drawn on it.
    """
    canvas = ImageDraw.Draw(image)
    for left, top, right, bottom in bounding_boxes:
        canvas.rectangle(
            [left, top, right, bottom],
            outline=outline_color,
            width=line_width,
        )
    return image
def rescale_bounding_boxes(bounding_boxes, original_width, original_height, scaled_width=1000, scaled_height=1000):
    """Map boxes from a ``scaled_width`` x ``scaled_height`` grid to image pixels.

    Args:
        bounding_boxes: Iterable of ``(xmin, ymin, xmax, ymax)`` sequences
            expressed on the scaled grid.
        original_width: Width of the target image.
        original_height: Height of the target image.
        scaled_width: Width of the grid the boxes are expressed in.
        scaled_height: Height of the grid the boxes are expressed in.

    Returns:
        A new list of ``[xmin, ymin, xmax, ymax]`` lists; x values are
        multiplied by ``original_width / scaled_width`` and y values by
        ``original_height / scaled_height``.
    """
    # One factor per axis, computed once and reused for every box.
    x_factor = original_width / scaled_width
    y_factor = original_height / scaled_height
    return [
        [left * x_factor, top * y_factor, right * x_factor, bottom * y_factor]
        for left, top, right, bottom in bounding_boxes
    ]
def _parse_det_boxes(answer):
    """Return every 4-integer box found inside ``<|det|>...<|/det|>`` spans of *answer*."""
    boxes = []
    # Non-greedy match so several separate <|det|> spans are handled one at a time.
    for det_body in re.findall(r"<\|det\|>(.*?)<\|/det\|>", answer, flags=re.DOTALL):
        # A span may hold one box ([[a, b, c, d]]) or several ([[...], [...]]).
        quads = re.findall(
            r"\[\s*(-?\d+)\s*,\s*(-?\d+)\s*,\s*(-?\d+)\s*,\s*(-?\d+)\s*\]",
            det_body,
        )
        boxes.extend([int(value) for value in quad] for quad in quads)
    return boxes


def deepseek(image, text_input, model_id):
    """Ground *text_input* in *image* with a DeepSeek-VL2 model and draw the result.

    Args:
        image: Input image; the UI supplies a PIL image. Its ``width`` and
            ``height`` are used to rescale the boxes, and the boxes are drawn
            directly onto it.
        text_input: Phrase to locate; it is wrapped in ``<|ref|>`` tags in the prompt.
        model_id: Key into the module-level ``models`` and ``processors`` dicts.

    Returns:
        A tuple ``(answer, boxes, image)``: the decoded model output (special
        tokens kept), the list of ``[xmin, ymin, xmax, ymax]`` boxes rescaled
        to the image size (empty when the output contains no parsable box),
        and the image with the boxes drawn on it.

    Raises:
        KeyError: If ``model_id`` is not present in ``models`` / ``processors``.
    """
    vl_chat_processor: DeepseekVLV2Processor = processors[model_id]
    tokenizer = vl_chat_processor.tokenizer
    vl_gpt: DeepseekVLV2ForCausalLM = models[model_id]
    # Cast to bfloat16, move to the GPU and switch to inference mode on every call.
    vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

    # Single-turn grounding conversation. The "images" path is only a
    # placeholder entry: the actual image is passed to the processor below.
    conversation = [
        {
            "role": "<|User|>",
            "content": f"<image><|ref|>{text_input}<|/ref|>.",
            "images": ["./images/visual_grounding_1.jpeg"],
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    prepare_inputs = vl_chat_processor(
        conversations=conversation,
        images=[image],
        force_batchify=True,
        system_prompt=""
    ).to(vl_gpt.device)

    with torch.no_grad():
        # incremental_prefilling already returns the input embeddings, so no
        # separate prepare_inputs_embeds pass is needed before it.
        inputs_embeds, past_key_values = vl_gpt.incremental_prefilling(
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            chunk_size=512  # prefilling size
        )

        # Greedy decoding (do_sample=False) of at most 512 new tokens.
        outputs = vl_gpt.generate(
            inputs_embeds=inputs_embeds,
            input_ids=prepare_inputs.input_ids,
            images=prepare_inputs.images,
            images_seq_mask=prepare_inputs.images_seq_mask,
            images_spatial_crop=prepare_inputs.images_spatial_crop,
            attention_mask=prepare_inputs.attention_mask,
            past_key_values=past_key_values,
            pad_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
        )

    # Drop the prompt tokens; keep special tokens because the <|det|> tags
    # are needed to locate the boxes.
    prompt_length = len(prepare_inputs.input_ids[0])
    answer = tokenizer.decode(outputs[0][prompt_length:].cpu().tolist(), skip_special_tokens=False)
    print(f"{prepare_inputs['sft_format'][0]}", answer)

    boxes = _parse_det_boxes(answer)
    if not boxes:
        # Nothing to draw: surface what the model actually said rather than
        # echoing the user's prompt back as the model output.
        return answer, [], image

    # Boxes are rescaled with rescale_bounding_boxes' default 1000x1000 grid.
    scaled_boxes = rescale_bounding_boxes(boxes, image.width, image.height)
    return answer, scaled_boxes, draw_bounding_boxes(image, scaled_boxes)
@spaces.GPU
def run_example(image, text_input, model_id="deepseek-ai/deepseek-vl2-tiny"):
    """Gradio callback: run grounding for one image/prompt pair.

    Thin wrapper around ``deepseek``; ``model_id`` defaults to the tiny
    checkpoint so the callback also works when only an image and a prompt
    are supplied.
    NOTE(review): ``spaces.GPU`` presumably requests a GPU for the call on
    Hugging Face Spaces -- confirm against the ``spaces`` package docs.

    Returns:
        Whatever ``deepseek`` returns for the given arguments.
    """
    return deepseek(image=image, text_input=text_input, model_id=model_id)
# Custom stylesheet handed to gr.Blocks(css=...): gives the element with id
# "output" a fixed 500px height, scrollbars on overflow and a light grey border.
# NOTE(review): no component in the visible code sets elem_id="output", so this
# rule may currently match nothing -- confirm.
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
# Build the demo page: inputs on the left, model outputs on the right, cached
# examples underneath, then start the app.
# NOTE(review): the nesting below is reconstructed; confirm the placement of
# gr.Examples and the click handler against the deployed layout.
with gr.Blocks(css=css) as demo:
    gr.Markdown(
        """
# Demo for Deepseek-VL2: Mixture-of-Experts Vision-Language Models for Advanced Multimodal Understanding
"""
    )

    with gr.Row():
        # Input side: image, model choice, prompt and the submit button.
        with gr.Column():
            input_img = gr.Image(label="Input Image", type="pil")
            model_selector = gr.Dropdown(
                choices=list(models),
                label="Model",
                value="deepseek-ai/deepseek-vl2-tiny",
            )
            text_input = gr.Textbox(label="User Prompt")
            submit_btn = gr.Button(value="Submit")

        # Output side: the three values returned by run_example.
        with gr.Column():
            model_output_text = gr.Textbox(label="Model Output Text")
            model_output_box = gr.Textbox(label="Model Output Box")
            annotated_image = gr.Image(label="Annotated Image")

    # Examples supply only image and prompt, so run_example falls back to its
    # default model_id for them.
    gr.Examples(
        examples=[
            ["assets/web_6f93090a-81f6-489e-bb35-1a2838b18c01.png", "select search textfield"],
            ["assets/web_6f93090a-81f6-489e-bb35-1a2838b18c01.png", "switch to discussions"],
        ],
        inputs=[input_img, text_input],
        outputs=[model_output_text, model_output_box, annotated_image],
        fn=run_example,
        cache_examples=True,
        label="Try examples",
    )

    submit_btn.click(
        fn=run_example,
        inputs=[input_img, text_input, model_selector],
        outputs=[model_output_text, model_output_box, annotated_image],
    )

demo.launch(debug=True)