Spaces:

HarborYuan
/

ovsam

Runtime error

File size: 14,612 Bytes

9cc3eb2
 
 
 
 
 
 
 
 
4a2e688
9cc3eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fb6bea
9cc3eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1435cf
 
 
 
 
359d34b
 
f1435cf
9cc3eb2
 
 
 
 
 
 
 
 
 
 
 
4a2e688
9cc3eb2
 
 
 
 
 
 
 
 
 
1cf72c0
9cc3eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a2e688
9cc3eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cf72c0
9cc3eb2
 
 
 
 
 
9cb6e3b
 
9cc3eb2
 
 
 
 
 
 
 
9cb6e3b
f1435cf
9cb6e3b
f1435cf
9cb6e3b
 
 
9cc3eb2
9cb6e3b
 
 
 
 
 
 
 
 
 
9cc3eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cf72c0
9cc3eb2
 
 
 
 
 
9cb6e3b
 
9cc3eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9cb6e3b
f1435cf
9cb6e3b
f1435cf
9cb6e3b
 
 
9cc3eb2
9cb6e3b
 
 
 
 
 
 
 
 
 
9cc3eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aaf167c
9cc3eb2
 
 
 
 
 
 
 
 
4a2e688
 
 
 
 
 
 
 
f1435cf
4a2e688
 
 
 
 
 
 
 
1cf72c0
9cc3eb2
 
 
 
1cf72c0
9cc3eb2
 
 
 
9cb6e3b
 
 
1cf72c0
9cc3eb2
 
 
9a52c51
 
9cc3eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cf72c0
9cc3eb2
 
70aa930
 
9cc3eb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cf72c0
9cc3eb2
 
70aa930
 
9cc3eb2
 
 
 
 
 
1cf72c0
9cc3eb2
 
 
 
1cf72c0
9cc3eb2
 
 
 
 
 
1cf72c0
9cc3eb2
 
 
1cf72c0
9cc3eb2
 
 
 
1cf72c0
9cc3eb2
 
 
aaf167c
9cc3eb2
 
 
 
 
 
1cf72c0
9cc3eb2
 
 
 
1cf72c0
9cc3eb2
 
 
 
 
 
1cf72c0
9cc3eb2
 
 
 
1cf72c0
9cc3eb2
 
 
 
1cf72c0
9cc3eb2
 
 
 
1cf72c0
9cc3eb2
 
 
 
1cf72c0
9cc3eb2
 
 
 
1cf72c0
9cc3eb2

import gradio as gr
import numpy as np
from PIL import ImageDraw, Image

import torch
import torch.nn.functional as F

# mm libs
from mmdet.registry import MODELS
from mmengine import Config, print_log
from mmengine.structures import InstanceData

from ext.class_names.lvis_list import LVIS_CLASSES

LVIS_NAMES = LVIS_CLASSES

# Description
title = "<center><strong><font size='8'>Open-Vocabulary SAM<font></strong></center>"

css = "h1 { text-align: center } .about { text-align: justify; padding-left: 10%; padding-right: 10%; }"

model_cfg = Config.fromfile('app/configs/sam_r50x16_fpn.py')

examples = [
    ["app/assets/sa_01.jpg"],
    ["app/assets/sa_224028.jpg"],
    ["app/assets/sa_227490.jpg"],
    ["app/assets/sa_228025.jpg"],
    ["app/assets/sa_234958.jpg"],
    ["app/assets/sa_235005.jpg"],
    ["app/assets/sa_235032.jpg"],
    ["app/assets/sa_235036.jpg"],
    ["app/assets/sa_235086.jpg"],
    ["app/assets/sa_235094.jpg"],
    ["app/assets/sa_235113.jpg"],
    ["app/assets/sa_235130.jpg"],
]
model = MODELS.build(model_cfg.model)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device=device)
model = model.eval()
model.init_weights()

mean = torch.tensor([123.675, 116.28, 103.53], device=device)[:, None, None]
std = torch.tensor([58.395, 57.12, 57.375], device=device)[:, None, None]


class IMGState:
    def __init__(self):
        self.img = None
        self.img_feat = None
        self.selected_points = []
        self.selected_points_labels = []
        self.selected_bboxes = []

        self.available_to_set = True

    def set_img(self, img, img_feat):
        self.img = img
        self.img_feat = img_feat

        self.available_to_set = False

    def clear(self):
        self.img = None
        self.img_feat = None
        self.selected_points = []
        self.selected_points_labels = []
        self.selected_bboxes = []

        self.available_to_set = True

    def clean(self):
        self.selected_points = []
        self.selected_points_labels = []
        self.selected_bboxes = []

    def to_device(self, device=device):
        if self.img_feat is not None:
            for k in self.img_feat:
                if isinstance(self.img_feat[k], torch.Tensor):
                    self.img_feat[k] = self.img_feat[k].to(device)
                elif isinstance(self.img_feat[k], tuple):
                    self.img_feat[k] = tuple(v.to(device) for v in self.img_feat[k])

    @property
    def available(self):
        return self.available_to_set


IMG_SIZE = 1024


def get_points_with_draw(image, img_state, evt: gr.SelectData):
    label = 'Add Mask'

    x, y = evt.index[0], evt.index[1]
    print_log(f"Point: {x}_{y}", logger='current')
    point_radius, point_color = 10, (97, 217, 54) if label == "Add Mask" else (237, 34, 13)

    img_state.selected_points.append([x, y])
    img_state.selected_points_labels.append(1 if label == "Add Mask" else 0)

    draw = ImageDraw.Draw(image)
    draw.ellipse(
        [(x - point_radius, y - point_radius), (x + point_radius, y + point_radius)],
        fill=point_color,
    )
    return img_state, image


def get_bbox_with_draw(image, img_state, evt: gr.SelectData):
    x, y = evt.index[0], evt.index[1]
    point_radius, point_color, box_outline = 5, (237, 34, 13), 2
    box_color = (237, 34, 13)

    if len(img_state.selected_bboxes) in [0, 1]:
        img_state.selected_bboxes.append([x, y])
    elif len(img_state.selected_bboxes) == 2:
        img_state.selected_bboxes = [[x, y]]
        image = Image.fromarray(img_state.img)
    else:
        raise ValueError(f"Cannot be {len(img_state.selected_bboxes)}")

    print_log(f"box_list: {img_state.selected_bboxes}", logger='current')

    draw = ImageDraw.Draw(image)
    draw.ellipse(
        [(x - point_radius, y - point_radius), (x + point_radius, y + point_radius)],
        fill=point_color,
    )

    if len(img_state.selected_bboxes) == 2:
        box_points = img_state.selected_bboxes
        bbox = (min(box_points[0][0], box_points[1][0]),
                min(box_points[0][1], box_points[1][1]),
                max(box_points[0][0], box_points[1][0]),
                max(box_points[0][1], box_points[1][1]),
                )
        draw.rectangle(
            bbox,
            outline=box_color,
            width=box_outline
        )
    return img_state, image


def segment_with_points(
        image,
        img_state,
):
    if img_state.available:
        return None, None, "State Error, please try again."
    output_img = img_state.img
    h, w = output_img.shape[:2]

    input_points = torch.tensor(img_state.selected_points, dtype=torch.float32, device=device)
    prompts = InstanceData(
        point_coords=input_points[None],
    )

    try:
        img_state.to_device()
        masks, cls_pred = model.extract_masks(img_state.img_feat, prompts)
        img_state.to_device('cpu')

        masks = masks[0, 0, :h, :w]
        masks = masks > 0.5

        cls_pred = cls_pred[0][0]
        scores, indices = torch.topk(cls_pred, 1)
        scores, indices = scores.tolist(), indices.tolist()
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            img_state.clear()
            print_log(f"CUDA OOM! please try again later", logger='current')
            return None, None, "CUDA OOM, please try again later."
        else:
            raise
    names = []
    for ind in indices:
        names.append(LVIS_NAMES[ind].replace('_', ' '))

    cls_info = ""
    for name, score in zip(names, scores):
        cls_info += "{} ({:.2f})".format(name, score)

    rgb_shape = tuple(list(masks.shape) + [3])
    color = np.zeros(rgb_shape, dtype=np.uint8)
    color[masks] = np.array([97, 217, 54])
    # color[masks] = np.array([217, 90, 54])
    output_img = (output_img * 0.7 + color * 0.3).astype(np.uint8)

    output_img = Image.fromarray(output_img)
    return img_state, image, output_img, cls_info


def segment_with_bbox(
        image,
        img_state
):
    if img_state.available:
        return None, None, "State Error, please try again."
    if len(img_state.selected_bboxes) != 2:
        return image, None, ""
    output_img = img_state.img
    h, w = output_img.shape[:2]

    box_points = img_state.selected_bboxes
    bbox = (
        min(box_points[0][0], box_points[1][0]),
        min(box_points[0][1], box_points[1][1]),
        max(box_points[0][0], box_points[1][0]),
        max(box_points[0][1], box_points[1][1]),
    )
    input_bbox = torch.tensor(bbox, dtype=torch.float32, device=device)
    prompts = InstanceData(
        bboxes=input_bbox[None],
    )

    try:
        img_state.to_device()
        masks, cls_pred = model.extract_masks(img_state.img_feat, prompts)
        img_state.to_device('cpu')

        masks = masks[0, 0, :h, :w]
        masks = masks > 0.5

        cls_pred = cls_pred[0][0]
        scores, indices = torch.topk(cls_pred, 1)
        scores, indices = scores.tolist(), indices.tolist()
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            img_state.clear()
            print_log(f"CUDA OOM! please try again later", logger='current')
            return None, None, "CUDA OOM, please try again later."
        else:
            raise
    names = []
    for ind in indices:
        names.append(LVIS_NAMES[ind].replace('_', ' '))

    cls_info = ""
    for name, score in zip(names, scores):
        cls_info += "{} ({:.2f})\n".format(name, score)

    rgb_shape = tuple(list(masks.shape) + [3])
    color = np.zeros(rgb_shape, dtype=np.uint8)
    color[masks] = np.array([97, 217, 54])
    # color[masks] = np.array([217, 90, 54])
    output_img = (output_img * 0.7 + color * 0.3).astype(np.uint8)

    output_img = Image.fromarray(output_img)
    return image, output_img, cls_info


def extract_img_feat(img, img_state):
    w, h = img.size
    scale = IMG_SIZE / max(w, h)
    new_w = int(w * scale)
    new_h = int(h * scale)
    img = img.resize((new_w, new_h), resample=Image.Resampling.BILINEAR)
    img_numpy = np.array(img)
    print_log(f"Successfully loaded an image with size {new_w} x {new_h}", logger='current')

    try:
        img_tensor = torch.tensor(img_numpy, device=device, dtype=torch.float32).permute((2, 0, 1))[None]
        img_tensor = (img_tensor - mean) / std
        img_tensor = F.pad(img_tensor, (0, IMG_SIZE - new_w, 0, IMG_SIZE - new_h), 'constant', 0)
        feat_dict = model.extract_feat(img_tensor)
        img_state.set_img(img_numpy, feat_dict)
        img_state.to_device('cpu')
        print_log(f"Successfully generated the image feats.", logger='current')
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            img_state.clear()
            print_log(f"CUDA OOM! please try again later", logger='current')
            return None, None, "CUDA OOM, please try again later."
        else:
            raise
    return img_state, img, None, "Please try to click something."


def clear_everything(img_state):
    img_state.clear()
    return img_state, None, None, "Please try to click something."


def clean_prompts(img_state):
    img_state.clean()
    if img_state.img is None:
        img_state.clear()
        return None, None, "Please try to click something."
    return img_state, Image.fromarray(img_state.img), None, "Please try to click something."


def register_point_mode():
    img_state_points = gr.State(value=IMGState())
    img_state_bbox = gr.State(value=IMGState())
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(title)

    # Point mode tab
    with gr.Tab("Point mode"):
        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                cond_img_p = gr.Image(label="Input Image", height=512, type="pil")

            with gr.Column(scale=1):
                segm_img_p = gr.Image(label="Segment", interactive=False, height=512, type="pil")

        with gr.Row():
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        clean_btn_p = gr.Button("Clean Prompts", variant="secondary")
                        clear_btn_p = gr.Button("Restart", variant="secondary")
            with gr.Column():
                cls_info = gr.Textbox("", label='Labels')

        with gr.Row():
            with gr.Column():
                gr.Markdown("Try some of the examples below ⬇️")
                gr.Examples(
                    examples=examples,
                    inputs=[cond_img_p, img_state_points],
                    outputs=[img_state_points, cond_img_p, segm_img_p, cls_info],
                    examples_per_page=12,
                    fn=extract_img_feat,
                    run_on_click=True,
                    cache_examples=False,
                )

    # box mode tab
    with gr.Tab("Box mode"):
        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                cond_img_bbox = gr.Image(label="Input Image", height=512, type="pil")

            with gr.Column(scale=1):
                segm_img_bbox = gr.Image(label="Segment", interactive=False, height=512, type="pil")

        with gr.Row():
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        clean_btn_bbox = gr.Button("Clean Prompts", variant="secondary")
                        clear_btn_bbox = gr.Button("Restart", variant="secondary")
            with gr.Column():
                cls_info_bbox = gr.Textbox("", label='Labels')

        with gr.Row():
            with gr.Column():
                gr.Markdown("Try some of the examples below ⬇️")
                gr.Examples(
                    examples=examples,
                    inputs=[cond_img_bbox, img_state_bbox],
                    outputs=[img_state_bbox, cond_img_bbox, segm_img_bbox, cls_info_bbox],
                    examples_per_page=12,
                    fn=extract_img_feat,
                    run_on_click=True,
                    cache_examples=False,
                )

    # extract image feature
    cond_img_p.upload(
        extract_img_feat,
        [cond_img_p, img_state_points],
        outputs=[img_state_points, cond_img_p, segm_img_p, cls_info]
    )
    cond_img_bbox.upload(
        extract_img_feat,
        [cond_img_bbox, img_state_bbox],
        outputs=[img_state_bbox, cond_img_bbox, segm_img_bbox, cls_info]
    )

    # get user added points
    cond_img_p.select(
        get_points_with_draw,
        [cond_img_p, img_state_points],
        outputs=[img_state_points, cond_img_p]
    ).then(
        segment_with_points,
        inputs=[cond_img_p, img_state_points],
        outputs=[img_state_points, cond_img_p, segm_img_p, cls_info]
    )
    cond_img_bbox.select(
        get_bbox_with_draw,
        [cond_img_bbox, img_state_bbox],
        outputs=[img_state_bbox, cond_img_bbox]
    ).then(
        segment_with_bbox,
        inputs=[cond_img_bbox, img_state_bbox],
        outputs=[cond_img_bbox, segm_img_bbox, cls_info_bbox]
    )

    # clean prompts
    clean_btn_p.click(
        clean_prompts,
        inputs=[img_state_points],
        outputs=[img_state_points, cond_img_p, segm_img_p, cls_info]
    )
    clean_btn_bbox.click(
        clean_prompts,
        inputs=[img_state_bbox],
        outputs=[img_state_bbox, cond_img_bbox, segm_img_bbox, cls_info_bbox]
    )

    # clear
    clear_btn_p.click(
        clear_everything,
        inputs=[img_state_points],
        outputs=[img_state_points, cond_img_p, segm_img_p, cls_info]
    )
    cond_img_p.clear(
        clear_everything,
        inputs=[img_state_points],
        outputs=[img_state_points, cond_img_p, segm_img_p, cls_info]
    )
    segm_img_p.clear(
        clear_everything,
        inputs=[img_state_points],
        outputs=[img_state_points, cond_img_p, segm_img_p, cls_info]
    )
    clear_btn_bbox.click(
        clear_everything,
        inputs=[img_state_bbox],
        outputs=[img_state_bbox, cond_img_bbox, segm_img_bbox, cls_info_bbox]
    )
    cond_img_bbox.clear(
        clear_everything,
        inputs=[img_state_bbox],
        outputs=[img_state_bbox, cond_img_bbox, segm_img_bbox, cls_info_bbox]
    )
    segm_img_bbox.clear(
        clear_everything,
        inputs=[img_state_bbox],
        outputs=[img_state_bbox, cond_img_bbox, segm_img_bbox, cls_info_bbox]
    )


if __name__ == '__main__':
    with gr.Blocks(css=css, title="Open-Vocabulary SAM") as demo:
        register_point_mode()
    demo.queue()
    demo.launch(show_api=False)