# File size: 10,029 Bytes

import spaces
import multiprocessing as mp
import numpy as np
from PIL import Image,ImageDraw
import torch
try:
    import detectron2
except ImportError:
    # detectron2 is missing: make a best-effort install straight from GitHub.
    # pip is run through the current interpreter so the package is installed
    # into the environment executing this script. A failed install is not
    # fatal here; it surfaces as an ImportError on the detectron2 imports
    # that follow.
    import importlib
    import subprocess
    import sys

    subprocess.run(
        [
            sys.executable, "-m", "pip", "install",
            "git+https://github.com/facebookresearch/detectron2.git",
        ],
        check=False,
    )
    # Make sure the import system notices the freshly installed package.
    importlib.invalidate_caches()

from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.data.detection_utils import read_image
from mask_adapter import add_maskformer2_config, add_fcclip_config, add_mask_adapter_config
from mask_adapter.sam_maskadapter import SAMVisualizationDemo, SAMPointVisualizationDemo
import gradio as gr
import open_clip
from sam2.build_sam import build_sam2
from mask_adapter.modeling.meta_arch.mask_adapter_head import build_mask_adapter




def setup_cfg(config_file):
    """Build a frozen detectron2 config from ``config_file``.

    Starts from the default config, applies the DeepLab, MaskFormer2,
    FC-CLIP and Mask-Adapter config extensions in that order, merges the
    given config file on top and freezes the result before returning it.
    """
    config = get_cfg()
    extensions = (
        add_deeplab_config,
        add_maskformer2_config,
        add_fcclip_config,
        add_mask_adapter_config,
    )
    for extend in extensions:
        extend(config)
    config.merge_from_file(config_file)
    config.freeze()
    return config

class IMGState:
    """Per-session container for an image, its features and the prompts on it."""

    def __init__(self):
        self.img = None
        self.img_feat = None
        self._reset_prompts()
        # True while no image has been stored through set_img().
        self.available_to_set = True

    def _reset_prompts(self):
        """Replace every prompt list with a fresh empty list."""
        self.selected_points = []
        self.selected_points_labels = []
        self.selected_bboxes = []

    def set_img(self, img, img_feat):
        """Store ``img`` and ``img_feat`` and mark the state as occupied."""
        self.img, self.img_feat = img, img_feat
        self.available_to_set = False

    def clear(self):
        """Drop the image, its features and all prompts; mark the state free."""
        self.img = None
        self.img_feat = None
        self._reset_prompts()
        self.available_to_set = True

    def clean(self):
        """Forget the selected prompts but keep the stored image and features."""
        self._reset_prompts()

    def to_device(self, device=torch.device("cuda")):
        """Move tensor entries (and tuples of tensors) of ``img_feat`` to ``device``.

        Entries of any other type are left untouched; nothing happens when no
        features are stored.
        """
        feats = self.img_feat
        if feats is None:
            return
        for key in feats:
            value = feats[key]
            if isinstance(value, torch.Tensor):
                feats[key] = value.to(device)
            elif isinstance(value, tuple):
                feats[key] = tuple(item.to(device) for item in value)

    @property
    def available(self):
        """Whether the state is still free to receive an image."""
        return self.available_to_set
    
    
def _parse_class_names(class_names):
    """Split a comma-separated string into a list of clean class names.

    Surrounding whitespace is stripped from every entry and empty entries
    (produced by doubled or trailing commas) are dropped. ``None`` or an
    empty string yields an empty list.
    """
    if not class_names:
        return []
    stripped = (name.strip() for name in class_names.split(','))
    return [name for name in stripped if name]


@spaces.GPU
@torch.no_grad()
@torch.autocast(device_type="cuda", dtype=torch.float32)
def inference_automatic(input_img, class_names):
    """Segment an image into the user-supplied classes and render the result.

    Args:
        input_img: Path of the image to segment; it is read with detectron2's
            ``read_image`` in BGR format.
        class_names: Comma-separated class names typed by the user.

    Returns:
        The visualized segmentation as an RGB ``PIL.Image``.

    Raises:
        gr.Error: If no image or no usable class name was provided.
    """
    if input_img is None:
        raise gr.Error("Please provide an input image.")
    class_names = _parse_class_names(class_names)
    if not class_names:
        raise gr.Error("Please enter at least one class name.")

    mp.set_start_method("spawn", force=True)
    config_file = './configs/ground-truth-warmup/mask-adapter/mask_adapter_convnext_large_cocopan_eval_ade20k.yaml'
    cfg = setup_cfg(config_file)

    visualizer = SAMVisualizationDemo(cfg, 0.8, sam2_model, clip_model, mask_adapter)

    img = read_image(input_img, format="BGR")
    # A single class gets an extra 'others' entry so there are at least two
    # text prompts to classify against.
    if len(class_names) == 1:
        class_names.append('others')
    txts = [f'a photo of {cls_name}' for cls_name in class_names]
    text = open_clip.tokenize(txts)

    # Unit-normalize the CLIP text embeddings along the feature dimension.
    text_features = clip_model.encode_text(text.cuda())
    text_features /= text_features.norm(dim=-1, keepdim=True)

    _, visualized_output = visualizer.run_on_image(img, class_names, text_features)

    return Image.fromarray(np.uint8(visualized_output.get_image())).convert('RGB')

@spaces.GPU
@torch.no_grad()
@torch.autocast(device_type="cuda", dtype=torch.float32)
def inference_point(input_img, img_state):
    """Run point-prompted segmentation on the image held in ``img_state``.

    ``input_img`` is accepted to match the event wiring but is not read; the
    image and the clicked points both come from ``img_state``. Returns the
    second element produced by ``run_on_image_with_points``.
    """
    mp.set_start_method("spawn", force=True)

    clicked_points = img_state.selected_points
    print(f"Selected point: {clicked_points}")

    cfg = setup_cfg(
        './configs/ground-truth-warmup/mask-adapter/mask_adapter_convnext_large_cocopan_eval_ade20k.yaml'
    )
    point_demo = SAMPointVisualizationDemo(cfg, 0.8, sam2_model, clip_model, mask_adapter)

    # Text embeddings are loaded from disk on every call and moved to CUDA.
    embedding = np.load("./text_embedding/lvis_coco_text_embedding.npy")
    text_features = torch.from_numpy(embedding).cuda()

    _unused, rendered = point_demo.run_on_image_with_points(
        img_state.img, clicked_points, text_features
    )
    return rendered


# Model handles shared by the inference functions; each stays ``None`` until
# initialize_models() assigns the loaded model.
sam2_model = clip_model = mask_adapter = None

def get_points_with_draw(image, img_state, evt: gr.SelectData):
    """Record the clicked point in ``img_state`` and mark it on ``image``.

    The click position is read from ``evt.index``. The point and its label
    are appended to the state, the current image is stored in the state as a
    NumPy array, and a filled circle is drawn on ``image`` in place.

    Returns:
        ``(img_state, image)`` with the updated state and the annotated image.
    """
    # Only positive ("Add Mask") prompts are produced for now; the negative
    # colour and label are kept for the other case.
    label = 'Add Mask'
    is_positive = label == "Add Mask"

    click_x, click_y = evt.index[0], evt.index[1]
    radius = 10
    if is_positive:
        color = (97, 217, 54)
    else:
        color = (237, 34, 13)

    img_state.selected_points.append([click_x, click_y])
    img_state.selected_points_labels.append(1 if is_positive else 0)
    # NOTE(review): the image is re-stored on every click, before the new
    # marker is drawn; if `image` already carries markers from earlier
    # clicks they end up in the stored copy too - confirm this is intended.
    img_state.set_img(np.array(image), None)

    top_left = (click_x - radius, click_y - radius)
    bottom_right = (click_x + radius, click_y + radius)
    ImageDraw.Draw(image).ellipse([top_left, bottom_right], fill=color)
    return img_state, image
def initialize_models(sam_path, adapter_pth, model_cfg, cfg):
    """Load the SAM2, CLIP and Mask-Adapter models into the module globals.

    Each model is only built when its global is still ``None``, so calling
    this function again does not reload models that are already set.

    Args:
        sam_path: Path of the SAM2 checkpoint.
        adapter_pth: Path of the Mask-Adapter state-dict checkpoint.
        model_cfg: SAM2 model config passed to ``build_sam2``.
        cfg: Path of the config file handed to ``setup_cfg``.
    """
    global sam2_model, clip_model, mask_adapter
    adapter_cfg = setup_cfg(cfg)

    if sam2_model is None:
        # Build on CPU first, then move the finished model to the GPU.
        sam2_model = build_sam2(model_cfg, sam_path, device="cpu", apply_postprocessing=False)
        sam2_model = sam2_model.to("cuda")
        print("SAM2 model initialized.")

    if clip_model is None:
        clip_model, _, _ = open_clip.create_model_and_transforms("convnext_large_d_320", pretrained="laion2b_s29b_b131k_ft_soup")
        clip_model = clip_model.eval()
        clip_model = clip_model.to("cuda")
        print("CLIP model initialized.")

    if mask_adapter is None:
        mask_adapter = build_mask_adapter(adapter_cfg, "MASKAdapterHead").to("cuda")
        mask_adapter = mask_adapter.eval()
        # Deserialize onto the CPU so loading does not depend on the device
        # the checkpoint was saved from; load_state_dict copies the values
        # into the parameters that already live on the GPU.
        adapter_state_dict = torch.load(adapter_pth, map_location="cpu")
        mask_adapter.load_state_dict(adapter_state_dict)
        print("Mask Adapter model initialized.")

def clear_everything(img_state):
    """Reset ``img_state`` completely and blank both image components.

    Returns ``(img_state, None, None)``: the cleared state followed by
    ``None`` for each of the two remaining outputs.
    """
    img_state.clear()
    outputs = (img_state, None, None)
    return outputs


def clean_prompts(img_state):
    """Drop the selected prompts and restore the stored image.

    Returns:
        ``(img_state, input_image_value, None)``. The second item is the
        stored image as a ``PIL.Image``; when no image has been stored yet it
        is ``gr.update()`` so the input component is left as it is. The
        trailing ``None`` clears the segmentation output.
    """
    img_state.clean()
    if img_state.img is None:
        # Nothing has been stored yet (no click so far, or right after a
        # restart): Image.fromarray(None) would raise, so leave the input
        # component untouched instead.
        return img_state, gr.update(), None
    return img_state, Image.fromarray(img_state.img), None

# Initialize configuration and models.
# The paths below are relative, so they are resolved against the working
# directory the script is started from.
model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
sam_path = './sam2.1_hiera_large.pt'
adapter_pth = './model_0279999_with_sem_new.pth'
cfg = './configs/ground-truth-warmup/mask-adapter/mask_adapter_convnext_large_cocopan_eval_ade20k.yaml'

# Runs at import time: loads all three models into the module-level globals.
initialize_models(sam_path, adapter_pth, model_cfg, cfg)

# Examples for testing.
# Each Automatic Mode entry is [image path, comma-separated class names].
examples = [
    ['./demo/images/000000001025.jpg', 'dog, beach, trees, sea, sky, snow, person, rocks, buildings, birds, beach umbrella, beach chair'],
    # The doubled comma after 'island' used to produce an empty class name.
    ['./demo/images/ADE_val_00000979.jpg', 'sky,sea,mountain,pier,beach,island,landscape,horizon'],
    ['./demo/images/ADE_val_00001200.jpg', 'bridge, mountains, trees, water, sky, buildings, boats, animals, flowers, waterfalls, grasslands, rocks'],
]

# Each Point Mode entry is just [image path].
examples_point = [
    ['./demo/images/ADE_val_00000739.jpg'],
    ['./demo/images/000000290833.jpg'],
    ['./demo/images/2010_001315.jpg'],
    ['./demo/images/ADE_val_00000001.jpg'],
    ['./demo/images/000000000785.jpg'],
]

output_labels = ['segmentation map']

# HTML heading rendered at the top of the page through gr.Markdown.
title = '<center><h2>Mask-Adapter + Segment Anything-2</h2></center>'

# Introductory blurb with badge links, rendered below the title.
# NOTE(review): confirm that the arXiv link points to the Mask-Adapter paper.
description = """
<b>Mask-Adapter: The Devil is in the Masks for Open-Vocabulary Segmentation</b><br>
Mask-Adapter effectively extends to SAM or SAM-2 without additional training, achieving impressive results across multiple open-vocabulary segmentation benchmarks.<br>
<div style="display: flex; gap: 20px;">
    <a href="https://arxiv.org/abs/2406.20076">
        <img src="https://img.shields.io/badge/arXiv-Paper-red" alt="arXiv Paper">
    </a>
    <a href="https://github.com/hustvl/MaskAdapter">
        <img src="https://img.shields.io/badge/GitHub-Code-blue" alt="GitHub Code">
    </a>
</div>
"""

# Build the two-tab Gradio interface (Automatic Mode and Point Mode).
with gr.Blocks() as demo:
    gr.Markdown(title)  # Title
    gr.Markdown(description)  # Description

    with gr.Tabs():
        # Automatic Mode: image file + class names in, segmentation map out.
        with gr.TabItem("Automatic Mode"):
            with gr.Row():
                with gr.Column():
                    input_image = gr.Image(type='filepath', label="Input Image")
                    class_names = gr.Textbox(lines=1, placeholder=None, label='Class Names')
                with gr.Column():
                    output_image = gr.Image(type="pil", label='Segmentation Map')

                    # Buttons below segmentation map (now placed under segmentation map)
                    run_button = gr.Button("Run Automatic Segmentation")
                    run_button.click(inference_automatic, inputs=[input_image, class_names], outputs=output_image)
                    
                    # "Clear" only resets the segmentation map output.
                    clear_button = gr.Button("Clear")
                    clear_button.click(lambda: None, inputs=None, outputs=output_image)
            
            with gr.Row():
                gr.Examples(examples=examples, inputs=[input_image, class_names], outputs=output_image)

        # Point Mode: clicks on the image are used as point prompts.
        with gr.TabItem("Point Mode"):
            # Prompt/image state carried between the Point Mode event handlers.
            img_state_points = gr.State(value=IMGState())
            with gr.Row():  # horizontal layout
                with gr.Column(scale=1): 
                    # `input_image` is rebound here to the Point Mode image; the
                    # Automatic Mode handlers above were already wired to the
                    # earlier component.
                    input_image = gr.Image( label="Input Image", type="pil") 
                with gr.Column(scale=1):  # second column: segmentation map output
                    output_image_point = gr.Image(type="pil", label='Segmentation Map',interactive=False)  # output segmentation map

            # Each click first records and draws the point, then runs the
            # point-prompted inference on the updated state.
            input_image.select(
                get_points_with_draw,
                [input_image, img_state_points],
                outputs=[img_state_points, input_image]
            ).then(
                inference_point,
                inputs=[input_image, img_state_points],
                outputs=[output_image_point]
            )
            # "Clean Prompt" drops the selected prompts via clean_prompts.
            clear_prompt_button_point = gr.Button("Clean Prompt")
            clear_prompt_button_point.click(
                clean_prompts,
                inputs=[img_state_points],
                outputs=[img_state_points, input_image, output_image_point]
            )
            # "Restart" resets the whole state and both images via clear_everything.
            clear_button_point = gr.Button("Restart")
            clear_button_point.click(
                clear_everything,
                inputs=[img_state_points],
                outputs=[img_state_points, input_image, output_image_point]
            )

            with gr.Row():
                gr.Examples(examples=examples_point, inputs=input_image, outputs=output_image_point,examples_per_page=5)


            
    # Start the app; note that launch() is called while still inside the
    # `with gr.Blocks()` context.

    demo.launch()