Commit f1a2810 by ttengwang (1 parent: 89e01b9): "update"

Files changed:
- app.py: +27 -6
- app_wo_langchain.py: +0 -588
- caption_anything/captioner/base_captioner.py: +56 -63
- caption_anything/captioner/blip.py: +15 -16
- caption_anything/captioner/blip2.py: +3 -4
- caption_anything/captioner/git.py: +12 -9
- caption_anything/model.py: +6 -0
- caption_anything/segmenter/base_segmenter.py: +4 -16
- caption_anything/utils/chatbot.py: +10 -21
- caption_anything/utils/utils.py: +144 -98
app.py
CHANGED
@@ -7,6 +7,7 @@ from gradio import processing_utils
 
 from packaging import version
 from PIL import Image, ImageDraw
+import functools
 
 from caption_anything.model import CaptionAnything
 from caption_anything.utils.image_editing_utils import create_bubble_frame
@@ -22,7 +23,6 @@ from segment_anything import sam_model_registry
 args = parse_augment()
 args.segmenter = "huge"
 args.segmenter_checkpoint = "sam_vit_h_4b8939.pth"
-
 if args.segmenter_checkpoint is None:
     _, segmenter_checkpoint = prepare_segmenter(args.segmenter)
 else:
@@ -131,7 +131,7 @@ def chat_input_callback(*args):
     return state, state
 
 def upload_callback(image_input, state, visual_chatgpt=None):
-
+
     if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
         image_input, mask = image_input['image'], image_input['mask']
 
@@ -162,7 +162,8 @@ def upload_callback(image_input, state, visual_chatgpt=None):
     img_caption, _ = model.captioner.inference_seg(image_input)
     Human_prompt = f'\nHuman: provide a new figure with path {new_image_path}. The description is: {img_caption}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
     AI_prompt = "Received."
-    visual_chatgpt.
+    visual_chatgpt.global_prompt = Human_prompt + 'AI: ' + AI_prompt
+    visual_chatgpt.agent.memory.buffer = visual_chatgpt.agent.memory.buffer + visual_chatgpt.global_prompt
     state = [(None, 'Received new image, resize it to width {} and height {}: '.format(image_input.size[0], image_input.size[1]))]
 
     return state, state, image_input, click_state, image_input, image_input, image_embedding, \
@@ -309,12 +310,16 @@ def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuali
 
     yield state, state, refined_image_input, wiki
 
-def clear_chat_memory(visual_chatgpt):
+def clear_chat_memory(visual_chatgpt, keep_global=False):
     if visual_chatgpt is not None:
         visual_chatgpt.memory.clear()
-        visual_chatgpt.current_image = None
         visual_chatgpt.point_prompt = ""
-
+        if keep_global:
+            visual_chatgpt.agent.memory.buffer = visual_chatgpt.global_prompt
+        else:
+            visual_chatgpt.current_image = None
+            visual_chatgpt.global_prompt = ""
+
 def get_style():
     current_version = version.parse(gr.__version__)
     if current_version <= version.parse('3.24.1'):
@@ -465,6 +470,21 @@ def create_ui():
                                              modules_not_need_gpt,
                                              modules_not_need_gpt2, modules_not_need_gpt3, text_refiner, visual_chatgpt])
 
+        enable_chatGPT_button.click(
+            lambda: (None, [], [], [[], [], []], "", "", ""),
+            [],
+            [image_input, chatbot, state, click_state, wiki_output, origin_image],
+            queue=False,
+            show_progress=False
+        )
+        openai_api_key.submit(
+            lambda: (None, [], [], [[], [], []], "", "", ""),
+            [],
+            [image_input, chatbot, state, click_state, wiki_output, origin_image],
+            queue=False,
+            show_progress=False
+        )
+
         clear_button_click.click(
             lambda x: ([[], [], []], x, ""),
             [origin_image],
@@ -472,6 +492,7 @@ def create_ui():
             queue=False,
             show_progress=False
         )
+        clear_button_click.click(functools.partial(clear_chat_memory, keep_global=True), inputs=[visual_chatgpt])
         clear_button_image.click(
             lambda: (None, [], [], [[], [], []], "", "", ""),
             [],
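Note on the clear_chat_memory hunk above: the new keep_global flag lets "Clear Clicks" wipe per-click prompts while keeping the image-level prompt, whereas a full reset drops both. Below is a minimal, self-contained sketch of that behaviour; FakeBot is a stand-in for the real ConversationBot and omits the LangChain memory objects, so it is an illustration of the control flow only, not the project's implementation.

import functools

class FakeBot:
    # Stand-in for ConversationBot: only the attributes touched by clear_chat_memory.
    def __init__(self):
        self.global_prompt = "Human: provide a new figure ... AI: Received."
        self.point_prompt = "click at (120, 80)"

def clear_chat_memory(bot, keep_global=False):
    # Same branching as the new app.py version, minus memory.clear() and the agent buffer.
    if bot is not None:
        bot.point_prompt = ""
        if not keep_global:
            bot.global_prompt = ""

bot = FakeBot()
functools.partial(clear_chat_memory, keep_global=True)(bot)  # how clear_button_click wires it
assert bot.global_prompt != ""   # image context survives "Clear Clicks"
clear_chat_memory(bot)           # a full reset also drops the image context
assert bot.global_prompt == ""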
app_wo_langchain.py
DELETED
@@ -1,588 +0,0 @@
import os
import json
from typing import List

import PIL
import gradio as gr
import numpy as np
from gradio import processing_utils

from packaging import version
from PIL import Image, ImageDraw

from caption_anything.model import CaptionAnything
from caption_anything.utils.image_editing_utils import create_bubble_frame
from caption_anything.utils.utils import mask_painter, seg_model_map, prepare_segmenter
from caption_anything.utils.parser import parse_augment
from caption_anything.captioner import build_captioner
from caption_anything.text_refiner import build_text_refiner
from caption_anything.segmenter import build_segmenter
from caption_anything.utils.chatbot import ConversationBot, build_chatbot_tools, get_new_image_name
from segment_anything import sam_model_registry


args = parse_augment()

args = parse_augment()
if args.segmenter_checkpoint is None:
    _, segmenter_checkpoint = prepare_segmenter(args.segmenter)
else:
    segmenter_checkpoint = args.segmenter_checkpoint

shared_captioner = build_captioner(args.captioner, args.device, args)
shared_sam_model = sam_model_registry[seg_model_map[args.segmenter]](checkpoint=segmenter_checkpoint).to(args.device)


class ImageSketcher(gr.Image):
    """
    Fix the bug of gradio.Image that cannot upload with tool == 'sketch'.
    """

    is_template = True  # Magic to make this work with gradio.Block, don't remove unless you know what you're doing.

    def __init__(self, **kwargs):
        super().__init__(tool="sketch", **kwargs)

    def preprocess(self, x):
        if self.tool == 'sketch' and self.source in ["upload", "webcam"]:
            assert isinstance(x, dict)
            if x['mask'] is None:
                decode_image = processing_utils.decode_base64_to_image(x['image'])
                width, height = decode_image.size
                mask = np.zeros((height, width, 4), dtype=np.uint8)
                mask[..., -1] = 255
                mask = self.postprocess(mask)

                x['mask'] = mask

        return super().preprocess(x)


def build_caption_anything_with_models(args, api_key="", captioner=None, sam_model=None, text_refiner=None,
                                       session_id=None):
    segmenter = build_segmenter(args.segmenter, args.device, args, model=sam_model)
    captioner = captioner
    if session_id is not None:
        print('Init caption anything for session {}'.format(session_id))
    return CaptionAnything(args, api_key, captioner=captioner, segmenter=segmenter, text_refiner=text_refiner)


def init_openai_api_key(api_key=""):
    text_refiner = None
    if api_key and len(api_key) > 30:
        try:
            text_refiner = build_text_refiner(args.text_refiner, args.device, args, api_key)
            text_refiner.llm('hi')  # test
        except:
            text_refiner = None
    openai_available = text_refiner is not None
    return gr.update(visible=openai_available), gr.update(visible=openai_available), gr.update(
        visible=openai_available), gr.update(visible=True), gr.update(visible=True), gr.update(
        visible=True), text_refiner


def get_click_prompt(chat_input, click_state, click_mode):
    inputs = json.loads(chat_input)
    if click_mode == 'Continuous':
        points = click_state[0]
        labels = click_state[1]
        for input in inputs:
            points.append(input[:2])
            labels.append(input[2])
    elif click_mode == 'Single':
        points = []
        labels = []
        for input in inputs:
            points.append(input[:2])
            labels.append(input[2])
        click_state[0] = points
        click_state[1] = labels
    else:
        raise NotImplementedError

    prompt = {
        "prompt_type": ["click"],
        "input_point": click_state[0],
        "input_label": click_state[1],
        "multimask_output": "True",
    }
    return prompt


def update_click_state(click_state, caption, click_mode):
    if click_mode == 'Continuous':
        click_state[2].append(caption)
    elif click_mode == 'Single':
        click_state[2] = [caption]
    else:
        raise NotImplementedError


def chat_with_points(chat_input, click_state, chat_state, state, text_refiner, img_caption):
    if text_refiner is None:
        response = "Text refiner is not initilzed, please input openai api key."
        state = state + [(chat_input, response)]
        return state, state, chat_state

    points, labels, captions = click_state
    # point_chat_prompt = "I want you act as a chat bot in terms of image. I will give you some points (w, h) in the image and tell you what happed on the point in natural language. Note that (0, 0) refers to the top-left corner of the image, w refers to the width and h refers the height. You should chat with me based on the fact in the image instead of imagination. Now I tell you the points with their visual description:\n{points_with_caps}\nNow begin chatting!"
    suffix = '\nHuman: {chat_input}\nAI: '
    qa_template = '\nHuman: {q}\nAI: {a}'
    # # "The image is of width {width} and height {height}."
    point_chat_prompt = "I am an AI trained to chat with you about an image. I am greate at what is going on in any image based on the image information your provide. The overall image description is \"{img_caption}\". You will also provide me objects in the image in details, i.e., their location and visual descriptions. Here are the locations and descriptions of events that happen in the image: {points_with_caps} \nYou are required to use language instead of number to describe these positions. Now, let's chat!"
    prev_visual_context = ""
    pos_points = []
    pos_captions = []

    for i in range(len(points)):
        if labels[i] == 1:
            pos_points.append(f"(X:{points[i][0]}, Y:{points[i][1]})")
            pos_captions.append(captions[i])
            prev_visual_context = prev_visual_context + '\n' + 'There is an event described as \"{}\" locating at {}'.format(
                pos_captions[-1], ', '.join(pos_points))

    context_length_thres = 500
    prev_history = ""
    for i in range(len(chat_state)):
        q, a = chat_state[i]
        if len(prev_history) < context_length_thres:
            prev_history = prev_history + qa_template.format(**{"q": q, "a": a})
        else:
            break
    chat_prompt = point_chat_prompt.format(
        **{"img_caption": img_caption, "points_with_caps": prev_visual_context}) + prev_history + suffix.format(
        **{"chat_input": chat_input})
    print('\nchat_prompt: ', chat_prompt)
    response = text_refiner.llm(chat_prompt)
    state = state + [(chat_input, response)]
    chat_state = chat_state + [(chat_input, response)]
    return state, state, chat_state


def upload_callback(image_input, state):
    if isinstance(image_input, dict):  # if upload from sketcher_input, input contains image and mask
        image_input, mask = image_input['image'], image_input['mask']

    chat_state = []
    click_state = [[], [], []]
    res = 1024
    width, height = image_input.size
    ratio = min(1.0 * res / max(width, height), 1.0)
    if ratio < 1.0:
        image_input = image_input.resize((int(width * ratio), int(height * ratio)))
        print('Scaling input image to {}'.format(image_input.size))
    state = [] + [(None, 'Image size: ' + str(image_input.size))]
    model = build_caption_anything_with_models(
        args,
        api_key="",
        captioner=shared_captioner,
        sam_model=shared_sam_model,
        session_id=iface.app_id
    )
    model.segmenter.set_image(image_input)
    image_embedding = model.image_embedding
    original_size = model.original_size
    input_size = model.input_size
    img_caption, _ = model.captioner.inference_seg(image_input)

    return state, state, chat_state, image_input, click_state, image_input, image_input, image_embedding, \
        original_size, input_size, img_caption


def inference_click(image_input, point_prompt, click_mode, enable_wiki, language, sentiment, factuality,
                    length, image_embedding, state, click_state, original_size, input_size, text_refiner,
                    evt: gr.SelectData):
    click_index = evt.index

    if point_prompt == 'Positive':
        coordinate = "[[{}, {}, 1]]".format(str(click_index[0]), str(click_index[1]))
    else:
        coordinate = "[[{}, {}, 0]]".format(str(click_index[0]), str(click_index[1]))

    prompt = get_click_prompt(coordinate, click_state, click_mode)
    input_points = prompt['input_point']
    input_labels = prompt['input_label']

    controls = {'length': length,
                'sentiment': sentiment,
                'factuality': factuality,
                'language': language}

    model = build_caption_anything_with_models(
        args,
        api_key="",
        captioner=shared_captioner,
        sam_model=shared_sam_model,
        text_refiner=text_refiner,
        session_id=iface.app_id
    )

    model.setup(image_embedding, original_size, input_size, is_image_set=True)

    enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
    out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)

    state = state + [("Image point: {}, Input label: {}".format(prompt["input_point"], prompt["input_label"]), None)]
    state = state + [(None, "raw_caption: {}".format(out['generated_captions']['raw_caption']))]
    wiki = out['generated_captions'].get('wiki', "")
    update_click_state(click_state, out['generated_captions']['raw_caption'], click_mode)
    text = out['generated_captions']['raw_caption']
    input_mask = np.array(out['mask'].convert('P'))
    image_input = mask_painter(np.array(image_input), input_mask)
    origin_image_input = image_input
    image_input = create_bubble_frame(image_input, text, (click_index[0], click_index[1]), input_mask,
                                      input_points=input_points, input_labels=input_labels)
    yield state, state, click_state, image_input, wiki
    if not args.disable_gpt and model.text_refiner:
        refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'],
                                                       enable_wiki=enable_wiki)
        # new_cap = 'Original: ' + text + '. Refined: ' + refined_caption['caption']
        new_cap = refined_caption['caption']
        wiki = refined_caption['wiki']
        state = state + [(None, f"caption: {new_cap}")]
        refined_image_input = create_bubble_frame(origin_image_input, new_cap, (click_index[0], click_index[1]),
                                                  input_mask,
                                                  input_points=input_points, input_labels=input_labels)
        yield state, state, click_state, refined_image_input, wiki


def get_sketch_prompt(mask: PIL.Image.Image, multi_mask=True):
    """
    Get the prompt for the sketcher.
    TODO: This is a temporary solution. We should cluster the sketch and get the bounding box of each cluster.
    """

    mask = np.array(np.asarray(mask)[..., 0])
    mask[mask > 0] = 1  # Refine the mask, let all nonzero values be 1

    if not multi_mask:
        y, x = np.where(mask == 1)
        x1, y1 = np.min(x), np.min(y)
        x2, y2 = np.max(x), np.max(y)

        prompt = {
            'prompt_type': ['box'],
            'input_boxes': [
                [x1, y1, x2, y2]
            ]
        }

        return prompt

    traversed = np.zeros_like(mask)
    groups = np.zeros_like(mask)
    max_group_id = 1

    # Iterate over all pixels
    for x in range(mask.shape[0]):
        for y in range(mask.shape[1]):
            if traversed[x, y] == 1:
                continue

            if mask[x, y] == 0:
                traversed[x, y] = 1
            else:
                # If pixel is part of mask
                groups[x, y] = max_group_id
                stack = [(x, y)]
                while stack:
                    i, j = stack.pop()
                    if traversed[i, j] == 1:
                        continue
                    traversed[i, j] = 1
                    if mask[i, j] == 1:
                        groups[i, j] = max_group_id
                        for di, dj in [(1, 0), (-1, 0), (0, 1), (0, -1), (1, 1), (1, -1), (-1, 1), (-1, -1)]:
                            ni, nj = i + di, j + dj
                            traversed[i, j] = 1
                            if 0 <= nj < mask.shape[1] and mask.shape[0] > ni >= 0 == traversed[ni, nj]:
                                stack.append((i + di, j + dj))
                max_group_id += 1

    # get the bounding box of each group
    boxes = []
    for group in range(1, max_group_id):
        y, x = np.where(groups == group)
        x1, y1 = np.min(x), np.min(y)
        x2, y2 = np.max(x), np.max(y)
        boxes.append([x1, y1, x2, y2])

    prompt = {
        'prompt_type': ['box'],
        'input_boxes': boxes
    }

    return prompt


def inference_traject(sketcher_image, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
                      original_size, input_size, text_refiner):
    image_input, mask = sketcher_image['image'], sketcher_image['mask']

    prompt = get_sketch_prompt(mask, multi_mask=False)
    boxes = prompt['input_boxes']

    controls = {'length': length,
                'sentiment': sentiment,
                'factuality': factuality,
                'language': language}

    model = build_caption_anything_with_models(
        args,
        api_key="",
        captioner=shared_captioner,
        sam_model=shared_sam_model,
        text_refiner=text_refiner,
        session_id=iface.app_id
    )

    model.setup(image_embedding, original_size, input_size, is_image_set=True)

    enable_wiki = True if enable_wiki in ['True', 'TRUE', 'true', True, 'Yes', 'YES', 'yes'] else False
    out = model.inference(image_input, prompt, controls, disable_gpt=True, enable_wiki=enable_wiki)

    # Update components and states
    state.append((f'Box: {boxes}', None))
    state.append((None, f'raw_caption: {out["generated_captions"]["raw_caption"]}'))
    wiki = out['generated_captions'].get('wiki', "")
    text = out['generated_captions']['raw_caption']
    input_mask = np.array(out['mask'].convert('P'))
    image_input = mask_painter(np.array(image_input), input_mask)

    origin_image_input = image_input

    fake_click_index = (int((boxes[0][0] + boxes[0][2]) / 2), int((boxes[0][1] + boxes[0][3]) / 2))
    image_input = create_bubble_frame(image_input, text, fake_click_index, input_mask)

    yield state, state, image_input, wiki

    if not args.disable_gpt and model.text_refiner:
        refined_caption = model.text_refiner.inference(query=text, controls=controls, context=out['context_captions'],
                                                       enable_wiki=enable_wiki)

        new_cap = refined_caption['caption']
        wiki = refined_caption['wiki']
        state = state + [(None, f"caption: {new_cap}")]
        refined_image_input = create_bubble_frame(origin_image_input, new_cap, fake_click_index, input_mask)

        yield state, state, refined_image_input, wiki


def get_style():
    current_version = version.parse(gr.__version__)
    if current_version <= version.parse('3.24.1'):
        style = '''
        #image_sketcher{min-height:500px}
        #image_sketcher [data-testid="image"], #image_sketcher [data-testid="image"] > div{min-height: 500px}
        #image_upload{min-height:500px}
        #image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 500px}
        '''
    elif current_version <= version.parse('3.27'):
        style = '''
        #image_sketcher{min-height:500px}
        #image_upload{min-height:500px}
        '''
    else:
        style = None

    return style


def create_ui():
    title = """<p><h1 align="center">Caption-Anything</h1></p>
    """
    description = """<p>Gradio demo for Caption Anything, image to dense captioning generation with various language styles. To use it, simply upload your image, or click one of the examples to load them. Code: <a href="https://github.com/ttengwang/Caption-Anything">https://github.com/ttengwang/Caption-Anything</a> <a href="https://huggingface.co/spaces/TencentARC/Caption-Anything?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>"""

    examples = [
        ["test_images/img35.webp"],
        ["test_images/img2.jpg"],
        ["test_images/img5.jpg"],
        ["test_images/img12.jpg"],
        ["test_images/img14.jpg"],
        ["test_images/qingming3.jpeg"],
        ["test_images/img1.jpg"],
    ]

    with gr.Blocks(
            css=get_style()
    ) as iface:
        state = gr.State([])
        click_state = gr.State([[], [], []])
        chat_state = gr.State([])
        origin_image = gr.State(None)
        image_embedding = gr.State(None)
        text_refiner = gr.State(None)
        original_size = gr.State(None)
        input_size = gr.State(None)
        img_caption = gr.State(None)

        gr.Markdown(title)
        gr.Markdown(description)

        with gr.Row():
            with gr.Column(scale=1.0):
                with gr.Column(visible=False) as modules_not_need_gpt:
                    with gr.Tab("Click"):
                        image_input = gr.Image(type="pil", interactive=True, elem_id="image_upload")
                        example_image = gr.Image(type="pil", interactive=False, visible=False)
                        with gr.Row(scale=1.0):
                            with gr.Row(scale=0.4):
                                point_prompt = gr.Radio(
                                    choices=["Positive", "Negative"],
                                    value="Positive",
                                    label="Point Prompt",
                                    interactive=True)
                                click_mode = gr.Radio(
                                    choices=["Continuous", "Single"],
                                    value="Continuous",
                                    label="Clicking Mode",
                                    interactive=True)
                            with gr.Row(scale=0.4):
                                clear_button_click = gr.Button(value="Clear Clicks", interactive=True)
                                clear_button_image = gr.Button(value="Clear Image", interactive=True)
                    with gr.Tab("Trajectory (Beta)"):
                        sketcher_input = ImageSketcher(type="pil", interactive=True, brush_radius=20,
                                                       elem_id="image_sketcher")
                        with gr.Row():
                            submit_button_sketcher = gr.Button(value="Submit", interactive=True)

                with gr.Column(visible=False) as modules_need_gpt:
                    with gr.Row(scale=1.0):
                        language = gr.Dropdown(
                            ['English', 'Chinese', 'French', "Spanish", "Arabic", "Portuguese", "Cantonese"],
                            value="English", label="Language", interactive=True)
                        sentiment = gr.Radio(
                            choices=["Positive", "Natural", "Negative"],
                            value="Natural",
                            label="Sentiment",
                            interactive=True,
                        )
                    with gr.Row(scale=1.0):
                        factuality = gr.Radio(
                            choices=["Factual", "Imagination"],
                            value="Factual",
                            label="Factuality",
                            interactive=True,
                        )
                        length = gr.Slider(
                            minimum=10,
                            maximum=80,
                            value=10,
                            step=1,
                            interactive=True,
                            label="Generated Caption Length",
                        )
                        enable_wiki = gr.Radio(
                            choices=["Yes", "No"],
                            value="No",
                            label="Enable Wiki",
                            interactive=True)
                with gr.Column(visible=True) as modules_not_need_gpt3:
                    gr.Examples(
                        examples=examples,
                        inputs=[example_image],
                    )
            with gr.Column(scale=0.5):
                openai_api_key = gr.Textbox(
                    placeholder="Input openAI API key",
                    show_label=False,
                    label="OpenAI API Key",
                    lines=1,
                    type="password")
                with gr.Row(scale=0.5):
                    enable_chatGPT_button = gr.Button(value="Run with ChatGPT", interactive=True, variant='primary')
                    disable_chatGPT_button = gr.Button(value="Run without ChatGPT (Faster)", interactive=True,
                                                       variant='primary')
                with gr.Column(visible=False) as modules_need_gpt2:
                    wiki_output = gr.Textbox(lines=5, label="Wiki", max_lines=5)
                with gr.Column(visible=False) as modules_not_need_gpt2:
                    chatbot = gr.Chatbot(label="Chat about Selected Object", ).style(height=550, scale=0.5)
                with gr.Column(visible=False) as modules_need_gpt3:
                    chat_input = gr.Textbox(show_label=False, placeholder="Enter text and press Enter").style(
                        container=False)
                    with gr.Row():
                        clear_button_text = gr.Button(value="Clear Text", interactive=True)
                        submit_button_text = gr.Button(value="Submit", interactive=True, variant="primary")

        openai_api_key.submit(init_openai_api_key, inputs=[openai_api_key],
                              outputs=[modules_need_gpt, modules_need_gpt2, modules_need_gpt3, modules_not_need_gpt,
                                       modules_not_need_gpt2, modules_not_need_gpt3, text_refiner])
        enable_chatGPT_button.click(init_openai_api_key, inputs=[openai_api_key],
                                    outputs=[modules_need_gpt, modules_need_gpt2, modules_need_gpt3,
                                             modules_not_need_gpt,
                                             modules_not_need_gpt2, modules_not_need_gpt3, text_refiner])
        disable_chatGPT_button.click(init_openai_api_key,
                                     outputs=[modules_need_gpt, modules_need_gpt2, modules_need_gpt3,
                                              modules_not_need_gpt,
                                              modules_not_need_gpt2, modules_not_need_gpt3, text_refiner])

        clear_button_click.click(
            lambda x: ([[], [], []], x, ""),
            [origin_image],
            [click_state, image_input, wiki_output],
            queue=False,
            show_progress=False
        )
        clear_button_image.click(
            lambda: (None, [], [], [], [[], [], []], "", "", ""),
            [],
            [image_input, chatbot, state, chat_state, click_state, wiki_output, origin_image, img_caption],
            queue=False,
            show_progress=False
        )
        clear_button_text.click(
            lambda: ([], [], [[], [], [], []], []),
            [],
            [chatbot, state, click_state, chat_state],
            queue=False,
            show_progress=False
        )
        image_input.clear(
            lambda: (None, [], [], [], [[], [], []], "", "", ""),
            [],
            [image_input, chatbot, state, chat_state, click_state, wiki_output, origin_image, img_caption],
            queue=False,
            show_progress=False
        )

        image_input.upload(upload_callback, [image_input, state],
                           [chatbot, state, chat_state, origin_image, click_state, image_input, sketcher_input,
                            image_embedding, original_size, input_size, img_caption])
        sketcher_input.upload(upload_callback, [sketcher_input, state],
                              [chatbot, state, chat_state, origin_image, click_state, image_input, sketcher_input,
                               image_embedding, original_size, input_size, img_caption])
        chat_input.submit(chat_with_points, [chat_input, click_state, chat_state, state, text_refiner, img_caption],
                          [chatbot, state, chat_state])
        chat_input.submit(lambda: "", None, chat_input)
        example_image.change(upload_callback, [example_image, state],
                             [chatbot, state, chat_state, origin_image, click_state, image_input, sketcher_input,
                              image_embedding, original_size, input_size, img_caption])

        # select coordinate
        image_input.select(
            inference_click,
            inputs=[
                origin_image, point_prompt, click_mode, enable_wiki, language, sentiment, factuality, length,
                image_embedding, state, click_state, original_size, input_size, text_refiner
            ],
            outputs=[chatbot, state, click_state, image_input, wiki_output],
            show_progress=False, queue=True
        )

        submit_button_sketcher.click(
            inference_traject,
            inputs=[
                sketcher_input, enable_wiki, language, sentiment, factuality, length, image_embedding, state,
                original_size, input_size, text_refiner
            ],
            outputs=[chatbot, state, sketcher_input, wiki_output],
            show_progress=False, queue=True
        )

        return iface


if __name__ == '__main__':
    iface = create_ui()
    iface.queue(concurrency_count=5, api_open=False, max_size=10)
    iface.launch(server_name="0.0.0.0", enable_queue=True, server_port=args.port, share=args.gradio_share)
caption_anything/captioner/base_captioner.py
CHANGED
@@ -9,8 +9,10 @@ from typing import Union
 import time
 import clip
 
+from caption_anything.utils.utils import load_image
+
+
 def boundary(inputs):
-
     col = inputs.shape[1]
     inputs = inputs.reshape(-1)
     lens = len(inputs)
@@ -20,11 +22,11 @@ def boundary(inputs):
 
     top = start // col
     bottom = end // col
-
+
     return top, bottom
 
+
 def new_seg_to_box(seg_mask: Union[np.ndarray, Image.Image, str]):
-
     if type(seg_mask) == str:
         seg_mask = Image.open(seg_mask)
     elif type(seg_mask) == np.ndarray:
@@ -35,12 +37,13 @@ def new_seg_to_box(seg_mask: Union[np.ndarray, Image.Image, str]):
     left, right = boundary(seg_mask.T)
     return [left / size, top / size, right / size, bottom / size]
 
+
 def seg_to_box(seg_mask: Union[np.ndarray, Image.Image, str]):
     if type(seg_mask) == str:
         seg_mask = cv2.imread(seg_mask, cv2.IMREAD_GRAYSCALE)
         _, seg_mask = cv2.threshold(seg_mask, 127, 255, 0)
     elif type(seg_mask) == np.ndarray:
-        assert seg_mask.ndim == 2
+        assert seg_mask.ndim == 2  # only support single-channel segmentation mask
     seg_mask = seg_mask.astype('uint8')
     if seg_mask.dtype == 'bool':
         seg_mask = seg_mask * 255
@@ -49,25 +52,28 @@ def seg_to_box(seg_mask: Union[np.ndarray, Image.Image, str]):
     rect = cv2.minAreaRect(contours)
     box = cv2.boxPoints(rect)
     if rect[-1] >= 45:
-        newstart = box.argmin(axis=0)[1]
+        newstart = box.argmin(axis=0)[1]  # leftmost
     else:
-        newstart = box.argmax(axis=0)[0]
+        newstart = box.argmax(axis=0)[0]  # topmost
     box = np.concatenate([box[newstart:], box[:newstart]], axis=0)
     box = np.int0(box)
     return box
 
+
 def get_w_h(rect_points):
     w = np.linalg.norm(rect_points[0] - rect_points[1], ord=2).astype('int')
     h = np.linalg.norm(rect_points[0] - rect_points[3], ord=2).astype('int')
     return w, h
-
+
+
 def cut_box(img, rect_points):
     w, h = get_w_h(rect_points)
-    dst_pts = np.array([[h, 0], [h, w], [0, w], [0, 0],], dtype="float32")
+    dst_pts = np.array([[h, 0], [h, w], [0, w], [0, 0], ], dtype="float32")
     transform = cv2.getPerspectiveTransform(rect_points.astype("float32"), dst_pts)
     cropped_img = cv2.warpPerspective(img, transform, (h, w))
     return cropped_img
-
+
+
 class BaseCaptioner:
     def __init__(self, device, enable_filter=False):
         print(f"Initializing ImageCaptioning to {device}")
@@ -82,18 +88,15 @@ class BaseCaptioner:
 
     @torch.no_grad()
     def filter_caption(self, image: Union[np.ndarray, Image.Image, str], caption: str):
-        ...
-        text_features = self.filter.encode_text(text)  # (1, 512)
-        image_features /= image_features.norm(dim = -1, keepdim = True)
-        text_features /= text_features.norm(dim = -1, keepdim = True)
+
+        image = load_image(image, return_type='pil')
+
+        image = self.preprocess(image).unsqueeze(0).to(self.device)  # (1, 3, 224, 224)
+        text = clip.tokenize(caption).to(self.device)  # (1, 77)
+        image_features = self.filter.encode_image(image)  # (1, 512)
+        text_features = self.filter.encode_text(text)  # (1, 512)
+        image_features /= image_features.norm(dim=-1, keepdim=True)
+        text_features /= text_features.norm(dim=-1, keepdim=True)
         similarity = torch.matmul(image_features, text_features.transpose(1, 0)).item()
         if similarity < self.threshold:
             print('There seems to be nothing where you clicked.')
@@ -103,24 +106,21 @@ class BaseCaptioner:
         print(f'Clip score of the caption is {similarity}')
         return out
 
-
-    def inference(self, image: Union[np.ndarray, Image.Image, str], filter: bool=False):
+    def inference(self, image: Union[np.ndarray, Image.Image, str], filter: bool = False):
         raise NotImplementedError()
-
-    def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, filter: bool=False):
+
+    def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, filter: bool = False):
         raise NotImplementedError()
-
+
     def inference_box(self, image: Union[np.ndarray, Image.Image, str], box: Union[list, np.ndarray], filter=False):
-        ...
-            image = Image.open(image)
-        elif type(image) == np.ndarray:
-            image = Image.fromarray(image)
+        image = load_image(image, return_type="pil")
 
-        if np.array(box).size == 4:
+        if np.array(box).size == 4:
+            # [x0, y0, x1, y1], where (x0, y0), (x1, y1) represent top-left and bottom-right corners
             size = max(image.width, image.height)
             x1, y1, x2, y2 = box
-            image_crop = np.array(image.crop((x1 * size, y1 * size, x2 * size, y2 * size)))
-        elif np.array(box).size == 8:
+            image_crop = np.array(image.crop((x1 * size, y1 * size, x2 * size, y2 * size)))
+        elif np.array(box).size == 8:  # four corners of an irregular rectangle
             image_crop = cut_box(np.array(image), box)
 
         crop_save_path = f'result/crop_{time.time()}.png'
@@ -128,24 +128,20 @@ class BaseCaptioner:
         print(f'croped image saved in {crop_save_path}')
         caption = self.inference(image_crop, filter)
         return caption, crop_save_path
-
 
-    def inference_seg(self, image: Union[np.ndarray, str], seg_mask: Union[np.ndarray, Image.Image, str]
+    def inference_seg(self, image: Union[np.ndarray, str], seg_mask: Union[np.ndarray, Image.Image, str] = None,
+                      crop_mode="w_bg", filter=False, disable_regular_box=False):
         if seg_mask is None:
             seg_mask = np.ones(image.size).astype(bool)
-        ...
-        if type(seg_mask) == str:
-            seg_mask = Image.open(seg_mask)
-        elif type(seg_mask) == np.ndarray:
-            seg_mask = Image.fromarray(seg_mask)
+
+        image = load_image(image, return_type="pil")
+        seg_mask = load_image(seg_mask, return_type="pil")
 
         seg_mask = seg_mask.resize(image.size)
         seg_mask = np.array(seg_mask) > 0
-
-        if crop_mode=="wo_bg":
-            image = np.array(image) * seg_mask[
+
+        if crop_mode == "wo_bg":
+            image = np.array(image) * seg_mask[:, :, np.newaxis] + (1 - seg_mask[:, :, np.newaxis]) * 255
             image = np.uint8(image)
         else:
             image = np.array(image)
@@ -155,20 +151,17 @@ class BaseCaptioner:
         else:
             min_area_box = new_seg_to_box(seg_mask)
         return self.inference_box(image, min_area_box, filter)
-
-        ...
-            seg_mask = Image.open(seg_mask)
-        elif type(seg_mask) == np.ndarray:
-            seg_mask = Image.fromarray(seg_mask)
+
+    def generate_seg_cropped_image(self, image: Union[np.ndarray, str], seg_mask: Union[np.ndarray, Image.Image, str],
+                                   crop_mode="w_bg", disable_regular_box=False):
+        image = load_image(image, return_type="pil")
+        seg_mask = load_image(seg_mask, return_type="pil")
+
         seg_mask = seg_mask.resize(image.size)
         seg_mask = np.array(seg_mask) > 0
 
-        if crop_mode=="wo_bg":
-            image = np.array(image) * seg_mask[
+        if crop_mode == "wo_bg":
+            image = np.array(image) * seg_mask[:, :, np.newaxis] + (1 - seg_mask[:, :, np.newaxis]) * 255
         else:
             image = np.array(image)
 
@@ -176,24 +169,24 @@ class BaseCaptioner:
             box = seg_to_box(seg_mask)
         else:
             box = new_seg_to_box(seg_mask)
-
-        if np.array(box).size == 4:
+
+        if np.array(box).size == 4:
+            # [x0, y0, x1, y1], where (x0, y0), (x1, y1) represent top-left and bottom-right corners
             size = max(image.shape[0], image.shape[1])
             x1, y1, x2, y2 = box
-            image_crop = np.array(image.crop((x1 * size, y1 * size, x2 * size, y2 * size)))
-        elif np.array(box).size == 8:
+            image_crop = np.array(image.crop((x1 * size, y1 * size, x2 * size, y2 * size)))
+        elif np.array(box).size == 8:  # four corners of an irregular rectangle
            image_crop = cut_box(np.array(image), box)
         crop_save_path = f'result/crop_{time.time()}.png'
         Image.fromarray(image_crop).save(crop_save_path)
         print(f'croped image saved in {crop_save_path}')
         return crop_save_path
 
-
+
 if __name__ == '__main__':
     model = BaseCaptioner(device='cuda:0')
     image_path = 'test_images/img2.jpg'
-    seg_mask = np.zeros((15,15))
+    seg_mask = np.zeros((15, 15))
     seg_mask[5:10, 5:10] = 1
     seg_mask = 'image/SAM/img10.jpg.raw_mask.png'
     print(model.inference_seg(image_path, seg_mask))
-
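The captioners above now delegate input normalization to load_image from caption_anything/utils/utils.py (also changed in this commit, but its body is not shown in this view). The following is a hypothetical sketch of what such a helper could look like, assuming it only needs to accept a path, a numpy array, or a PIL image; the project's actual implementation may differ.

from typing import Union

import numpy as np
from PIL import Image


def load_image(image: Union[np.ndarray, Image.Image, str], return_type: str = "pil"):
    # Illustrative re-implementation only; not the code from utils.py.
    if isinstance(image, str):            # treat strings as file paths
        image = Image.open(image)
    elif isinstance(image, np.ndarray):   # wrap raw arrays
        image = Image.fromarray(image)
    if return_type == "numpy":
        return np.array(image)
    return image                          # PIL.Image by default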
caption_anything/captioner/blip.py
CHANGED
@@ -1,14 +1,13 @@
 import torch
-from PIL import Image
+from PIL import Image
 from transformers import BlipProcessor
+
+from caption_anything.utils.utils import load_image
 from .modeling_blip import BlipForConditionalGeneration
-import json
-import pdb
-import cv2
 import numpy as np
 from typing import Union
 from .base_captioner import BaseCaptioner
-import torchvision.transforms.functional as F
+import torchvision.transforms.functional as F
 
 
 class BLIPCaptioner(BaseCaptioner):
@@ -17,12 +16,12 @@ class BLIPCaptioner(BaseCaptioner):
         self.device = device
         self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
         self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
-        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large",
-                                                                  ...
+        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large",
+                                                                  torch_dtype=self.torch_dtype).to(self.device)
+
     @torch.no_grad()
     def inference(self, image: Union[np.ndarray, Image.Image, str], filter=False):
-        ...
-        image = Image.open(image)
+        image = load_image(image, return_type="pil")
         inputs = self.processor(image, return_tensors="pt").to(self.device, self.torch_dtype)
         out = self.model.generate(**inputs, max_new_tokens=50)
         captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
@@ -30,12 +29,13 @@ class BLIPCaptioner(BaseCaptioner):
             captions = self.filter_caption(image, captions)
         print(f"\nProcessed ImageCaptioning by BLIPCaptioner, Output Text: {captions}")
         return captions
-
+
     @torch.no_grad()
-    def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, crop_mode="w_bg",
-        ...
+    def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, crop_mode="w_bg",
+                                      filter=False, disable_regular_box=False):
+        crop_save_path = self.generate_seg_cropped_image(image=image, seg_mask=seg_mask, crop_mode=crop_mode,
+                                                         disable_regular_box=disable_regular_box)
+        image = load_image(image, return_type="pil")
         inputs = self.processor(image, return_tensors="pt")
         pixel_values = inputs.pixel_values.to(self.device, self.torch_dtype)
         _, _, H, W = pixel_values.shape
@@ -56,11 +56,10 @@ if __name__ == '__main__':
     model = BLIPCaptioner(device='cuda:0')
     # image_path = 'test_images/img2.jpg'
     image_path = 'image/SAM/img10.jpg'
-    seg_mask = np.zeros((15,15))
+    seg_mask = np.zeros((15, 15))
     seg_mask[5:10, 5:10] = 1
     seg_mask = 'test_images/img10.jpg.raw_mask.png'
     image_path = 'test_images/img2.jpg'
    seg_mask = 'test_images/img2.jpg.raw_mask.png'
     print(f'process image {image_path}')
     print(model.inference_with_reduced_tokens(image_path, seg_mask))
-
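With load_image in place, BLIPCaptioner.inference accepts a file path, a numpy array, or a PIL image. A usage sketch follows; it assumes the BLIP weights can be downloaded from the Hub and that test_images/img2.jpg exists locally, mirroring the paths used in the module's __main__ block.

import numpy as np
from PIL import Image

from caption_anything.captioner.blip import BLIPCaptioner

captioner = BLIPCaptioner(device='cpu')               # or 'cuda:0' as in the __main__ block
pil_img = Image.open('test_images/img2.jpg')

print(captioner.inference('test_images/img2.jpg'))    # path input
print(captioner.inference(pil_img))                   # PIL image input
print(captioner.inference(np.asarray(pil_img)))       # numpy array input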
caption_anything/captioner/blip2.py
CHANGED
@@ -4,7 +4,7 @@ import numpy as np
 from typing import Union
 from transformers import AutoProcessor, Blip2ForConditionalGeneration
 
-from caption_anything.utils.utils import is_platform_win
+from caption_anything.utils.utils import is_platform_win, load_image
 from .base_captioner import BaseCaptioner
 
 class BLIP2Captioner(BaseCaptioner):
@@ -21,11 +21,10 @@ class BLIP2Captioner(BaseCaptioner):
 
     @torch.no_grad()
     def inference(self, image: Union[np.ndarray, Image.Image, str], filter=False):
-        ...
-        image = Image.open(image)
+        image = load_image(image, return_type="pil")
 
         if not self.dialogue:
-            text_prompt = '
+            text_prompt = 'The image shows'
             inputs = self.processor(image, text = text_prompt, return_tensors="pt").to(self.device, self.torch_dtype)
             out = self.model.generate(**inputs, max_new_tokens=50)
             captions = self.processor.decode(out[0], skip_special_tokens=True).strip()
caption_anything/captioner/git.py
CHANGED
@@ -1,4 +1,6 @@
 from transformers import GitProcessor, AutoProcessor
+
+from caption_anything.utils.utils import load_image
 from .modeling_git import GitForCausalLM
 from PIL import Image
 import torch
@@ -15,11 +17,10 @@ class GITCaptioner(BaseCaptioner):
         self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
         self.processor = AutoProcessor.from_pretrained("microsoft/git-large")
         self.model = GitForCausalLM.from_pretrained("microsoft/git-large", torch_dtype=self.torch_dtype).to(self.device)
-
+
     @torch.no_grad()
     def inference(self, image: Union[np.ndarray, Image.Image, str], filter=False):
-
-        image = Image.open(image)
+        image = load_image(image, return_type="pil")
         pixel_values = self.processor(images=image, return_tensors="pt").pixel_values.to(self.device, self.torch_dtype)
         generated_ids = self.model.generate(pixel_values=pixel_values, max_new_tokens=50)
         generated_caption = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
@@ -29,10 +30,11 @@ class GITCaptioner(BaseCaptioner):
         return generated_caption
 
     @torch.no_grad()
-    def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, crop_mode="w_bg",
-
-
-
+    def inference_with_reduced_tokens(self, image: Union[np.ndarray, Image.Image, str], seg_mask, crop_mode="w_bg",
+                                      filter=False, disable_regular_box=False):
+        crop_save_path = self.generate_seg_cropped_image(image=image, seg_mask=seg_mask, crop_mode=crop_mode,
+                                                         disable_regular_box=disable_regular_box)
+        image = load_image(image, return_type="pil")
         inputs = self.processor(images=image, return_tensors="pt")
         pixel_values = inputs.pixel_values.to(self.device, self.torch_dtype)
         _, _, H, W = pixel_values.shape
@@ -48,10 +50,11 @@ class GITCaptioner(BaseCaptioner):
         print(f"\nProcessed ImageCaptioning by BLIPCaptioner, Output Text: {captions}")
         return captions, crop_save_path
 
+
 if __name__ == '__main__':
     model = GITCaptioner(device='cuda:2', enable_filter=False)
     image_path = 'test_images/img2.jpg'
-    seg_mask = np.zeros((224,224))
+    seg_mask = np.zeros((224, 224))
     seg_mask[50:200, 50:200] = 1
     print(f'process image {image_path}')
-    print(model.inference_with_reduced_tokens(image_path, seg_mask))
+    print(model.inference_with_reduced_tokens(image_path, seg_mask))
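Usage sketch (not part of the diff above): inference_with_reduced_tokens takes a 2-D 0/1 mask aligned with the image, as the __main__ block illustrates with a 224x224 toy mask; a mask produced by the SAM segmenter can be passed the same way. Paths and device below are placeholders.

import numpy as np
from PIL import Image
from caption_anything.captioner.git import GITCaptioner

captioner = GITCaptioner(device='cuda:0', enable_filter=False)
image = Image.open('test_images/img2.jpg')
seg_mask = np.zeros((image.height, image.width))   # 0 = background
seg_mask[50:200, 50:200] = 1                       # 1 = region to caption
caption, crop_path = captioner.inference_with_reduced_tokens(image, seg_mask)
print(caption, crop_path)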
caption_anything/model.py
CHANGED
@@ -62,9 +62,12 @@ class CaptionAnything:
             print('OpenAI GPT is not available')
 
     def inference(self, image, prompt, controls, disable_gpt=False, enable_wiki=False):
+        # TODO: Add support to multiple seg masks.
+
         # segment with prompt
         print("CA prompt: ", prompt, "CA controls", controls)
         seg_mask = self.segmenter.inference(image, prompt)[0, ...]
+
         if self.args.enable_morphologyex:
             seg_mask = 255 * seg_mask.astype(np.uint8)
             seg_mask = np.stack([seg_mask, seg_mask, seg_mask], axis=-1)
@@ -80,6 +83,7 @@ class CaptionAnything:
         seg_mask_img.save(mask_save_path)
         print('seg_mask path: ', mask_save_path)
         print("seg_mask.shape: ", seg_mask.shape)
+
         # captioning with mask
         if self.args.enable_reduce_tokens:
             caption, crop_save_path = self.captioner. \
@@ -92,6 +96,7 @@ class CaptionAnything:
                 inference_seg(image, seg_mask, crop_mode=self.args.seg_crop_mode,
                               filter=self.args.clip_filter,
                               disable_regular_box=self.args.disable_regular_box)
+
         # refining with TextRefiner
         context_captions = []
         if self.args.context_captions:
@@ -111,6 +116,7 @@ class CaptionAnything:
 
 if __name__ == "__main__":
     from caption_anything.utils.parser import parse_augment
+
     args = parse_augment()
     # image_path = 'test_images/img3.jpg'
     image_path = 'test_images/img1.jpg'
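Illustrative sketch of the kind of clean-up the enable_morphologyex branch applies to the scaled mask before captioning (the repo's exact kernel sizes are not shown in this hunk and are assumed here): morphological opening removes speckles and closing fills small holes.

import cv2
import numpy as np

seg_mask = np.zeros((256, 256), dtype=np.uint8)
seg_mask[60:200, 60:200] = 1
seg_mask[64, 64] = 0                      # a small hole in the foreground
scaled = 255 * seg_mask                   # same scaling as in inference()
kernel = np.ones((5, 5), np.uint8)        # kernel size is an assumption
opened = cv2.morphologyEx(scaled, cv2.MORPH_OPEN, kernel)
closed = cv2.morphologyEx(opened, cv2.MORPH_CLOSE, kernel)
print(closed.min(), closed.max())         # still a 0/255 mask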
caption_anything/segmenter/base_segmenter.py
CHANGED
@@ -5,7 +5,7 @@ from PIL import Image, ImageDraw, ImageOps
 import numpy as np
 from typing import Union
 from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
-from caption_anything.utils.utils import prepare_segmenter, seg_model_map
+from caption_anything.utils.utils import prepare_segmenter, seg_model_map, load_image
 import matplotlib.pyplot as plt
 import PIL
 
@@ -30,21 +30,9 @@ class BaseSegmenter:
         self.image_embedding = None
         self.image = None
 
-    def read_image(self, image: Union[np.ndarray, Image.Image, str]):
-        if type(image) == str:  # input path
-            image = Image.open(image)
-            image = np.array(image)
-        elif type(image) == Image.Image:
-            image = np.array(image)
-        elif type(image) == np.ndarray:
-            image = image
-        else:
-            raise TypeError
-        return image
-
     @torch.no_grad()
     def set_image(self, image: Union[np.ndarray, Image.Image, str]):
-        image =
+        image = load_image(image, return_type='numpy')
         self.image = image
         if self.reuse_feature:
             self.predictor.set_image(image)
@@ -57,7 +45,7 @@ class BaseSegmenter:
         SAM inference of image according to control.
         Args:
             image: str or PIL.Image or np.ndarray
-            control:
+            control: dict to control SAM.
             prompt_type:
                 1. {control['prompt_type'] = ['everything']} to segment everything in the image.
                 2. {control['prompt_type'] = ['click', 'box']} to segment according to click and box.
@@ -77,7 +65,7 @@ class BaseSegmenter:
             masks: np.ndarray of shape [num_masks, height, width]
 
         """
-        image =
+        image = load_image(image, return_type='numpy')
         if 'everything' in control['prompt_type']:
             masks = self.mask_generator.generate(image)
             new_masks = np.concatenate([mask["segmentation"][np.newaxis, :] for mask in masks])
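A hedged sketch of the control dicts the docstring above describes. Only 'prompt_type' is taken verbatim from the docstring; the constructor arguments and the 'input_point', 'input_label' and 'multimask_output' keys are assumptions modelled on the SAM predictor inputs and may differ from the keys the repository actually uses.

from caption_anything.segmenter.base_segmenter import BaseSegmenter

segmenter = BaseSegmenter(device='cuda:0', checkpoint='sam_vit_h_4b8939.pth')  # args assumed

everything_control = {'prompt_type': ['everything']}   # segment the whole image
click_control = {
    'prompt_type': ['click'],
    'input_point': [[250, 300]],   # assumed key: (x, y) pixel coordinate
    'input_label': [1],            # assumed key: 1 = foreground click
    'multimask_output': 'True',    # assumed key
}
masks = segmenter.inference('test_images/img1.jpg', click_control)  # np.ndarray [num_masks, H, W]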
caption_anything/utils/chatbot.py
CHANGED
@@ -19,22 +19,11 @@ from PIL import Image, ImageDraw, ImageOps
 from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
 
 VISUAL_CHATGPT_PREFIX = """
-Caption Anything Chatbox (short as CATchat) is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics.
+I want you act as Caption Anything Chatbox (short as CATchat), which is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. You are able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
 
-As a language model,
+As a language model, you can not directly read images, but can invoke VQA tool to indirectly understand pictures, by repeatly asking questions about the objects and scene of the image. You should carefully asking informative questions to maximize your information about this image content. Each image will have a file name formed as "chat_image/xxx.png", you are very strict to the file name and will never fabricate nonexistent files.
 
-
-
-
-# VISUAL_CHATGPT_PREFIX = """Visual ChatGPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Visual ChatGPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
-
-# Visual ChatGPT is able to process and understand large amounts of text and images. As a language model, Visual ChatGPT can not directly read images, but it has a list of tools to finish different visual tasks. Each image will have a file name formed as "chat_image/xxx.png", and Visual ChatGPT can invoke different tools to indirectly understand pictures. When talking about images, Visual ChatGPT is very strict to the file name and will never fabricate nonexistent files. Visual ChatGPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name.
-
-# Visual ChatGPT is aware of the coordinate of an object in the image, which is represented as a point (X, Y) on the object. Note that (0, 0) represents the bottom-left corner of the image.
-
-# Human may provide new figures to Visual ChatGPT with a description. The description helps Visual ChatGPT to understand this image, but Visual ChatGPT should use tools to finish following tasks, rather than directly imagine from the description.
-
-# Overall, Visual ChatGPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics.
+You have access to the following tools:"""
 
 
 # TOOLS:
@@ -63,8 +52,7 @@ Previous conversation history:
 {chat_history}
 
 New input: {input}
-
-The thoughts and observations are only visible for CATchat, CATchat should remember to repeat important information in the final response for Human.
+As a language model, you must repeatly to use VQA tools to observe images. You response should be consistent with the outputs of the VQA tool instead of imagination. Do not repeat asking the same question.
 
 Thought: Do I need to use a tool? {agent_scratchpad} (You are strictly to use the aforementioned "Thought/Action/Action Input/Observation" format as the answer.)"""
 
@@ -111,9 +99,9 @@ class VisualQuestionAnswering:
         # "Salesforce/blip-vqa-capfilt-large", torch_dtype=self.torch_dtype).to(self.device)
 
     @prompts(name="Answer Question About The Image",
-             description="useful when you need an answer for a question based on an image. "
-                         "like: what is the
-                         "The input to this tool should be a comma separated string of two, representing the
+             description="VQA tool is useful when you need an answer for a question based on an image. "
+                         "like: what is the color of an object, how many cats in this figure, where is the child sitting, what does the cat doing, why is he laughing."
+                         "The input to this tool should be a comma separated string of two, representing the image path and the question.")
    def inference(self, inputs):
        image_path, question = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
        raw_image = Image.open(image_path).convert('RGB')
@@ -151,12 +139,13 @@ def build_chatbot_tools(load_dict):
 class ConversationBot:
     def __init__(self, tools, api_key=""):
         # load_dict = {'VisualQuestionAnswering':'cuda:0', 'ImageCaptioning':'cuda:1',...}
-        llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0, openai_api_key=api_key)
+        llm = OpenAI(model_name="gpt-3.5-turbo", temperature=0.7, openai_api_key=api_key)
         self.llm = llm
         self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')
         self.tools = tools
         self.current_image = None
         self.point_prompt = ""
+        self.global_prompt = ""
         self.agent = initialize_agent(
             self.tools,
             self.llm,
@@ -212,7 +201,7 @@ if __name__ == '__main__':
     bot = ConversationBot(tools)
     with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
         with gr.Row():
-            chatbot = gr.Chatbot(elem_id="chatbot", label="
+            chatbot = gr.Chatbot(elem_id="chatbot", label="CATchat").style(height=1000,scale=0.5)
             auxwindow = gr.Chatbot(elem_id="chatbot", label="Aux Window").style(height=1000,scale=0.5)
             state = gr.State([])
             aux_state = gr.State([])
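Usage sketch (not part of the diff above), based on the __main__ block and the inline comment in ConversationBot.__init__; the exact load_dict contents and devices are assumptions.

from caption_anything.utils.chatbot import build_chatbot_tools, ConversationBot

load_dict = {'VisualQuestionAnswering': 'cuda:0', 'ImageCaptioning': 'cuda:0'}
tools = build_chatbot_tools(load_dict)
bot = ConversationBot(tools, api_key='sk-...')   # drives gpt-3.5-turbo at temperature 0.7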
caption_anything/utils/utils.py
CHANGED
@@ -1,13 +1,41 @@
 import os
+import time
+import sys
+
 import cv2
+import hashlib
 import requests
 import numpy as np
+
+from typing import Union
+
 from PIL import Image
-import time
-import sys
-import urllib
 from tqdm import tqdm
-
+
+
+def load_image(image: Union[np.ndarray, Image.Image, str], return_type='numpy'):
+    """
+    Load image from path or PIL.Image or numpy.ndarray to required format.
+    """
+
+    # Check if image is already in return_type
+    if isinstance(image, Image.Image) and return_type == 'pil' or \
+            isinstance(image, np.ndarray) and return_type == 'numpy':
+        return image
+
+    # PIL.Image as intermediate format
+    if isinstance(image, str):
+        image = Image.open(image)
+    elif isinstance(image, np.ndarray):
+        image = Image.fromarray(image)
+
+    if return_type == 'pil':
+        return image
+    elif return_type == 'numpy':
+        return np.asarray(image)
+    else:
+        raise NotImplementedError()
+
 
 def is_platform_win():
     return sys.platform == "win32"
@@ -114,7 +142,7 @@ def vis_add_mask(image, mask, color, alpha, kernel_size):
     mask = mask.astype('float').copy()
     mask = (cv2.GaussianBlur(mask, (kernel_size, kernel_size), kernel_size) / 255.) * (alpha)
     for i in range(3):
-        image[:, :, i] = image[:, :, i] * (1-alpha+mask) + color[i] * (alpha-mask)
+        image[:, :, i] = image[:, :, i] * (1 - alpha + mask) + color[i] * (alpha - mask)
     return image
 
 
@@ -122,11 +150,12 @@ def vis_add_mask_wo_blur(image, mask, color, alpha):
     color = np.array(color)
     mask = mask.astype('float').copy()
     for i in range(3):
-        image[:, :, i] = image[:, :, i] * (1-alpha+mask) + color[i] * (alpha-mask)
+        image[:, :, i] = image[:, :, i] * (1 - alpha + mask) + color[i] * (alpha - mask)
     return image
 
 
-def vis_add_mask_wo_gaussian(image, background_mask, contour_mask, background_color, contour_color, background_alpha,
+def vis_add_mask_wo_gaussian(image, background_mask, contour_mask, background_color, contour_color, background_alpha,
+                             contour_alpha):
     background_color = np.array(background_color)
     contour_color = np.array(contour_color)
 
@@ -134,16 +163,17 @@ def vis_add_mask_wo_gaussian(image, background_mask, contour_mask, background_co
     # contour_mask = 1 - contour_mask
 
     for i in range(3):
-        image[:, :, i] = image[:, :, i] * (1-background_alpha+background_mask*background_alpha) \
-                         + background_color[i] * (background_alpha-background_mask*background_alpha)
+        image[:, :, i] = image[:, :, i] * (1 - background_alpha + background_mask * background_alpha) \
+                         + background_color[i] * (background_alpha - background_mask * background_alpha)
 
-        image[:, :, i] = image[:, :, i] * (1-contour_alpha+contour_mask*contour_alpha) \
-                         + contour_color[i] * (contour_alpha-contour_mask*contour_alpha)
+        image[:, :, i] = image[:, :, i] * (1 - contour_alpha + contour_mask * contour_alpha) \
+                         + contour_color[i] * (contour_alpha - contour_mask * contour_alpha)
 
     return image.astype('uint8')
 
 
-def mask_painter(input_image, input_mask, background_alpha=0.7, background_blur_radius=7, contour_width=3,
+def mask_painter(input_image, input_mask, background_alpha=0.7, background_blur_radius=7, contour_width=3,
+                 contour_color=3, contour_alpha=1, background_color=0, paint_foreground=False):
     """
     add color mask to the background/foreground area
     input_image: numpy array (w, h, C)
@@ -163,23 +193,27 @@ def mask_painter(input_image, input_mask, background_alpha=0.7, background_blur_
     assert background_blur_radius % 2 * contour_width % 2 > 0, 'background_blur_radius and contour_width must be ODD'
 
     # 0: background, 1: foreground
-    input_mask[input_mask>0] = 255
+    input_mask[input_mask > 0] = 255
     if paint_foreground:
-        painted_image = vis_add_mask(input_image, 255 - input_mask, color_list[background_color], background_alpha,
+        painted_image = vis_add_mask(input_image, 255 - input_mask, color_list[background_color], background_alpha,
+                                     background_blur_radius)  # black for background
     else:
-
-        painted_image = vis_add_mask(input_image, input_mask, color_list[background_color], background_alpha,
+        # mask background
+        painted_image = vis_add_mask(input_image, input_mask, color_list[background_color], background_alpha,
+                                     background_blur_radius)  # black for background
     # mask contour
     contour_mask = input_mask.copy()
-    contour_mask = cv2.Canny(contour_mask, 100, 200)
+    contour_mask = cv2.Canny(contour_mask, 100, 200)  # contour extraction
     # widden contour
     kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (contour_width, contour_width))
     contour_mask = cv2.dilate(contour_mask, kernel)
-    painted_image = vis_add_mask(painted_image, 255-contour_mask, color_list[contour_color], contour_alpha,
+    painted_image = vis_add_mask(painted_image, 255 - contour_mask, color_list[contour_color], contour_alpha,
+                                 contour_width)
     return painted_image
 
 
-def mask_painter_foreground_all(input_image, input_masks, background_alpha=0.7, background_blur_radius=7,
+def mask_painter_foreground_all(input_image, input_masks, background_alpha=0.7, background_blur_radius=7,
+                                contour_width=3, contour_color=3, contour_alpha=1):
     """
     paint color mask on the all foreground area
     input_image: numpy array with shape (w, h, C)
@@ -194,22 +228,24 @@ def mask_painter_foreground_all(input_image, input_masks, background_alpha=0.7,
     Output:
         painted_image: numpy array
     """
-
+
     for i, input_mask in enumerate(input_masks):
-        input_image = mask_painter(input_image, input_mask,
+        input_image = mask_painter(input_image, input_mask, background_alpha, background_blur_radius, contour_width,
+                                   contour_color, contour_alpha, background_color=i + 2, paint_foreground=True)
     return input_image
 
+
 def mask_generator_00(mask, background_radius, contour_radius):
     # no background width when '00'
     # distance map
     dist_transform_fore = cv2.distanceTransform(mask, cv2.DIST_L2, 3)
-    dist_transform_back = cv2.distanceTransform(1-mask, cv2.DIST_L2, 3)
+    dist_transform_back = cv2.distanceTransform(1 - mask, cv2.DIST_L2, 3)
     dist_map = dist_transform_fore - dist_transform_back
     # ...:::!!!:::...
     contour_radius += 2
     contour_mask = np.abs(np.clip(dist_map, -contour_radius, contour_radius))
     contour_mask = contour_mask / np.max(contour_mask)
-    contour_mask[contour_mask>0.5] = 1.
+    contour_mask[contour_mask > 0.5] = 1.
 
     return mask, contour_mask
 
@@ -218,7 +254,7 @@ def mask_generator_01(mask, background_radius, contour_radius):
     # no background width when '00'
     # distance map
     dist_transform_fore = cv2.distanceTransform(mask, cv2.DIST_L2, 3)
-    dist_transform_back = cv2.distanceTransform(1-mask, cv2.DIST_L2, 3)
+    dist_transform_back = cv2.distanceTransform(1 - mask, cv2.DIST_L2, 3)
     dist_map = dist_transform_fore - dist_transform_back
     # ...:::!!!:::...
     contour_radius += 2
@@ -230,7 +266,7 @@ def mask_generator_01(mask, background_radius, contour_radius):
 def mask_generator_10(mask, background_radius, contour_radius):
     # distance map
     dist_transform_fore = cv2.distanceTransform(mask, cv2.DIST_L2, 3)
-    dist_transform_back = cv2.distanceTransform(1-mask, cv2.DIST_L2, 3)
+    dist_transform_back = cv2.distanceTransform(1 - mask, cv2.DIST_L2, 3)
     dist_map = dist_transform_fore - dist_transform_back
     # .....:::::!!!!!
     background_mask = np.clip(dist_map, -background_radius, background_radius)
@@ -240,14 +276,14 @@ def mask_generator_10(mask, background_radius, contour_radius):
     contour_radius += 2
     contour_mask = np.abs(np.clip(dist_map, -contour_radius, contour_radius))
     contour_mask = contour_mask / np.max(contour_mask)
-    contour_mask[contour_mask>0.5] = 1.
+    contour_mask[contour_mask > 0.5] = 1.
     return background_mask, contour_mask
 
 
 def mask_generator_11(mask, background_radius, contour_radius):
     # distance map
     dist_transform_fore = cv2.distanceTransform(mask, cv2.DIST_L2, 3)
-    dist_transform_back = cv2.distanceTransform(1-mask, cv2.DIST_L2, 3)
+    dist_transform_back = cv2.distanceTransform(1 - mask, cv2.DIST_L2, 3)
     dist_map = dist_transform_fore - dist_transform_back
     # .....:::::!!!!!
     background_mask = np.clip(dist_map, -background_radius, background_radius)
@@ -260,7 +296,8 @@ def mask_generator_11(mask, background_radius, contour_radius):
     return background_mask, contour_mask
 
 
-def mask_painter_wo_gaussian(input_image, input_mask, background_alpha=0.5, background_blur_radius=7, contour_width=3,
+def mask_painter_wo_gaussian(input_image, input_mask, background_alpha=0.5, background_blur_radius=7, contour_width=3,
+                             contour_color=3, contour_alpha=1, mode='11'):
     """
     Input:
         input_image: numpy array
@@ -283,8 +320,8 @@ def mask_painter_wo_gaussian(input_image, input_mask, background_alpha=0.5, back
     width, height = input_image.shape[0], input_image.shape[1]
     res = 1024
     ratio = min(1.0 * res / max(width, height), 1.0)
-    input_image = cv2.resize(input_image, (int(height*ratio), int(width*ratio)))
-    input_mask = cv2.resize(input_mask, (int(height*ratio), int(width*ratio)))
+    input_image = cv2.resize(input_image, (int(height * ratio), int(width * ratio)))
+    input_mask = cv2.resize(input_mask, (int(height * ratio), int(width * ratio)))
 
     # 0: background, 1: foreground
     msk = np.clip(input_mask, 0, 1)
@@ -292,23 +329,78 @@ def mask_painter_wo_gaussian(input_image, input_mask, background_alpha=0.5, back
     # generate masks for background and contour pixels
     background_radius = (background_blur_radius - 1) // 2
     contour_radius = (contour_width - 1) // 2
-    generator_dict = {'00':mask_generator_00, '01':mask_generator_01, '10':mask_generator_10,
+    generator_dict = {'00': mask_generator_00, '01': mask_generator_01, '10': mask_generator_10,
+                      '11': mask_generator_11}
     background_mask, contour_mask = generator_dict[mode](msk, background_radius, contour_radius)
 
     # paint
     painted_image = vis_add_mask_wo_gaussian \
-        (input_image, background_mask, contour_mask, color_list[0], color_list[contour_color], background_alpha,
+        (input_image, background_mask, contour_mask, color_list[0], color_list[contour_color], background_alpha,
+         contour_alpha)  # black for background
 
     return painted_image
 
 
+seg_model_map = {
+    'base': 'vit_b',
+    'large': 'vit_l',
+    'huge': 'vit_h'
+}
+ckpt_url_map = {
+    'vit_b': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth',
+    'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
+    'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'
+}
+expected_sha256_map = {
+    'vit_b': 'ec2df62732614e57411cdcf32a23ffdf28910380d03139ee0f4fcbe91eb8c912',
+    'vit_l': '3adcc4315b642a4d2101128f611684e8734c41232a17c648ed1693702a49a622',
+    'vit_h': 'a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e'
+}
+
+
+def prepare_segmenter(segmenter="huge", download_root: str = None):
+    """
+    Prepare segmenter model and download checkpoint if necessary.
+
+    Returns: segmenter model name from 'vit_b', 'vit_l', 'vit_h'.
+
+    """
+
+    os.makedirs('result', exist_ok=True)
+    seg_model_name = seg_model_map[segmenter]
+    checkpoint_url = ckpt_url_map[seg_model_name]
+    folder = download_root or os.path.expanduser("~/.cache/SAM")
+    filename = os.path.basename(checkpoint_url)
+    segmenter_checkpoint = download_checkpoint(checkpoint_url, folder, filename, expected_sha256_map[seg_model_name])
+
+    return seg_model_name, segmenter_checkpoint
+
+
+def download_checkpoint(url, folder, filename, expected_sha256):
+    os.makedirs(folder, exist_ok=True)
+    download_target = os.path.join(folder, filename)
+    if os.path.isfile(download_target):
+        if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
+            return download_target
+
+    print(f'Download SAM checkpoint {url}, saving to {download_target} ...')
+    with requests.get(url, stream=True) as response, open(download_target, "wb") as output:
+        progress = tqdm(total=int(response.headers.get('content-length', 0)), unit='B', unit_scale=True)
+        for data in response.iter_content(chunk_size=1024):
+            size = output.write(data)
+            progress.update(size)
+    if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
+        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match")
+    return download_target
+
+
 if __name__ == '__main__':
 
-    background_alpha = 0.7
-    background_blur_radius = 31
-    contour_width = 11
-    contour_color = 3
-    contour_alpha = 1
+    background_alpha = 0.7  # transparency of background 1: all black, 0: do nothing
+    background_blur_radius = 31  # radius of background blur, must be odd number
+    contour_width = 11  # contour width, must be odd number
+    contour_color = 3  # id in color map, 0: black, 1: white, >1: others
+    contour_alpha = 1  # transparency of background, 0: no contour highlighted
 
     # load input image and mask
     input_image = np.array(Image.open('./test_images/painter_input_image.jpg').convert('RGB'))
@@ -323,23 +415,28 @@ if __name__ == '__main__':
 
     for i in range(50):
         t2 = time.time()
-        painted_image_00 = mask_painter_wo_gaussian(input_image, input_mask, background_alpha, background_blur_radius,
+        painted_image_00 = mask_painter_wo_gaussian(input_image, input_mask, background_alpha, background_blur_radius,
+                                                    contour_width, contour_color, contour_alpha, mode='00')
         e2 = time.time()
 
         t3 = time.time()
-        painted_image_10 = mask_painter_wo_gaussian(input_image, input_mask, background_alpha, background_blur_radius,
+        painted_image_10 = mask_painter_wo_gaussian(input_image, input_mask, background_alpha, background_blur_radius,
+                                                    contour_width, contour_color, contour_alpha, mode='10')
        e3 = time.time()

        t1 = time.time()
-        painted_image = mask_painter(input_image, input_mask, background_alpha, background_blur_radius, contour_width,
+        painted_image = mask_painter(input_image, input_mask, background_alpha, background_blur_radius, contour_width,
+                                     contour_color, contour_alpha)
         e1 = time.time()
 
         t4 = time.time()
-        painted_image_01 = mask_painter_wo_gaussian(input_image, input_mask, background_alpha, background_blur_radius,
+        painted_image_01 = mask_painter_wo_gaussian(input_image, input_mask, background_alpha, background_blur_radius,
+                                                    contour_width, contour_color, contour_alpha, mode='01')
         e4 = time.time()
 
         t5 = time.time()
-        painted_image_11 = mask_painter_wo_gaussian(input_image, input_mask, background_alpha, background_blur_radius,
+        painted_image_11 = mask_painter_wo_gaussian(input_image, input_mask, background_alpha, background_blur_radius,
+                                                    contour_width, contour_color, contour_alpha, mode='11')
         e5 = time.time()
 
         overall_time_1 += (e1 - t1)
@@ -348,11 +445,11 @@ if __name__ == '__main__':
         overall_time_4 += (e4 - t4)
         overall_time_5 += (e5 - t5)
 
-    print(f'average time w gaussian: {overall_time_1/50}')
-    print(f'average time w/o gaussian00: {overall_time_2/50}')
-    print(f'average time w/o gaussian10: {overall_time_3/50}')
-    print(f'average time w/o gaussian01: {overall_time_4/50}')
-    print(f'average time w/o gaussian11: {overall_time_5/50}')
+    print(f'average time w gaussian: {overall_time_1 / 50}')
+    print(f'average time w/o gaussian00: {overall_time_2 / 50}')
+    print(f'average time w/o gaussian10: {overall_time_3 / 50}')
+    print(f'average time w/o gaussian01: {overall_time_4 / 50}')
+    print(f'average time w/o gaussian11: {overall_time_5 / 50}')
 
     # save
     painted_image_00 = Image.fromarray(painted_image_00)
@@ -366,54 +463,3 @@ if __name__ == '__main__':
 
     painted_image_11 = Image.fromarray(painted_image_11)
     painted_image_11.save('./test_images/painter_output_image_11.png')
-
-
-seg_model_map = {
-    'base': 'vit_b',
-    'large': 'vit_l',
-    'huge': 'vit_h'
-}
-ckpt_url_map = {
-    'vit_b': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth',
-    'vit_l': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth',
-    'vit_h': 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth'
-}
-expected_sha256_map = {
-    'vit_b': 'ec2df62732614e57411cdcf32a23ffdf28910380d03139ee0f4fcbe91eb8c912',
-    'vit_l': '3adcc4315b642a4d2101128f611684e8734c41232a17c648ed1693702a49a622',
-    'vit_h': 'a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e'
-}
-def prepare_segmenter(segmenter = "huge", download_root: str = None):
-    """
-    Prepare segmenter model and download checkpoint if necessary.
-
-    Returns: segmenter model name from 'vit_b', 'vit_l', 'vit_h'.
-
-    """
-
-    os.makedirs('result', exist_ok=True)
-    seg_model_name = seg_model_map[segmenter]
-    checkpoint_url = ckpt_url_map[seg_model_name]
-    folder = download_root or os.path.expanduser("~/.cache/SAM")
-    filename = os.path.basename(checkpoint_url)
-    segmenter_checkpoint = download_checkpoint(checkpoint_url, folder, filename, expected_sha256_map[seg_model_name])
-
-    return seg_model_name, segmenter_checkpoint
-
-
-def download_checkpoint(url, folder, filename, expected_sha256):
-    os.makedirs(folder, exist_ok=True)
-    download_target = os.path.join(folder, filename)
-    if os.path.isfile(download_target):
-        if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
-            return download_target
-
-    print(f'Download SAM checkpoint {url}, saving to {download_target} ...')
-    with requests.get(url, stream=True) as response, open(download_target, "wb") as output:
-        progress = tqdm(total=int(response.headers.get('content-length', 0)), unit='B', unit_scale=True)
-        for data in response.iter_content(chunk_size=1024):
-            size = output.write(data)
-            progress.update(size)
-    if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
-        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match")
-    return download_target
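Usage sketch of the two relocated/new helpers (not part of the diff above): load_image normalises the accepted input types, and prepare_segmenter downloads and sha256-verifies a SAM checkpoint (into ~/.cache/SAM unless download_root is given). The image path below is a placeholder.

import numpy as np
from PIL import Image
from caption_anything.utils.utils import load_image, prepare_segmenter

arr = load_image('test_images/img1.jpg', return_type='numpy')   # path -> np.ndarray
pil = load_image(arr, return_type='pil')                        # ndarray -> PIL.Image
assert isinstance(arr, np.ndarray) and isinstance(pil, Image.Image)
assert load_image(arr, return_type='numpy') is arr              # already in target type: returned as-is

seg_model_name, ckpt_path = prepare_segmenter('base')           # fetches sam_vit_b_01ec64.pth if missing
print(seg_model_name, ckpt_path)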