Commit 5b31094
Parent(s): 78f7941
Add application files
- .gitignore +200 -0
- LICENSE +21 -0
- annotate_anything.py +384 -0
- app.py +277 -0
- config.py +68 -0
- examples/dog.png +0 -0
- examples/eiffel.jpg +0 -0
- examples/eiffel.png +0 -0
- examples/girl.png +0 -0
- examples/horse.png +0 -0
- examples/horses.jpg +0 -0
- examples/traffic.jpg +0 -0
- requirements.txt +31 -0
- style.css +11 -0
- tag2text/LICENSE +21 -0
- tag2text/README.md +101 -0
- tag2text/configs/med_config.json +21 -0
- tag2text/configs/q2l_config.json +22 -0
- tag2text/configs/swin/config_swinB_384.json +9 -0
- tag2text/data/tag_list.txt +3429 -0
- tag2text/inference.py +102 -0
- tag2text/models/bert.py +1157 -0
- tag2text/models/swin_transformer.py +831 -0
- tag2text/models/tag2text.py +274 -0
- tag2text/models/utils.py +241 -0
- tag2text/models/vit.py +430 -0
- tag2text/requirements.txt +8 -0
- utils.py +263 -0
.gitignore
ADDED
@@ -0,0 +1,200 @@
+# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,metals
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,metals
+
+### Metals ###
+.metals/
+.bloop/
+project/**/metals.sbt
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,metals
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Binh Le
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
annotate_anything.py
ADDED
@@ -0,0 +1,384 @@
+import argparse
+import json
+import os
+import sys
+import tempfile
+
+import numpy as np
+import supervision as sv
+from groundingdino.util.inference import Model as DinoModel
+from imutils import paths
+from PIL import Image
+from segment_anything import sam_model_registry
+from segment_anything import SamAutomaticMaskGenerator
+from segment_anything import SamPredictor
+from supervision.detection.utils import xywh_to_xyxy
+from tqdm import tqdm
+
+sys.path.append("tag2text")
+
+from tag2text.models import tag2text
+from config import *
+from utils import detect, download_file_hf, segment, generate_tags, show_anns_sv
+
+
+def process(
+    tag2text_model,
+    grounding_dino_model,
+    sam_predictor,
+    sam_automask_generator,
+    image_path,
+    task,
+    prompt,
+    box_threshold,
+    text_threshold,
+    iou_threshold,
+    device,
+    output_dir=None,
+    save_mask=False,
+):
+    detections = None
+    metadata = {"image": {}, "annotations": [], "assets": {}}
+
+    if save_mask:
+        metadata["assets"]["intermediate_mask"] = []
+
+    try:
+        # Load image
+        image = Image.open(image_path)
+        image_pil = image.convert("RGB")
+        image = np.array(image_pil)
+
+        # Extract image metadata
+        filename = os.path.basename(image_path)
+        basename = os.path.splitext(filename)[0]
+        h, w = image.shape[:2]
+        metadata["image"]["file_name"] = filename
+        metadata["image"]["width"] = w
+        metadata["image"]["height"] = h
+
+        # Generate tags
+        if task in ["auto", "detection"] and prompt == "":
+            tags, caption = generate_tags(tag2text_model, image_pil, "None", device)
+            prompt = " . ".join(tags)
+            # print(f"Caption: {caption}")
+            # print(f"Tags: {tags}")
+
+            # ToDo: Extract metadata
+            metadata["image"]["caption"] = caption
+            metadata["image"]["tags"] = tags
+
+        if prompt:
+            metadata["prompt"] = prompt
+
+        # Detect boxes
+        if prompt != "":
+            detections, _, classes = detect(
+                grounding_dino_model,
+                image,
+                caption=prompt,
+                box_threshold=box_threshold,
+                text_threshold=text_threshold,
+                iou_threshold=iou_threshold,
+                post_process=True,
+            )
+
+            # Save detection image
+            if output_dir:
+                # Draw boxes
+                box_annotator = sv.BoxAnnotator()
+                labels = [
+                    f"{classes[class_id] if class_id else 'Unkown'} {confidence:0.2f}"
+                    for _, _, confidence, class_id, _ in detections
+                ]
+                box_image = box_annotator.annotate(
+                    scene=image, detections=detections, labels=labels
+                )
+                box_image_path = os.path.join(output_dir, basename + "_detect.png")
+                metadata["assets"]["detection"] = box_image_path
+                Image.fromarray(box_image).save(box_image_path)
+
+        # Segmentation
+        if task in ["auto", "segment"]:
+            if detections:
+                masks, scores = segment(
+                    sam_predictor, image=image, boxes=detections.xyxy
+                )
+                detections.mask = masks
+            else:
+                masks = sam_automask_generator.generate(image)
+                sorted_generated_masks = sorted(
+                    masks, key=lambda x: x["area"], reverse=True
+                )
+
+                xywh = np.array([mask["bbox"] for mask in sorted_generated_masks])
+                mask = np.array(
+                    [mask["segmentation"] for mask in sorted_generated_masks]
+                )
+                scores = np.array(
+                    [mask["predicted_iou"] for mask in sorted_generated_masks]
+                )
+                detections = sv.Detections(
+                    xyxy=xywh_to_xyxy(boxes_xywh=xywh), mask=mask
+                )
+
+            # Save annotated image
+            if output_dir:
+                mask_annotator = sv.MaskAnnotator()
+                mask_image, res = show_anns_sv(detections)
+                annotated_image = mask_annotator.annotate(image, detections=detections)
+
+                mask_image_path = os.path.join(output_dir, basename + "_mask.png")
+                metadata["assets"]["mask"] = mask_image_path
+                Image.fromarray(mask_image).save(mask_image_path)
+
+                # Save annotation encoding from https://github.com/LUSSeg/ImageNet-S
+                mask_enc_path = os.path.join(output_dir, basename + "_mask_enc.npy")
+                np.save(mask_enc_path, res)
+                metadata["assets"]["mask_enc"] = mask_enc_path
+
+                annotated_image_path = os.path.join(
+                    output_dir, basename + "_annotate.png"
+                )
+                metadata["assets"]["annotate"] = annotated_image_path
+                Image.fromarray(annotated_image).save(annotated_image_path)
+
+        # ToDo: Extract metadata
+        if detections:
+            id = 1
+            for (xyxy, mask, confidence, class_id, _), area, box_area, score in zip(
+                detections, detections.area, detections.box_area, scores
+            ):
+                annotation = {
+                    "id": id,
+                    "bbox": [int(x) for x in xyxy],
+                    "box_area": float(box_area),
+                }
+                if class_id:
+                    annotation["box_confidence"] = float(confidence)
+                    annotation["label"] = classes[class_id] if class_id else "Unkown"
+                if mask is not None:
+                    annotation["area"] = int(area)
+                    annotation["predicted_iou"] = float(score)
+                metadata["annotations"].append(annotation)
+
+                if output_dir and save_mask:
+                    mask_image_path = os.path.join(
+                        output_dir, f"{basename}_mask_{id}.png"
+                    )
+                    metadata["assets"]["intermediate_mask"].append(mask_image_path)
+                    Image.fromarray(mask * 255).save(mask_image_path)
+
+                id += 1
+
+        if output_dir:
+            meta_file_path = os.path.join(output_dir, basename + "_meta.json")
+            with open(meta_file_path, "w") as fp:
+                json.dump(metadata, fp)
+        else:
+            meta_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
+            meta_file_path = meta_file.name
+
+        return meta_file_path
+    except Exception as error:
+        raise ValueError(f"global exception: {error}")
+
+
+def main(args: argparse.Namespace) -> None:
+    device = args.device
+    prompt = args.prompt
+    task = args.task
+
+    tag2text_model = None
+    grounding_dino_model = None
+    sam_predictor = None
+    sam_automask_generator = None
+
+    box_threshold = args.box_threshold
+    text_threshold = args.text_threshold
+    iou_threshold = args.iou_threshold
+    save_mask = args.save_mask
+
+    # load model
+    if task in ["auto", "detection"] and prompt == "":
+        print("Loading Tag2Text model...")
+        tag2text_type = args.tag2text
+        tag2text_checkpoint = os.path.join(
+            abs_weight_dir, tag2text_dict[tag2text_type]["checkpoint_file"]
+        )
+        if not os.path.exists(tag2text_checkpoint):
+            print(f"Downloading weights for Tag2Text {tag2text_type} model")
+            os.system(
+                f"wget {tag2text_dict[tag2text_type]['checkpoint_url']} -O {tag2text_checkpoint}"
+            )
+        tag2text_model = tag2text.tag2text_caption(
+            pretrained=tag2text_checkpoint,
+            image_size=384,
+            vit="swin_b",
+            delete_tag_index=delete_tag_index,
+        )
+        # threshold for tagging
+        # we reduce the threshold to obtain more tags
+        tag2text_model.threshold = 0.64
+        tag2text_model.to(device)
+        tag2text_model.eval()
+
+    if task in ["auto", "detection"] or prompt != "":
+        print("Loading Grounding Dino model...")
+        dino_type = args.dino
+        dino_checkpoint = os.path.join(
+            abs_weight_dir, dino_dict[dino_type]["checkpoint_file"]
+        )
+        dino_config_file = os.path.join(
+            abs_weight_dir, dino_dict[dino_type]["config_file"]
+        )
+        if not os.path.exists(dino_checkpoint):
+            print(f"Downloading weights for Grounding Dino {dino_type} model")
+            dino_repo_id = dino_dict[dino_type]["repo_id"]
+            download_file_hf(
+                repo_id=dino_repo_id,
+                filename=dino_dict[dino_type]["checkpoint_file"],
+                cache_dir=weight_dir,
+            )
+            download_file_hf(
+                repo_id=dino_repo_id,
+                filename=dino_dict[dino_type]["checkpoint_file"],
+                cache_dir=weight_dir,
+            )
+        grounding_dino_model = DinoModel(
+            model_config_path=dino_config_file, model_checkpoint_path=dino_checkpoint
+        )
+
+    if task in ["auto", "segment"]:
+        print("Loading SAM...")
+        sam_type = args.sam
+        sam_checkpoint = os.path.join(
+            abs_weight_dir, sam_dict[sam_type]["checkpoint_file"]
+        )
+        if not os.path.exists(sam_checkpoint):
+            print(f"Downloading weights for SAM {sam_type}")
+            os.system(
+                f"wget {sam_dict[sam_type]['checkpoint_url']} -O {sam_checkpoint}"
+            )
+        sam = sam_model_registry[sam_type](checkpoint=sam_checkpoint)
+        sam.to(device=device)
+        sam_predictor = SamPredictor(sam)
+        sam_automask_generator = SamAutomaticMaskGenerator(sam)
+
+    if not os.path.exists(args.input):
+        raise ValueError("The input directory doesn't exist!")
+    elif not os.path.isdir(args.input):
+        image_paths = [args.input]
+    else:
+        image_paths = paths.list_images(args.input)
+
+    os.makedirs(args.output, exist_ok=True)
+
+    with tqdm(image_paths) as pbar:
+        for image_path in pbar:
+            pbar.set_postfix_str(f"Processing {image_path}")
+            process(
+                tag2text_model=tag2text_model,
+                grounding_dino_model=grounding_dino_model,
+                sam_predictor=sam_predictor,
+                sam_automask_generator=sam_automask_generator,
+                image_path=image_path,
+                task=task,
+                prompt=prompt,
+                box_threshold=box_threshold,
+                text_threshold=text_threshold,
+                iou_threshold=iou_threshold,
+                device=device,
+                output_dir=args.output,
+                save_mask=save_mask,
+            )
+
+
+if __name__ == "__main__":
+    if not os.path.exists(abs_weight_dir):
+        os.makedirs(abs_weight_dir, exist_ok=True)
+
+    parser = argparse.ArgumentParser(
+        description=(
+            "Runs automatic detection and mask generation on an input image or directory of images"
+        )
+    )
+
+    parser.add_argument(
+        "--input",
+        "-i",
+        type=str,
+        required=True,
+        help="Path to either a single input image or folder of images.",
+    )
+
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        required=True,
+        help=(
+            "Path to the directory where masks will be output. Output will be either a folder "
+            "of PNGs per image or a single json with COCO-style masks."
+        ),
+    )
+
+    parser.add_argument(
+        "--sam",
+        type=str,
+        default=default_sam,
+        choices=sam_dict.keys(),
+        help="The type of SA model to load",
+    )
+
+    parser.add_argument(
+        "--tag2text",
+        type=str,
+        default=default_tag2text,
+        choices=tag2text_dict.keys(),
+        help="The path to the Tag2Text checkpoint to use for tags and caption generation.",
+    )
+
+    parser.add_argument(
+        "--dino",
+        type=str,
+        default=default_dino,
+        choices=dino_dict.keys(),
+        help="The config file of Grounding Dino model to load",
+    )
+
+    parser.add_argument(
+        "--task",
+        help="Task to run",
+        default="auto",
+        choices=["auto", "detect", "segment"],
+        type=str,
+    )
+    parser.add_argument(
+        "--prompt",
+        help="Detection prompt",
+        default="",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--box-threshold", type=float, default=0.25, help="box threshold"
+    )
+    parser.add_argument(
+        "--text-threshold", type=float, default=0.2, help="text threshold"
+    )
+    parser.add_argument(
+        "--iou-threshold", type=float, default=0.5, help="iou threshold"
+    )
+
+    parser.add_argument(
+        "--save-mask",
+        action="store_true",
+        default=False,
+        help="If True, save all intermidiate masks.",
+    )
+    parser.add_argument(
+        "--device", type=str, default="cuda", help="The device to run generation on."
+    )
+    args = parser.parse_args()
+    main(args)
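For reference, here is a minimal sketch of driving the script above programmatically rather than from the command line. The field names mirror the argparse flags defined in annotate_anything.py; the input/output paths are illustrative assumptions, not part of the commit.

```python
# Hypothetical programmatic invocation of annotate_anything.py (illustrative only);
# the script is normally run via its argparse command-line interface.
import argparse

from annotate_anything import main

args = argparse.Namespace(
    input="examples/dog.png",   # single image or a folder of images (assumed path)
    output="outputs",           # directory for detections, masks and *_meta.json (assumed path)
    sam="default",              # key into sam_dict in config.py
    tag2text="swin_14m",        # key into tag2text_dict
    dino="swint_ogc",           # key into dino_dict
    task="auto",                # "auto", "detect", or "segment"
    prompt="",                  # empty prompt -> tags are generated by Tag2Text
    box_threshold=0.25,
    text_threshold=0.2,
    iou_threshold=0.5,
    save_mask=False,
    device="cuda",
)
main(args)
```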
app.py
ADDED
@@ -0,0 +1,277 @@
+import json
+import os
+import sys
+import tempfile
+
+import gradio as gr
+import numpy as np
+import supervision as sv
+import torch
+from groundingdino.util.inference import Model as DinoModel
+from PIL import Image
+from segment_anything import build_sam
+from segment_anything import SamAutomaticMaskGenerator
+from segment_anything import SamPredictor
+from supervision.detection.utils import mask_to_polygons
+from supervision.detection.utils import xywh_to_xyxy
+
+# segment anything
+# Grounding DINO
+
+sys.path.append("tag2text")
+
+from tag2text.models import tag2text
+from config import *
+from utils import download_file_hf, detect, segment, show_anns, generate_tags
+
+if not os.path.exists(abs_weight_dir):
+    os.makedirs(abs_weight_dir, exist_ok=True)
+
+sam_checkpoint = os.path.join(abs_weight_dir, sam_dict[default_sam]["checkpoint_file"])
+if not os.path.exists(sam_checkpoint):
+    os.system(f"wget {sam_dict[default_sam]['checkpoint_url']} -O {sam_checkpoint}")
+
+tag2text_checkpoint = os.path.join(
+    abs_weight_dir, tag2text_dict[default_tag2text]["checkpoint_file"]
+)
+if not os.path.exists(tag2text_checkpoint):
+    os.system(
+        f"wget {tag2text_dict[default_tag2text]['checkpoint_url']} -O {tag2text_checkpoint}"
+    )
+
+dino_checkpoint = os.path.join(
+    abs_weight_dir, dino_dict[default_dino]["checkpoint_file"]
+)
+dino_config_file = os.path.join(abs_weight_dir, dino_dict[default_dino]["config_file"])
+if not os.path.exists(dino_checkpoint):
+    dino_repo_id = dino_dict[default_dino]["repo_id"]
+    download_file_hf(
+        repo_id=dino_repo_id,
+        filename=dino_dict[default_dino]["config_file"],
+        cache_dir=weight_dir,
+    )
+    download_file_hf(
+        repo_id=dino_repo_id,
+        filename=dino_dict[default_dino]["checkpoint_file"],
+        cache_dir=weight_dir,
+    )
+
+# load model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+tag2text_model = tag2text.tag2text_caption(
+    pretrained=tag2text_checkpoint,
+    image_size=384,
+    vit="swin_b",
+    delete_tag_index=delete_tag_index,
+)
+# threshold for tagging
+# we reduce the threshold to obtain more tags
+tag2text_model.threshold = 0.64
+tag2text_model.to(device)
+tag2text_model.eval()
+
+
+sam = build_sam(checkpoint=sam_checkpoint)
+sam.to(device=device)
+sam_predictor = SamPredictor(sam)
+sam_automask_generator = SamAutomaticMaskGenerator(sam)
+
+grounding_dino_model = DinoModel(
+    model_config_path=dino_config_file, model_checkpoint_path=dino_checkpoint
+)
+
+
+def process(image_path, task, prompt, box_threshold, text_threshold, iou_threshold):
+    global tag2text_model, sam_predictor, sam_automask_generator, grounding_dino_model, device
+    output_gallery = []
+    detections = None
+    metadata = {"image": {}, "annotations": []}
+
+    try:
+        # Load image
+        image = Image.open(image_path)
+        image_pil = image.convert("RGB")
+        image = np.array(image_pil)
+
+        # Extract image metadata
+        filename = os.path.basename(image_path)
+        h, w = image.shape[:2]
+        metadata["image"]["file_name"] = filename
+        metadata["image"]["width"] = w
+        metadata["image"]["height"] = h
+
+        # Generate tags
+        if task in ["auto", "detection"] and prompt == "":
+            tags, caption = generate_tags(tag2text_model, image_pil, "None", device)
+            prompt = " . ".join(tags)
+            print(f"Caption: {caption}")
+            print(f"Tags: {tags}")
+
+            # ToDo: Extract metadata
+            metadata["image"]["caption"] = caption
+            metadata["image"]["tags"] = tags
+
+        if prompt:
+            metadata["prompt"] = prompt
+            print(f"Prompt: {prompt}")
+
+        # Detect boxes
+        if prompt != "":
+            detections, phrases, classes = detect(
+                grounding_dino_model,
+                image,
+                caption=prompt,
+                box_threshold=box_threshold,
+                text_threshold=text_threshold,
+                iou_threshold=iou_threshold,
+                post_process=True,
+            )
+
+            # Draw boxes
+            box_annotator = sv.BoxAnnotator()
+            labels = [
+                f"{classes[class_id] if class_id else 'Unkown'} {confidence:0.2f}"
+                for _, _, confidence, class_id, _ in detections
+            ]
+            image = box_annotator.annotate(
+                scene=image, detections=detections, labels=labels
+            )
+            output_gallery.append(image)
+
+        # Segmentation
+        if task in ["auto", "segment"]:
+            if detections:
+                masks, scores = segment(
+                    sam_predictor, image=image, boxes=detections.xyxy
+                )
+                detections.mask = masks
+            else:
+                masks = sam_automask_generator.generate(image)
+                sorted_generated_masks = sorted(
+                    masks, key=lambda x: x["area"], reverse=True
+                )
+
+                xywh = np.array([mask["bbox"] for mask in sorted_generated_masks])
+                mask = np.array(
+                    [mask["segmentation"] for mask in sorted_generated_masks]
+                )
+                scores = np.array(
+                    [mask["predicted_iou"] for mask in sorted_generated_masks]
+                )
+                detections = sv.Detections(
+                    xyxy=xywh_to_xyxy(boxes_xywh=xywh), mask=mask
+                )
+            # opacity = 0.4
+            # mask_image, _ = show_anns_sam(masks)
+            # annotated_image = np.uint8(mask_image * opacity + image * (1 - opacity))
+
+            mask_annotator = sv.MaskAnnotator()
+            mask_image = np.zeros_like(image, dtype=np.uint8)
+            mask_image = mask_annotator.annotate(
+                mask_image, detections=detections, opacity=1
+            )
+            annotated_image = mask_annotator.annotate(image, detections=detections)
+            output_gallery.append(mask_image)
+            output_gallery.append(annotated_image)
+
+        # ToDo: Extract metadata
+        if detections:
+            id = 1
+            for (xyxy, mask, confidence, class_id, _), area, box_area, score in zip(
+                detections, detections.area, detections.box_area, scores
+            ):
+                annotation = {
+                    "id": id,
+                    "bbox": [int(x) for x in xyxy],
+                    "box_area": float(box_area),
+                }
+                if class_id:
+                    annotation["box_confidence"] = float(confidence)
+                    annotation["label"] = classes[class_id] if class_id else "Unkown"
+                if mask is not None:
+                    # annotation["segmentation"] = mask_to_polygons(mask)
+                    annotation["area"] = int(area)
+                    annotation["predicted_iou"] = float(score)
+                metadata["annotations"].append(annotation)
+                id += 1
+
+        meta_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
+        meta_file_path = meta_file.name
+        with open(meta_file_path, "w") as fp:
+            json.dump(metadata, fp)
+
+        return output_gallery, meta_file_path
+    except Exception as error:
+        raise gr.Error(f"global exception: {error}")
+
+
+title = "Annotate Anything"
+
+with gr.Blocks(css="style.css", title=title) as demo:
+    with gr.Row(elem_classes=["container"]):
+        with gr.Column(scale=1):
+            input_image = gr.Image(type="filepath", label="Input")
+            task = gr.Dropdown(
+                ["detect", "segment", "auto"], value="auto", label="task_type"
+            )
+            text_prompt = gr.Textbox(label="Detection Prompt")
+            with gr.Accordion("Advanced parameters", open=False):
+                box_threshold = gr.Slider(
+                    minimum=0,
+                    maximum=1,
+                    value=0.3,
+                    step=0.05,
+                    label="Box threshold",
+                    info="Hash size to use for image hashing",
+                )
+                text_threshold = gr.Slider(
+                    minimum=0,
+                    maximum=1,
+                    value=0.25,
+                    step=0.05,
+                    label="Text threshold",
+                    info="Number of history images used to find out duplicate image",
+                )
+                iou_threshold = gr.Slider(
+                    minimum=0,
+                    maximum=1,
+                    value=0.5,
+                    step=0.05,
+                    label="IOU threshold",
+                    info="Minimum similarity threshold (in percent) to consider 2 images to be similar",
+                )
+            run_button = gr.Button(label="Run")
+
+        with gr.Column(scale=2):
+            gallery = gr.Gallery(
+                label="Generated images", show_label=False, elem_id="gallery"
+            ).style(preview=True, grid=2, object_fit="scale-down")
+            meta_file = gr.File(label="Metadata file")
+
+    with gr.Row(elem_classes=["container"]):
+        gr.Examples(
+            [
+                ["examples/dog.png", "auto", ""],
+                ["examples/eiffel.png", "auto", ""],
+                ["examples/eiffel.png", "segment", ""],
+                ["examples/girl.png", "auto", "girl . face"],
+                ["examples/horse.png", "detect", "horse"],
+                ["examples/horses.jpg", "auto", "horse"],
+                ["examples/traffic.jpg", "auto", ""],
+            ],
+            [input_image, task, text_prompt],
+        )
+    run_button.click(
+        fn=process,
+        inputs=[
+            input_image,
+            task,
+            text_prompt,
+            box_threshold,
+            text_threshold,
+            iou_threshold,
+        ],
+        outputs=[gallery, meta_file],
+    )
+
+demo.queue(concurrency_count=2).launch()
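The Gradio app returns the image gallery plus a metadata JSON file for download. A small sketch of reading that file back, assuming the structure built in `process()` above (the file name used here is a placeholder; in practice Gradio hands out a temporary path):

```python
# Illustrative reader for the metadata JSON produced by process(); "meta.json" is a placeholder path.
import json

with open("meta.json") as fp:
    meta = json.load(fp)

print(meta["image"]["file_name"], meta["image"]["width"], meta["image"]["height"])
for ann in meta["annotations"]:
    # "label" and "box_confidence" are only present for Grounding DINO detections
    print(ann["id"], ann.get("label", "n/a"), ann["bbox"], ann["box_area"])
```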
config.py
ADDED
@@ -0,0 +1,68 @@
+import os
+
+# Configurations
+tag2text_dict = {
+    "swin_14m": {
+        "checkpoint_url": "https://huggingface.co/spaces/xinyu1205/Tag2Text/resolve/main/tag2text_swin_14m.pth",
+        "checkpoint_file": "tag2text_swin_14m.pth",
+    }
+}
+
+sam_dict = {
+    "default": {
+        "checkpoint_url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
+        "checkpoint_file": "sam_vit_h_4b8939.pth",
+    },
+    "vit_h": {
+        "checkpoint_url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
+        "checkpoint_file": "sam_vit_h_4b8939.pth",
+    },
+    "vit_l": {
+        "checkpoint_url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
+        "checkpoint_file": "sam_vit_l_0b3195.pth",
+    },
+    "vit_b": {
+        "checkpoint_url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth",
+        "checkpoint_file": "sam_vit_b_01ec64.pth",
+    },
+}
+
+dino_dict = {
+    "swinb": {
+        "repo_id": "ShilongLiu/GroundingDINO",
+        "config_file": "GroundingDINO_SwinB.cfg.py",
+        "checkpoint_file": "groundingdino_swinb_cogcoor.pth",
+    },
+    "swint_ogc": {
+        "repo_id": "ShilongLiu/GroundingDINO",
+        "config_file": "GroundingDINO_SwinT_OGC.cfg.py",
+        "checkpoint_file": "groundingdino_swint_ogc.pth",
+    },
+}
+
+default_sam = "default"
+default_tag2text = "swin_14m"
+default_dino = "swint_ogc"
+
+root_dir = os.path.dirname(os.path.abspath(__file__))
+weight_dir = "weights"
+abs_weight_dir = os.path.join(root_dir, weight_dir)
+
+tag2text_checkpoint = "tag2text_swin_14m.pth"
+tag2text_url = "https://huggingface.co/spaces/xinyu1205/Tag2Text/resolve/main/tag2text_swin_14m.pth"
+sam_checkpoint = "sam_vit_h_4b8939.pth"
+sam_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
+output_dir = "outputs"
+
+dino_config_file = "GroundingDINO_SwinB.cfg.py"
+dino_repo_id = "ShilongLiu/GroundingDINO"
+dino_checkpoint = "groundingdino_swinb_cogcoor.pth"
+
+iou_threshold = 0.5
+box_threshold = 0.3
+text_threshold = 0.25
+
+# filter out attributes and action categories which are difficult to grounding
+delete_tag_index = []
+for i in range(3012, 3429):
+    delete_tag_index.append(i)
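As a quick illustration of how the dictionaries above are meant to be consumed (the same pattern appears in app.py and annotate_anything.py), resolving a local checkpoint path looks roughly like this; the snippet is a sketch, not part of the commit:

```python
# Sketch: resolve a local checkpoint path from the config dictionaries.
import os

from config import abs_weight_dir, default_sam, sam_dict

entry = sam_dict[default_sam]  # {"checkpoint_url": ..., "checkpoint_file": ...}
ckpt_path = os.path.join(abs_weight_dir, entry["checkpoint_file"])
print(ckpt_path)  # e.g. <repo>/weights/sam_vit_h_4b8939.pth
```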
examples/dog.png
ADDED
examples/eiffel.jpg
ADDED
examples/eiffel.png
ADDED
examples/girl.png
ADDED
examples/horse.png
ADDED
examples/horses.jpg
ADDED
examples/traffic.jpg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,31 @@
+accelerate
+addict
+gradio
+huggingface_hub
+matplotlib
+numpy
+onnxruntime
+opencv_python
+Pillow
+pycocotools
+pycocoevalcap
+PyYAML
+requests
+setuptools
+supervision
+termcolor
+timm
+torch
+torchvision
+transformers
+yapf
+numba
+scipy
+safetensors
+pynvml
+fairscale
+imutils
+argparse
+tqdm
+git+https://github.com/facebookresearch/segment-anything.git
+git+https://github.com/IDEA-Research/GroundingDINO
style.css
ADDED
@@ -0,0 +1,11 @@
+.container {
+  max-width: 1368px;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+#row-flex {
+  display: flex;
+  align-items: center;
+  justify-content: center;
+}
tag2text/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 OPPO LLC
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
tag2text/README.md
ADDED
@@ -0,0 +1,101 @@
+# :label: Tag2Text: Guiding Vision-Language Model via Image Tagging
+
+Official PyTorch implementation of <a href="https://arxiv.org/abs/2303.05657">Tag2Text</a>, an efficient and controllable vision-language model with tagging guidance. Code is available now!
+
+Welcome to try out the [Tag2Text Web demo🤗](https://huggingface.co/spaces/xinyu1205/Tag2Text)! Both tagging and captioning are included.
+
+Tag2Text is now combined with [Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything), which can automatically recognize, detect, and segment an image! Tag2Text showcases powerful image recognition capabilities:
+
+
+## :fire: News
+
+- **`2023/05/20`**: Tag2Text is combined with [VideoChat](https://github.com/OpenGVLab/Ask-Anything), providing powerful tagging and captioning capabilities as a fundamental component!
+- **`2023/04/20`**: We marry [Tag2Text with Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything) to provide powerful image recognition capabilities!
+- **`2023/04/10`**: Code and checkpoint are available now!
+- **`2023/03/14`**: The [Tag2Text web demo 🤗](https://huggingface.co/spaces/xinyu1205/Tag2Text) is available on Hugging Face Space!
+
+## :bulb: Highlight
+
+- **Tagging.** Without manual annotations, Tag2Text achieves **superior** image tag recognition ability over [**3,429**](./data/tag_list.txt) commonly human-used categories.
+- **Efficient.** Tagging guidance effectively enhances the performance of vision-language models on both **generation-based** and **alignment-based** tasks.
+- **Controllable.** Tag2Text permits users to input **desired tags**, providing the flexibility to compose corresponding texts based on the input tags.
+
+<p align="center">
+  <table class="tg">
+    <tr>
+      <td class="tg-c3ow"><img src="images/tag2text_framework.png" align="center" width="800" ></td>
+    </tr>
+  </table>
+</p>
+
+## :writing_hand: TODO
+
+- [x] Release demo.
+- [x] Release checkpoints.
+- [x] Release inference code.
+- [ ] Release training codes.
+- [ ] Release training datasets.
+
+## :toolbox: Checkpoints
+
+<!-- insert a table -->
+
+<table>
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>name</th>
+      <th>backbone</th>
+      <th>Data</th>
+      <th>Illustration</th>
+      <th>Checkpoint</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>1</th>
+      <td>Tag2Text-Swin</td>
+      <td>Swin-Base</td>
+      <td>COCO, VG, SBU, CC-3M, CC-12M</td>
+      <td>Demo version with comprehensive captions.</td>
+      <td><a href="https://huggingface.co/spaces/xinyu1205/Tag2Text/blob/main/tag2text_swin_14m.pth">Download link</a></td>
+    </tr>
+  </tbody>
+</table>
+
+## :running: Model Inference
+
+1. Install the dependencies, run:
+
+<pre/>pip install -r requirements.txt</pre>
+
+2. Download Tag2Text pretrained checkpoints.
+
+3. Get the tagging and captioning results:
+<pre/>
+python inference.py --image images/1641173_2291260800.jpg \
+--pretrained pretrained/tag2text_swin_14m.pth
+</pre>
+Or get the tagging and specified captioning results (optional):
+<pre/>python inference.py --image images/1641173_2291260800.jpg \
+--pretrained pretrained/tag2text_swin_14m.pth \
+--specified-tags "cloud,sky"</pre>
+
+## :black_nib: Citation
+
+If you find our work useful for your research, please consider citing.
+
+```
+@article{huang2023tag2text,
+  title={Tag2Text: Guiding Vision-Language Model via Image Tagging},
+  author={Huang, Xinyu and Zhang, Youcai and Ma, Jinyu and Tian, Weiwei and Feng, Rui and Zhang, Yuejie and Li, Yaqian and Guo, Yandong and Zhang, Lei},
+  journal={arXiv preprint arXiv:2303.05657},
+  year={2023}
+}
+```
+
+## :hearts: Acknowledgements
+
+This work is done with the help of the amazing code base of [BLIP](https://github.com/salesforce/BLIP), thanks very much!
+
+We also want to thank @Cheng Rui @Shilong Liu @Ren Tianhe for their help in [marrying Tag2Text with Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything).
tag2text/configs/med_config.json
ADDED
@@ -0,0 +1,21 @@
+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "type_vocab_size": 2,
+  "vocab_size": 30524,
+  "encoder_width": 768,
+  "add_cross_attention": true
+}
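This is a standard BERT-style configuration plus an extra `encoder_width` field, which appears to be consumed by the bundled bert.py. As a hedged illustration of typical usage (an assumption, not code from this commit), such a file can be loaded with Hugging Face's `BertConfig`:

```python
# Illustrative: load the BERT-style config shipped with Tag2Text; extra keys are kept as attributes.
from transformers import BertConfig

cfg = BertConfig.from_json_file("tag2text/configs/med_config.json")
print(cfg.hidden_size, cfg.num_hidden_layers, cfg.vocab_size)  # 768 12 30524
```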
tag2text/configs/q2l_config.json
ADDED
@@ -0,0 +1,22 @@
+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 2,
+  "pad_token_id": 0,
+  "type_vocab_size": 2,
+  "vocab_size": 30522,
+  "encoder_width": 768,
+  "add_cross_attention": true,
+  "add_tag_cross_attention": false
+}
tag2text/configs/swin/config_swinB_384.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "ckpt": "pretrain_model/swin_base_patch4_window7_224_22k.pth",
+  "vision_width": 1024,
+  "image_res": 384,
+  "window_size": 12,
+  "embed_dim": 128,
+  "depths": [ 2, 2, 18, 2 ],
+  "num_heads": [ 4, 8, 16, 32 ]
+}
tag2text/data/tag_list.txt
ADDED
@@ -0,0 +1,3429 @@
(3,429 tag entries; the file contents are not rendered in this view)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tennis
|
| 2 |
+
bear cub
|
| 3 |
+
observatory
|
| 4 |
+
bicycle
|
| 5 |
+
hillside
|
| 6 |
+
judge
|
| 7 |
+
watercolor illustration
|
| 8 |
+
granite
|
| 9 |
+
lobster
|
| 10 |
+
livery
|
| 11 |
+
stone
|
| 12 |
+
ceramic
|
| 13 |
+
ranch
|
| 14 |
+
cloth
|
| 15 |
+
smile
|
| 16 |
+
building
|
| 17 |
+
tattoo
|
| 18 |
+
cricketer
|
| 19 |
+
cheek
|
| 20 |
+
pear
|
| 21 |
+
source
|
| 22 |
+
winter
|
| 23 |
+
surface
|
| 24 |
+
spray
|
| 25 |
+
ceremony
|
| 26 |
+
magic
|
| 27 |
+
curve
|
| 28 |
+
container
|
| 29 |
+
fair
|
| 30 |
+
medicine
|
| 31 |
+
baby
|
| 32 |
+
tennis racquet
|
| 33 |
+
ornament
|
| 34 |
+
bamboo
|
| 35 |
+
duckling
|
| 36 |
+
song
|
| 37 |
+
safari
|
| 38 |
+
team presentation
|
| 39 |
+
daffodil
|
| 40 |
+
cross
|
| 41 |
+
toothpaste
|
| 42 |
+
shield
|
| 43 |
+
fashion model
|
| 44 |
+
capsule
|
| 45 |
+
map
|
| 46 |
+
creek
|
| 47 |
+
glass house
|
| 48 |
+
glass plate
|
| 49 |
+
siding
|
| 50 |
+
corner
|
| 51 |
+
water buffalo
|
| 52 |
+
bison
|
| 53 |
+
figure skater
|
| 54 |
+
diploma
|
| 55 |
+
tire
|
| 56 |
+
race
|
| 57 |
+
cable car
|
| 58 |
+
brain
|
| 59 |
+
gas stove
|
| 60 |
+
soap bubble
|
| 61 |
+
palette
|
| 62 |
+
snowboard
|
| 63 |
+
school child
|
| 64 |
+
trench coat
|
| 65 |
+
monk
|
| 66 |
+
fiber
|
| 67 |
+
kitchen window
|
| 68 |
+
sunglass
|
| 69 |
+
coffee
|
| 70 |
+
security
|
| 71 |
+
strawberry
|
| 72 |
+
penguin
|
| 73 |
+
tree root
|
| 74 |
+
loaf
|
| 75 |
+
engagement ring
|
| 76 |
+
lamb
|
| 77 |
+
vector cartoon illustration
|
| 78 |
+
sandwich
|
| 79 |
+
mountain village
|
| 80 |
+
shape
|
| 81 |
+
charm
|
| 82 |
+
fiction
|
| 83 |
+
knot
|
| 84 |
+
greenhouse
|
| 85 |
+
sushi
|
| 86 |
+
text
|
| 87 |
+
disaster
|
| 88 |
+
trophy
|
| 89 |
+
gang
|
| 90 |
+
strap
|
| 91 |
+
soccer game
|
| 92 |
+
cardinal
|
| 93 |
+
tee
|
| 94 |
+
turtle
|
| 95 |
+
water surface
|
| 96 |
+
grassland
|
| 97 |
+
dolphin
|
| 98 |
+
store
|
| 99 |
+
dirt
|
| 100 |
+
iceberg
|
| 101 |
+
pergola
|
| 102 |
+
farmer market
|
| 103 |
+
publicity portrait
|
| 104 |
+
tote bag
|
| 105 |
+
teenage girl
|
| 106 |
+
view mirror
|
| 107 |
+
session
|
| 108 |
+
commuter
|
| 109 |
+
dressing room
|
| 110 |
+
tricycle
|
| 111 |
+
christmas ball
|
| 112 |
+
headlight
|
| 113 |
+
police
|
| 114 |
+
armchair
|
| 115 |
+
chart
|
| 116 |
+
yacht
|
| 117 |
+
saw
|
| 118 |
+
printer
|
| 119 |
+
rock band
|
| 120 |
+
gingerbread house
|
| 121 |
+
tag
|
| 122 |
+
table lamp
|
| 123 |
+
hockey game
|
| 124 |
+
slope
|
| 125 |
+
font
|
| 126 |
+
wicker basket
|
| 127 |
+
jewelry
|
| 128 |
+
quarter
|
| 129 |
+
software
|
| 130 |
+
weapon
|
| 131 |
+
pin
|
| 132 |
+
worship
|
| 133 |
+
painter
|
| 134 |
+
goal
|
| 135 |
+
morning light
|
| 136 |
+
bike
|
| 137 |
+
baseball bat
|
| 138 |
+
elevator
|
| 139 |
+
cuisine
|
| 140 |
+
sausage
|
| 141 |
+
stunt
|
| 142 |
+
wrestler
|
| 143 |
+
statue
|
| 144 |
+
landing
|
| 145 |
+
pillar
|
| 146 |
+
willow tree
|
| 147 |
+
sea wave
|
| 148 |
+
chicken
|
| 149 |
+
peanut
|
| 150 |
+
muscle
|
| 151 |
+
bob
|
| 152 |
+
tv genre
|
| 153 |
+
bathroom window
|
| 154 |
+
radish
|
| 155 |
+
textile
|
| 156 |
+
pelican
|
| 157 |
+
marketplace
|
| 158 |
+
crest
|
| 159 |
+
elevation map
|
| 160 |
+
gift
|
| 161 |
+
parish
|
| 162 |
+
traffic light
|
| 163 |
+
campfire
|
| 164 |
+
fog
|
| 165 |
+
award winner
|
| 166 |
+
beach ball
|
| 167 |
+
mat
|
| 168 |
+
white house
|
| 169 |
+
plaster
|
| 170 |
+
moped
|
| 171 |
+
football team
|
| 172 |
+
solution
|
| 173 |
+
bicyclist
|
| 174 |
+
bit
|
| 175 |
+
playground
|
| 176 |
+
darkness
|
| 177 |
+
cake
|
| 178 |
+
maple leave
|
| 179 |
+
mold
|
| 180 |
+
cracker
|
| 181 |
+
blueberry
|
| 182 |
+
rubble
|
| 183 |
+
container ship
|
| 184 |
+
pedestrian bridge
|
| 185 |
+
snail
|
| 186 |
+
parrot
|
| 187 |
+
form
|
| 188 |
+
circuit
|
| 189 |
+
highlight
|
| 190 |
+
pickup truck
|
| 191 |
+
koala
|
| 192 |
+
rain
|
| 193 |
+
system
|
| 194 |
+
weather
|
| 195 |
+
raincoat
|
| 196 |
+
soccer team
|
| 197 |
+
windshield
|
| 198 |
+
thunderstorm
|
| 199 |
+
mike
|
| 200 |
+
bird house
|
| 201 |
+
bridge
|
| 202 |
+
grandfather
|
| 203 |
+
restroom
|
| 204 |
+
animation
|
| 205 |
+
wilderness
|
| 206 |
+
clown
|
| 207 |
+
banana
|
| 208 |
+
brown
|
| 209 |
+
braid
|
| 210 |
+
dining room
|
| 211 |
+
kindergarten
|
| 212 |
+
launch event
|
| 213 |
+
purple
|
| 214 |
+
school
|
| 215 |
+
stairwell
|
| 216 |
+
brooch
|
| 217 |
+
movie poster image
|
| 218 |
+
mountain river
|
| 219 |
+
shelf
|
| 220 |
+
wicket
|
| 221 |
+
headboard
|
| 222 |
+
buddha
|
| 223 |
+
flower field
|
| 224 |
+
dugout
|
| 225 |
+
cd
|
| 226 |
+
bald eagle
|
| 227 |
+
lagoon
|
| 228 |
+
seaweed
|
| 229 |
+
agriculture
|
| 230 |
+
emergency service
|
| 231 |
+
maple tree
|
| 232 |
+
parachute
|
| 233 |
+
continent
|
| 234 |
+
amusement park
|
| 235 |
+
remote
|
| 236 |
+
bun
|
| 237 |
+
tackle
|
| 238 |
+
hospital
|
| 239 |
+
garage door
|
| 240 |
+
birthday party
|
| 241 |
+
friendship
|
| 242 |
+
go
|
| 243 |
+
mausoleum
|
| 244 |
+
jeep
|
| 245 |
+
raccoon
|
| 246 |
+
step
|
| 247 |
+
ice hockey team
|
| 248 |
+
cigarette
|
| 249 |
+
lace dress
|
| 250 |
+
forest floor
|
| 251 |
+
mall
|
| 252 |
+
captain
|
| 253 |
+
milk
|
| 254 |
+
golf course
|
| 255 |
+
meal
|
| 256 |
+
picnic table
|
| 257 |
+
sail
|
| 258 |
+
volleyball
|
| 259 |
+
canal
|
| 260 |
+
terrace
|
| 261 |
+
computer desk
|
| 262 |
+
caravan
|
| 263 |
+
hotel
|
| 264 |
+
cheerleader
|
| 265 |
+
nurse
|
| 266 |
+
museum
|
| 267 |
+
marsh
|
| 268 |
+
fox
|
| 269 |
+
plateau
|
| 270 |
+
night
|
| 271 |
+
twin
|
| 272 |
+
letter logo
|
| 273 |
+
autumn tree
|
| 274 |
+
powder
|
| 275 |
+
convention
|
| 276 |
+
creature
|
| 277 |
+
lighthouse
|
| 278 |
+
shop window
|
| 279 |
+
jacket
|
| 280 |
+
stork
|
| 281 |
+
taxi
|
| 282 |
+
trade
|
| 283 |
+
blackboard
|
| 284 |
+
olive
|
| 285 |
+
road sign
|
| 286 |
+
resort
|
| 287 |
+
snowflake
|
| 288 |
+
cemetery
|
| 289 |
+
travel
|
| 290 |
+
evening dress
|
| 291 |
+
picnic
|
| 292 |
+
drink
|
| 293 |
+
winter morning
|
| 294 |
+
football player
|
| 295 |
+
snack
|
| 296 |
+
boxing glove
|
| 297 |
+
dinner party
|
| 298 |
+
airline
|
| 299 |
+
swing
|
| 300 |
+
port
|
| 301 |
+
wheelbarrow
|
| 302 |
+
bathroom sink
|
| 303 |
+
sweater
|
| 304 |
+
ambulance
|
| 305 |
+
gear
|
| 306 |
+
oil
|
| 307 |
+
wii controller
|
| 308 |
+
array
|
| 309 |
+
home office
|
| 310 |
+
car show
|
| 311 |
+
mixture
|
| 312 |
+
profession
|
| 313 |
+
tree frog
|
| 314 |
+
square
|
| 315 |
+
facility
|
| 316 |
+
coral reef
|
| 317 |
+
sea wall
|
| 318 |
+
pizza
|
| 319 |
+
exhibit
|
| 320 |
+
demolition
|
| 321 |
+
trout
|
| 322 |
+
ring
|
| 323 |
+
coffee shop
|
| 324 |
+
bracelet
|
| 325 |
+
bean
|
| 326 |
+
lip
|
| 327 |
+
fencing
|
| 328 |
+
landscape
|
| 329 |
+
sitting
|
| 330 |
+
package
|
| 331 |
+
metal
|
| 332 |
+
bust
|
| 333 |
+
king
|
| 334 |
+
hair
|
| 335 |
+
window seat
|
| 336 |
+
wildlife
|
| 337 |
+
trunk
|
| 338 |
+
greenery
|
| 339 |
+
stencil
|
| 340 |
+
fire hydrant
|
| 341 |
+
bridesmaid
|
| 342 |
+
plaza
|
| 343 |
+
alps
|
| 344 |
+
tower bridge
|
| 345 |
+
crop top
|
| 346 |
+
crossing
|
| 347 |
+
cinema
|
| 348 |
+
pedestrian crossing
|
| 349 |
+
family
|
| 350 |
+
shopping cart
|
| 351 |
+
stomach
|
| 352 |
+
church building
|
| 353 |
+
screen door
|
| 354 |
+
skater
|
| 355 |
+
soccer field
|
| 356 |
+
kettle
|
| 357 |
+
mussel
|
| 358 |
+
raindrop
|
| 359 |
+
candy cane
|
| 360 |
+
water lily
|
| 361 |
+
flower girl
|
| 362 |
+
desert
|
| 363 |
+
enclosure
|
| 364 |
+
christmas light
|
| 365 |
+
kitchen
|
| 366 |
+
caterpillar
|
| 367 |
+
plaid
|
| 368 |
+
bath
|
| 369 |
+
bush
|
| 370 |
+
mud
|
| 371 |
+
ballet
|
| 372 |
+
knee
|
| 373 |
+
adult
|
| 374 |
+
raft
|
| 375 |
+
sea view
|
| 376 |
+
cactus
|
| 377 |
+
office chair
|
| 378 |
+
overall
|
| 379 |
+
rim
|
| 380 |
+
scaffolding
|
| 381 |
+
pig
|
| 382 |
+
cover
|
| 383 |
+
poster page
|
| 384 |
+
sprinkle
|
| 385 |
+
chandelier
|
| 386 |
+
algae
|
| 387 |
+
traffic
|
| 388 |
+
surfboard
|
| 389 |
+
book
|
| 390 |
+
filming
|
| 391 |
+
flash
|
| 392 |
+
mansion
|
| 393 |
+
camouflage
|
| 394 |
+
trouser
|
| 395 |
+
ticket
|
| 396 |
+
weed
|
| 397 |
+
cab
|
| 398 |
+
trench
|
| 399 |
+
elephant
|
| 400 |
+
huddle
|
| 401 |
+
sphere
|
| 402 |
+
christmas decoration
|
| 403 |
+
city
|
| 404 |
+
launch
|
| 405 |
+
doll
|
| 406 |
+
christmas ornament
|
| 407 |
+
fabric
|
| 408 |
+
bikini
|
| 409 |
+
biplane
|
| 410 |
+
breakfast
|
| 411 |
+
neighbourhood
|
| 412 |
+
race track
|
| 413 |
+
foliage
|
| 414 |
+
avocado
|
| 415 |
+
school bus
|
| 416 |
+
footwear
|
| 417 |
+
highway
|
| 418 |
+
ocean view
|
| 419 |
+
art vector illustration
|
| 420 |
+
wall clock
|
| 421 |
+
curtain
|
| 422 |
+
teenager
|
| 423 |
+
kitchen area
|
| 424 |
+
robot
|
| 425 |
+
tusk
|
| 426 |
+
lounge chair
|
| 427 |
+
beam
|
| 428 |
+
paddle
|
| 429 |
+
camel
|
| 430 |
+
lid
|
| 431 |
+
world map
|
| 432 |
+
city view
|
| 433 |
+
newlywed
|
| 434 |
+
cargo ship
|
| 435 |
+
yellow
|
| 436 |
+
exhibition
|
| 437 |
+
bend
|
| 438 |
+
novel
|
| 439 |
+
wool
|
| 440 |
+
ontario
|
| 441 |
+
bread
|
| 442 |
+
campus
|
| 443 |
+
coastline
|
| 444 |
+
cutting board
|
| 445 |
+
booth
|
| 446 |
+
table top
|
| 447 |
+
carpet
|
| 448 |
+
beach chair
|
| 449 |
+
workout
|
| 450 |
+
street food
|
| 451 |
+
fun
|
| 452 |
+
costumer film designer
|
| 453 |
+
gadget
|
| 454 |
+
artist
|
| 455 |
+
fishing village
|
| 456 |
+
builder
|
| 457 |
+
violinist
|
| 458 |
+
iphone
|
| 459 |
+
spider web
|
| 460 |
+
traffic sign
|
| 461 |
+
ruin
|
| 462 |
+
rescue
|
| 463 |
+
clipboard
|
| 464 |
+
seal
|
| 465 |
+
film director
|
| 466 |
+
paw
|
| 467 |
+
nursery
|
| 468 |
+
intersection
|
| 469 |
+
tomato sauce
|
| 470 |
+
taste
|
| 471 |
+
paddy field
|
| 472 |
+
christmas tree
|
| 473 |
+
wave
|
| 474 |
+
stool
|
| 475 |
+
watering can
|
| 476 |
+
rug
|
| 477 |
+
daytime
|
| 478 |
+
subway station
|
| 479 |
+
craft
|
| 480 |
+
pine forest
|
| 481 |
+
black
|
| 482 |
+
planet
|
| 483 |
+
motif
|
| 484 |
+
christmas market
|
| 485 |
+
glass window
|
| 486 |
+
college
|
| 487 |
+
wheat
|
| 488 |
+
damage
|
| 489 |
+
rectangle
|
| 490 |
+
picture frame
|
| 491 |
+
chess
|
| 492 |
+
guest room
|
| 493 |
+
street corner
|
| 494 |
+
religion
|
| 495 |
+
seed
|
| 496 |
+
puzzle
|
| 497 |
+
freeway
|
| 498 |
+
beauty
|
| 499 |
+
ocean
|
| 500 |
+
watch
|
| 501 |
+
mother
|
| 502 |
+
garage
|
| 503 |
+
quote
|
| 504 |
+
dj
|
| 505 |
+
supporter
|
| 506 |
+
hip hop artist
|
| 507 |
+
muffin
|
| 508 |
+
eiffel tower
|
| 509 |
+
cash
|
| 510 |
+
firefighter
|
| 511 |
+
cauliflower
|
| 512 |
+
bunker
|
| 513 |
+
sled
|
| 514 |
+
manicure
|
| 515 |
+
shark
|
| 516 |
+
stall
|
| 517 |
+
jungle
|
| 518 |
+
family home
|
| 519 |
+
tour bus
|
| 520 |
+
chimney
|
| 521 |
+
touchdown
|
| 522 |
+
roundabout
|
| 523 |
+
coyote
|
| 524 |
+
street scene
|
| 525 |
+
tank
|
| 526 |
+
wedding dress
|
| 527 |
+
mantle
|
| 528 |
+
bedroom window
|
| 529 |
+
coconut
|
| 530 |
+
chapel
|
| 531 |
+
goat
|
| 532 |
+
living space
|
| 533 |
+
rock wall
|
| 534 |
+
polka dot
|
| 535 |
+
railway
|
| 536 |
+
mandala
|
| 537 |
+
mango
|
| 538 |
+
lesson
|
| 539 |
+
mountain landscape
|
| 540 |
+
team photo
|
| 541 |
+
bookshelf
|
| 542 |
+
meter
|
| 543 |
+
bulldog
|
| 544 |
+
evening sun
|
| 545 |
+
stick
|
| 546 |
+
card
|
| 547 |
+
pink
|
| 548 |
+
fish pond
|
| 549 |
+
paint
|
| 550 |
+
pill
|
| 551 |
+
cart
|
| 552 |
+
pea
|
| 553 |
+
van
|
| 554 |
+
album
|
| 555 |
+
football college game
|
| 556 |
+
mountain pass
|
| 557 |
+
doughnut
|
| 558 |
+
ski slope
|
| 559 |
+
match
|
| 560 |
+
official
|
| 561 |
+
shadow
|
| 562 |
+
organ
|
| 563 |
+
celebration
|
| 564 |
+
coin
|
| 565 |
+
log cabin
|
| 566 |
+
firework display
|
| 567 |
+
present
|
| 568 |
+
twig
|
| 569 |
+
chef
|
| 570 |
+
confetti
|
| 571 |
+
footpath
|
| 572 |
+
tour
|
| 573 |
+
ponytail
|
| 574 |
+
artwork
|
| 575 |
+
race car
|
| 576 |
+
club
|
| 577 |
+
season
|
| 578 |
+
hose
|
| 579 |
+
pencil
|
| 580 |
+
aircraft
|
| 581 |
+
rock formation
|
| 582 |
+
wardrobe
|
| 583 |
+
participant
|
| 584 |
+
politician
|
| 585 |
+
engineer
|
| 586 |
+
peace
|
| 587 |
+
filter
|
| 588 |
+
sailing boat
|
| 589 |
+
water bottle
|
| 590 |
+
service dog
|
| 591 |
+
poodle
|
| 592 |
+
loki
|
| 593 |
+
statesman
|
| 594 |
+
sleeping bag
|
| 595 |
+
outskirt
|
| 596 |
+
clock
|
| 597 |
+
factory
|
| 598 |
+
oak tree
|
| 599 |
+
physician
|
| 600 |
+
color
|
| 601 |
+
room
|
| 602 |
+
stairway
|
| 603 |
+
company
|
| 604 |
+
lady
|
| 605 |
+
graph
|
| 606 |
+
faucet
|
| 607 |
+
tablecloth
|
| 608 |
+
subway train
|
| 609 |
+
chocolate chip cookie
|
| 610 |
+
headquarters
|
| 611 |
+
screw
|
| 612 |
+
goggle
|
| 613 |
+
halloween
|
| 614 |
+
city street
|
| 615 |
+
swirl
|
| 616 |
+
cord
|
| 617 |
+
forward
|
| 618 |
+
bone
|
| 619 |
+
bedding
|
| 620 |
+
archway
|
| 621 |
+
wig
|
| 622 |
+
lobby
|
| 623 |
+
mask
|
| 624 |
+
attic
|
| 625 |
+
kitchen table
|
| 626 |
+
skylight
|
| 627 |
+
fire
|
| 628 |
+
exit
|
| 629 |
+
oil painting
|
| 630 |
+
passenger
|
| 631 |
+
meditation
|
| 632 |
+
salmon
|
| 633 |
+
fedora
|
| 634 |
+
rubber stamp
|
| 635 |
+
orange juice
|
| 636 |
+
arch
|
| 637 |
+
scientist
|
| 638 |
+
stroll
|
| 639 |
+
manhattan
|
| 640 |
+
float
|
| 641 |
+
baseball uniform
|
| 642 |
+
circle
|
| 643 |
+
church
|
| 644 |
+
decker bus
|
| 645 |
+
competitor
|
| 646 |
+
zoo
|
| 647 |
+
basketball team
|
| 648 |
+
tourist
|
| 649 |
+
daughter
|
| 650 |
+
silverware
|
| 651 |
+
ceiling fan
|
| 652 |
+
birth
|
| 653 |
+
vase
|
| 654 |
+
jack
|
| 655 |
+
mushroom
|
| 656 |
+
spiral
|
| 657 |
+
cage
|
| 658 |
+
limb
|
| 659 |
+
salad
|
| 660 |
+
ad
|
| 661 |
+
control
|
| 662 |
+
earth
|
| 663 |
+
party
|
| 664 |
+
bolt
|
| 665 |
+
tractor
|
| 666 |
+
barley
|
| 667 |
+
wedding photo
|
| 668 |
+
hawk
|
| 669 |
+
warehouse
|
| 670 |
+
vegetable garden
|
| 671 |
+
chocolate cake
|
| 672 |
+
cabbage
|
| 673 |
+
floor window
|
| 674 |
+
baby shower
|
| 675 |
+
magnifying glass
|
| 676 |
+
table
|
| 677 |
+
stethoscope
|
| 678 |
+
reading
|
| 679 |
+
mission
|
| 680 |
+
croissant
|
| 681 |
+
gift box
|
| 682 |
+
rocket
|
| 683 |
+
forest road
|
| 684 |
+
cooking
|
| 685 |
+
suite
|
| 686 |
+
hill country
|
| 687 |
+
motorcycle
|
| 688 |
+
baseball player
|
| 689 |
+
angle
|
| 690 |
+
drug
|
| 691 |
+
sport association
|
| 692 |
+
championship
|
| 693 |
+
family portrait
|
| 694 |
+
florist
|
| 695 |
+
softball
|
| 696 |
+
egret
|
| 697 |
+
office
|
| 698 |
+
plywood
|
| 699 |
+
jockey
|
| 700 |
+
mosque
|
| 701 |
+
brunch
|
| 702 |
+
beanie
|
| 703 |
+
office building
|
| 704 |
+
pattern
|
| 705 |
+
calendar
|
| 706 |
+
indoor
|
| 707 |
+
pepper
|
| 708 |
+
ledge
|
| 709 |
+
trail
|
| 710 |
+
fuel
|
| 711 |
+
laptop computer
|
| 712 |
+
tennis shoe
|
| 713 |
+
deck chair
|
| 714 |
+
guitarist
|
| 715 |
+
barn
|
| 716 |
+
surgery
|
| 717 |
+
cartoon illustration
|
| 718 |
+
nebula
|
| 719 |
+
railroad
|
| 720 |
+
mountain goat
|
| 721 |
+
goose
|
| 722 |
+
car door
|
| 723 |
+
cheer
|
| 724 |
+
liquid
|
| 725 |
+
hardwood floor
|
| 726 |
+
pathway
|
| 727 |
+
acorn
|
| 728 |
+
gull
|
| 729 |
+
airliner
|
| 730 |
+
couch
|
| 731 |
+
lake house
|
| 732 |
+
spaghetti
|
| 733 |
+
promenade
|
| 734 |
+
collection
|
| 735 |
+
garden
|
| 736 |
+
bank
|
| 737 |
+
robin
|
| 738 |
+
tennis ball
|
| 739 |
+
peony
|
| 740 |
+
gymnast
|
| 741 |
+
lavender
|
| 742 |
+
deck
|
| 743 |
+
test
|
| 744 |
+
riverside
|
| 745 |
+
rapper
|
| 746 |
+
domino
|
| 747 |
+
bride
|
| 748 |
+
mouse
|
| 749 |
+
basil
|
| 750 |
+
wedding couple
|
| 751 |
+
ocean wave
|
| 752 |
+
arm
|
| 753 |
+
kitchen floor
|
| 754 |
+
grove
|
| 755 |
+
family member
|
| 756 |
+
backyard
|
| 757 |
+
raspberry
|
| 758 |
+
forest fire
|
| 759 |
+
officer
|
| 760 |
+
hibiscus
|
| 761 |
+
canyon
|
| 762 |
+
composer
|
| 763 |
+
signature
|
| 764 |
+
olive oil
|
| 765 |
+
hibiscus flower
|
| 766 |
+
rose
|
| 767 |
+
vector icon
|
| 768 |
+
sunrise
|
| 769 |
+
horseback
|
| 770 |
+
motor scooter
|
| 771 |
+
office worker
|
| 772 |
+
tradition
|
| 773 |
+
ingredient
|
| 774 |
+
washing machine
|
| 775 |
+
lighting
|
| 776 |
+
bagel
|
| 777 |
+
sailboat
|
| 778 |
+
policeman
|
| 779 |
+
mare
|
| 780 |
+
graphic
|
| 781 |
+
halloween pumpkin
|
| 782 |
+
stock
|
| 783 |
+
pilot
|
| 784 |
+
education
|
| 785 |
+
team
|
| 786 |
+
body
|
| 787 |
+
horse
|
| 788 |
+
kimono
|
| 789 |
+
bazaar
|
| 790 |
+
bag
|
| 791 |
+
recording studio
|
| 792 |
+
parsley
|
| 793 |
+
entrance
|
| 794 |
+
denim
|
| 795 |
+
vet
|
| 796 |
+
horse farm
|
| 797 |
+
charcoal
|
| 798 |
+
architecture
|
| 799 |
+
glass vase
|
| 800 |
+
puppy
|
| 801 |
+
estuary
|
| 802 |
+
television show host
|
| 803 |
+
city bus
|
| 804 |
+
shoulder
|
| 805 |
+
beast
|
| 806 |
+
balance
|
| 807 |
+
golfer
|
| 808 |
+
roadside
|
| 809 |
+
denim jacket
|
| 810 |
+
stone wall
|
| 811 |
+
counter top
|
| 812 |
+
app icon
|
| 813 |
+
toast
|
| 814 |
+
head coach
|
| 815 |
+
ham
|
| 816 |
+
warrior
|
| 817 |
+
gem
|
| 818 |
+
refrigerator
|
| 819 |
+
snowman
|
| 820 |
+
construction worker
|
| 821 |
+
coal
|
| 822 |
+
website
|
| 823 |
+
morning fog
|
| 824 |
+
mustard
|
| 825 |
+
human
|
| 826 |
+
owl
|
| 827 |
+
puppy dog
|
| 828 |
+
piggy bank
|
| 829 |
+
vegetation
|
| 830 |
+
pirate
|
| 831 |
+
action film
|
| 832 |
+
marshmallow
|
| 833 |
+
thanksgiving
|
| 834 |
+
business
|
| 835 |
+
disease
|
| 836 |
+
signage
|
| 837 |
+
greeting
|
| 838 |
+
skate park
|
| 839 |
+
tile
|
| 840 |
+
mouth
|
| 841 |
+
spinach
|
| 842 |
+
vacation
|
| 843 |
+
leader
|
| 844 |
+
shrine
|
| 845 |
+
walker
|
| 846 |
+
science fiction film
|
| 847 |
+
bill
|
| 848 |
+
rabbit
|
| 849 |
+
motor boat
|
| 850 |
+
bar
|
| 851 |
+
radio
|
| 852 |
+
barge
|
| 853 |
+
tail
|
| 854 |
+
chainsaw
|
| 855 |
+
gallery
|
| 856 |
+
rainbow
|
| 857 |
+
pasta
|
| 858 |
+
padlock
|
| 859 |
+
web
|
| 860 |
+
pastry
|
| 861 |
+
ink
|
| 862 |
+
reef
|
| 863 |
+
school uniform
|
| 864 |
+
shawl
|
| 865 |
+
treasure
|
| 866 |
+
peach
|
| 867 |
+
dinner table
|
| 868 |
+
injury
|
| 869 |
+
harbor
|
| 870 |
+
witch
|
| 871 |
+
car dealership
|
| 872 |
+
litter
|
| 873 |
+
gesture
|
| 874 |
+
documentary
|
| 875 |
+
marriage
|
| 876 |
+
sea shell
|
| 877 |
+
priest
|
| 878 |
+
dome
|
| 879 |
+
kit
|
| 880 |
+
icon
|
| 881 |
+
seaside
|
| 882 |
+
bucket
|
| 883 |
+
entertainment
|
| 884 |
+
stable
|
| 885 |
+
hat
|
| 886 |
+
puddle
|
| 887 |
+
sock
|
| 888 |
+
shopper
|
| 889 |
+
technology
|
| 890 |
+
harbour
|
| 891 |
+
orbit
|
| 892 |
+
antler
|
| 893 |
+
tube
|
| 894 |
+
flag waving
|
| 895 |
+
cook
|
| 896 |
+
tight
|
| 897 |
+
commander
|
| 898 |
+
farmland
|
| 899 |
+
switch
|
| 900 |
+
hiker
|
| 901 |
+
wedding ceremony
|
| 902 |
+
award ceremony
|
| 903 |
+
champion
|
| 904 |
+
chopstick
|
| 905 |
+
farmhouse
|
| 906 |
+
performer
|
| 907 |
+
spike
|
| 908 |
+
accident
|
| 909 |
+
cruise ship
|
| 910 |
+
passenger train
|
| 911 |
+
attraction
|
| 912 |
+
entertainer
|
| 913 |
+
rear view
|
| 914 |
+
sidewalk
|
| 915 |
+
parade
|
| 916 |
+
racing
|
| 917 |
+
plane
|
| 918 |
+
ritual
|
| 919 |
+
peacock
|
| 920 |
+
pocket
|
| 921 |
+
plum
|
| 922 |
+
drop
|
| 923 |
+
carrot
|
| 924 |
+
floor
|
| 925 |
+
sunset
|
| 926 |
+
troop
|
| 927 |
+
architect
|
| 928 |
+
coffee table
|
| 929 |
+
dust
|
| 930 |
+
outline
|
| 931 |
+
leather
|
| 932 |
+
charity event
|
| 933 |
+
heat
|
| 934 |
+
whale
|
| 935 |
+
laundry
|
| 936 |
+
coconut tree
|
| 937 |
+
crosswalk
|
| 938 |
+
pony
|
| 939 |
+
ant
|
| 940 |
+
pipe
|
| 941 |
+
string
|
| 942 |
+
coat
|
| 943 |
+
angel
|
| 944 |
+
beef
|
| 945 |
+
church tower
|
| 946 |
+
dish
|
| 947 |
+
pitch
|
| 948 |
+
cupboard
|
| 949 |
+
thermometer
|
| 950 |
+
dirt field
|
| 951 |
+
fireworks
|
| 952 |
+
minute
|
| 953 |
+
cane
|
| 954 |
+
pajama
|
| 955 |
+
flower garden
|
| 956 |
+
autumn
|
| 957 |
+
trash can
|
| 958 |
+
dachshund
|
| 959 |
+
banana tree
|
| 960 |
+
tray
|
| 961 |
+
moose
|
| 962 |
+
roadway
|
| 963 |
+
carnival
|
| 964 |
+
antenna
|
| 965 |
+
pole
|
| 966 |
+
castle wall
|
| 967 |
+
ram
|
| 968 |
+
cattle
|
| 969 |
+
hay
|
| 970 |
+
cookie
|
| 971 |
+
swimmer
|
| 972 |
+
baseball team
|
| 973 |
+
strait
|
| 974 |
+
hedge
|
| 975 |
+
jet
|
| 976 |
+
fire pit
|
| 977 |
+
octopus
|
| 978 |
+
calf
|
| 979 |
+
cube
|
| 980 |
+
opera
|
| 981 |
+
cardboard box
|
| 982 |
+
tiara
|
| 983 |
+
kitchen sink
|
| 984 |
+
prairie
|
| 985 |
+
bowl
|
| 986 |
+
galaxy
|
| 987 |
+
straw hat
|
| 988 |
+
linen
|
| 989 |
+
ski resort
|
| 990 |
+
stitch
|
| 991 |
+
street lamp
|
| 992 |
+
motorist
|
| 993 |
+
icicle
|
| 994 |
+
stain
|
| 995 |
+
flora
|
| 996 |
+
drain
|
| 997 |
+
kitchen cabinet
|
| 998 |
+
decor
|
| 999 |
+
bouquet
|
| 1000 |
+
pound
|
| 1001 |
+
interior design
|
| 1002 |
+
nail polish
|
| 1003 |
+
figurine
|
| 1004 |
+
tomb
|
| 1005 |
+
disc
|
| 1006 |
+
twist
|
| 1007 |
+
blouse
|
| 1008 |
+
ribbon
|
| 1009 |
+
figure
|
| 1010 |
+
burger
|
| 1011 |
+
cork
|
| 1012 |
+
soccer goalkeeper
|
| 1013 |
+
train bridge
|
| 1014 |
+
drinking water
|
| 1015 |
+
dew
|
| 1016 |
+
baker
|
| 1017 |
+
storm cloud
|
| 1018 |
+
tarmac
|
| 1019 |
+
tv drama
|
| 1020 |
+
sponge
|
| 1021 |
+
magnet
|
| 1022 |
+
sailor
|
| 1023 |
+
entry
|
| 1024 |
+
swan
|
| 1025 |
+
exercise
|
| 1026 |
+
sloth
|
| 1027 |
+
jewel
|
| 1028 |
+
scuba diver
|
| 1029 |
+
bite
|
| 1030 |
+
cat tree
|
| 1031 |
+
tent
|
| 1032 |
+
can
|
| 1033 |
+
tennis match
|
| 1034 |
+
ecosystem
|
| 1035 |
+
picket fence
|
| 1036 |
+
palm
|
| 1037 |
+
train car
|
| 1038 |
+
frying pan
|
| 1039 |
+
rally
|
| 1040 |
+
tablet pc
|
| 1041 |
+
reindeer
|
| 1042 |
+
image
|
| 1043 |
+
wolf
|
| 1044 |
+
chin
|
| 1045 |
+
conservatory
|
| 1046 |
+
flood water
|
| 1047 |
+
cityscape
|
| 1048 |
+
beach sand
|
| 1049 |
+
car park
|
| 1050 |
+
pavement
|
| 1051 |
+
farm field
|
| 1052 |
+
swimming
|
| 1053 |
+
winter storm
|
| 1054 |
+
stem
|
| 1055 |
+
pillow
|
| 1056 |
+
inning
|
| 1057 |
+
gorilla
|
| 1058 |
+
desk
|
| 1059 |
+
avenue
|
| 1060 |
+
fern
|
| 1061 |
+
money
|
| 1062 |
+
pearl
|
| 1063 |
+
train station
|
| 1064 |
+
skillet
|
| 1065 |
+
nap
|
| 1066 |
+
barber
|
| 1067 |
+
library
|
| 1068 |
+
freezer
|
| 1069 |
+
label
|
| 1070 |
+
rainforest
|
| 1071 |
+
parking sign
|
| 1072 |
+
mirror
|
| 1073 |
+
wing
|
| 1074 |
+
noodle
|
| 1075 |
+
press room
|
| 1076 |
+
sculpture
|
| 1077 |
+
tablet
|
| 1078 |
+
viewer
|
| 1079 |
+
prayer
|
| 1080 |
+
mini
|
| 1081 |
+
mechanic
|
| 1082 |
+
laugh
|
| 1083 |
+
rice field
|
| 1084 |
+
hand
|
| 1085 |
+
mustache
|
| 1086 |
+
mountain road
|
| 1087 |
+
catwalk
|
| 1088 |
+
conference
|
| 1089 |
+
cape
|
| 1090 |
+
installation
|
| 1091 |
+
musician
|
| 1092 |
+
stream
|
| 1093 |
+
machine
|
| 1094 |
+
speech
|
| 1095 |
+
crocodile
|
| 1096 |
+
soccer match
|
| 1097 |
+
town square
|
| 1098 |
+
passport
|
| 1099 |
+
post box
|
| 1100 |
+
point
|
| 1101 |
+
stone building
|
| 1102 |
+
motorway
|
| 1103 |
+
mix
|
| 1104 |
+
dentist
|
| 1105 |
+
businessperson
|
| 1106 |
+
happiness
|
| 1107 |
+
boat
|
| 1108 |
+
vineyard
|
| 1109 |
+
treadmill
|
| 1110 |
+
glass wall
|
| 1111 |
+
water droplet
|
| 1112 |
+
coffee mug
|
| 1113 |
+
graduate
|
| 1114 |
+
sunflower
|
| 1115 |
+
parliament
|
| 1116 |
+
shepherd
|
| 1117 |
+
movie
|
| 1118 |
+
wine
|
| 1119 |
+
orchard
|
| 1120 |
+
tulip
|
| 1121 |
+
motherboard
|
| 1122 |
+
cup
|
| 1123 |
+
broom
|
| 1124 |
+
spot
|
| 1125 |
+
drawing
|
| 1126 |
+
polo shirt
|
| 1127 |
+
graduation
|
| 1128 |
+
film producer
|
| 1129 |
+
moonlight
|
| 1130 |
+
glow
|
| 1131 |
+
film format
|
| 1132 |
+
t shirt
|
| 1133 |
+
rock face
|
| 1134 |
+
sword
|
| 1135 |
+
clinic
|
| 1136 |
+
festival day
|
| 1137 |
+
meadow
|
| 1138 |
+
staple
|
| 1139 |
+
pupil
|
| 1140 |
+
training ground
|
| 1141 |
+
rider
|
| 1142 |
+
flower
|
| 1143 |
+
foal
|
| 1144 |
+
wharf
|
| 1145 |
+
foot bridge
|
| 1146 |
+
shooting
|
| 1147 |
+
top
|
| 1148 |
+
mast
|
| 1149 |
+
police car
|
| 1150 |
+
robe
|
| 1151 |
+
wedding bouquet
|
| 1152 |
+
stop sign
|
| 1153 |
+
birthday cake
|
| 1154 |
+
glitter
|
| 1155 |
+
butter
|
| 1156 |
+
scooter
|
| 1157 |
+
tundra
|
| 1158 |
+
superhero
|
| 1159 |
+
pocket watch
|
| 1160 |
+
inscription
|
| 1161 |
+
youngster
|
| 1162 |
+
fruit tree
|
| 1163 |
+
movie poster
|
| 1164 |
+
engine
|
| 1165 |
+
foundation
|
| 1166 |
+
motorcyclist
|
| 1167 |
+
take
|
| 1168 |
+
woman
|
| 1169 |
+
antelope
|
| 1170 |
+
country artist
|
| 1171 |
+
road trip
|
| 1172 |
+
typewriter
|
| 1173 |
+
tuxedo
|
| 1174 |
+
brand
|
| 1175 |
+
pine
|
| 1176 |
+
bathroom
|
| 1177 |
+
paradise
|
| 1178 |
+
texture
|
| 1179 |
+
balloon
|
| 1180 |
+
dining table
|
| 1181 |
+
home
|
| 1182 |
+
computer screen
|
| 1183 |
+
actor
|
| 1184 |
+
clip
|
| 1185 |
+
tv tower
|
| 1186 |
+
panorama
|
| 1187 |
+
summit
|
| 1188 |
+
cat
|
| 1189 |
+
plot
|
| 1190 |
+
eagle
|
| 1191 |
+
dancer
|
| 1192 |
+
pup
|
| 1193 |
+
studio shot
|
| 1194 |
+
tear
|
| 1195 |
+
bird bath
|
| 1196 |
+
classroom
|
| 1197 |
+
bookstore
|
| 1198 |
+
city wall
|
| 1199 |
+
tv programme
|
| 1200 |
+
blade
|
| 1201 |
+
easel
|
| 1202 |
+
buttercream
|
| 1203 |
+
sweet
|
| 1204 |
+
designer
|
| 1205 |
+
diamond
|
| 1206 |
+
handshake
|
| 1207 |
+
herb
|
| 1208 |
+
corn field
|
| 1209 |
+
seafront
|
| 1210 |
+
concrete
|
| 1211 |
+
street artist
|
| 1212 |
+
gas
|
| 1213 |
+
stamp
|
| 1214 |
+
window display
|
| 1215 |
+
paper
|
| 1216 |
+
note
|
| 1217 |
+
pint
|
| 1218 |
+
quarry
|
| 1219 |
+
research
|
| 1220 |
+
fixture
|
| 1221 |
+
manager
|
| 1222 |
+
soil
|
| 1223 |
+
leopard
|
| 1224 |
+
board game
|
| 1225 |
+
ladder
|
| 1226 |
+
stop light
|
| 1227 |
+
island
|
| 1228 |
+
ramp
|
| 1229 |
+
football match
|
| 1230 |
+
icing
|
| 1231 |
+
drill
|
| 1232 |
+
currency
|
| 1233 |
+
summer evening
|
| 1234 |
+
topping
|
| 1235 |
+
pyramid
|
| 1236 |
+
pomegranate
|
| 1237 |
+
cell
|
| 1238 |
+
ivy
|
| 1239 |
+
squad
|
| 1240 |
+
scenery
|
| 1241 |
+
computer
|
| 1242 |
+
locomotive
|
| 1243 |
+
surf
|
| 1244 |
+
mascot
|
| 1245 |
+
dune
|
| 1246 |
+
path
|
| 1247 |
+
duck
|
| 1248 |
+
twilight
|
| 1249 |
+
wire
|
| 1250 |
+
bow tie
|
| 1251 |
+
strike
|
| 1252 |
+
cormorant
|
| 1253 |
+
car wash
|
| 1254 |
+
crane
|
| 1255 |
+
market
|
| 1256 |
+
philosopher
|
| 1257 |
+
alarm clock
|
| 1258 |
+
camera
|
| 1259 |
+
birch
|
| 1260 |
+
greeting card
|
| 1261 |
+
plain
|
| 1262 |
+
clay
|
| 1263 |
+
donut
|
| 1264 |
+
lock
|
| 1265 |
+
moth
|
| 1266 |
+
laboratory
|
| 1267 |
+
fan
|
| 1268 |
+
violin
|
| 1269 |
+
jazz fusion artist
|
| 1270 |
+
mountain biker
|
| 1271 |
+
terrain
|
| 1272 |
+
magazine
|
| 1273 |
+
pickup
|
| 1274 |
+
comedy film
|
| 1275 |
+
smartphone
|
| 1276 |
+
film
|
| 1277 |
+
bed
|
| 1278 |
+
microwave oven
|
| 1279 |
+
tournament
|
| 1280 |
+
lawn
|
| 1281 |
+
car window
|
| 1282 |
+
alligator
|
| 1283 |
+
screen
|
| 1284 |
+
jetty
|
| 1285 |
+
shopping bag
|
| 1286 |
+
landscape view
|
| 1287 |
+
cabinetry
|
| 1288 |
+
friendly match
|
| 1289 |
+
thing
|
| 1290 |
+
petal
|
| 1291 |
+
shopping center
|
| 1292 |
+
transport
|
| 1293 |
+
ballet dancer
|
| 1294 |
+
shoreline
|
| 1295 |
+
princess
|
| 1296 |
+
car seat
|
| 1297 |
+
parking meter
|
| 1298 |
+
green
|
| 1299 |
+
vodka
|
| 1300 |
+
band
|
| 1301 |
+
rock
|
| 1302 |
+
costume
|
| 1303 |
+
warning sign
|
| 1304 |
+
strip
|
| 1305 |
+
plaque
|
| 1306 |
+
wheelchair
|
| 1307 |
+
headband
|
| 1308 |
+
ginger
|
| 1309 |
+
dice
|
| 1310 |
+
media
|
| 1311 |
+
hairdresser
|
| 1312 |
+
press
|
| 1313 |
+
living room
|
| 1314 |
+
stove
|
| 1315 |
+
player
|
| 1316 |
+
cherry
|
| 1317 |
+
workshop
|
| 1318 |
+
carving
|
| 1319 |
+
embroidery
|
| 1320 |
+
doodle
|
| 1321 |
+
adventure
|
| 1322 |
+
rugby player
|
| 1323 |
+
monument
|
| 1324 |
+
brush
|
| 1325 |
+
marker
|
| 1326 |
+
loft
|
| 1327 |
+
postcard
|
| 1328 |
+
collage
|
| 1329 |
+
ball
|
| 1330 |
+
professor
|
| 1331 |
+
dresser
|
| 1332 |
+
gig
|
| 1333 |
+
festival
|
| 1334 |
+
blackbird
|
| 1335 |
+
makeup artist
|
| 1336 |
+
video camera
|
| 1337 |
+
sticker
|
| 1338 |
+
peak
|
| 1339 |
+
wildflower
|
| 1340 |
+
santa hat
|
| 1341 |
+
rodeo
|
| 1342 |
+
wedding photographer
|
| 1343 |
+
guy
|
| 1344 |
+
staff
|
| 1345 |
+
waterfall
|
| 1346 |
+
operation
|
| 1347 |
+
defender
|
| 1348 |
+
falcon
|
| 1349 |
+
haze
|
| 1350 |
+
individual
|
| 1351 |
+
gentleman
|
| 1352 |
+
greyhound
|
| 1353 |
+
rocking chair
|
| 1354 |
+
rice
|
| 1355 |
+
garbage
|
| 1356 |
+
platter
|
| 1357 |
+
chocolate
|
| 1358 |
+
splash
|
| 1359 |
+
business suit
|
| 1360 |
+
cheetah
|
| 1361 |
+
valley
|
| 1362 |
+
maze
|
| 1363 |
+
trampoline
|
| 1364 |
+
garland
|
| 1365 |
+
slalom
|
| 1366 |
+
unicorn
|
| 1367 |
+
tree stump
|
| 1368 |
+
painting
|
| 1369 |
+
romance
|
| 1370 |
+
fight
|
| 1371 |
+
alcohol
|
| 1372 |
+
ghost
|
| 1373 |
+
fondant
|
| 1374 |
+
spa
|
| 1375 |
+
shutter
|
| 1376 |
+
death
|
| 1377 |
+
demonstration
|
| 1378 |
+
cotton
|
| 1379 |
+
pier
|
| 1380 |
+
flea market
|
| 1381 |
+
history
|
| 1382 |
+
savannah
|
| 1383 |
+
fist
|
| 1384 |
+
aisle
|
| 1385 |
+
crew
|
| 1386 |
+
jug
|
| 1387 |
+
pose
|
| 1388 |
+
anchor
|
| 1389 |
+
teapot
|
| 1390 |
+
boat house
|
| 1391 |
+
business team
|
| 1392 |
+
tripod
|
| 1393 |
+
bee
|
| 1394 |
+
pebble
|
| 1395 |
+
mattress
|
| 1396 |
+
canvas
|
| 1397 |
+
hallway
|
| 1398 |
+
campaign
|
| 1399 |
+
pod
|
| 1400 |
+
lake district
|
| 1401 |
+
article
|
| 1402 |
+
white
|
| 1403 |
+
sofa
|
| 1404 |
+
honey
|
| 1405 |
+
marathon
|
| 1406 |
+
pancake
|
| 1407 |
+
tourist attraction
|
| 1408 |
+
wedding gown
|
| 1409 |
+
battle
|
| 1410 |
+
shelving
|
| 1411 |
+
sea
|
| 1412 |
+
sheet music
|
| 1413 |
+
pie
|
| 1414 |
+
yarn
|
| 1415 |
+
construction site
|
| 1416 |
+
flyer
|
| 1417 |
+
tie
|
| 1418 |
+
star
|
| 1419 |
+
lettuce
|
| 1420 |
+
martial artist
|
| 1421 |
+
dart
|
| 1422 |
+
straw
|
| 1423 |
+
reflection
|
| 1424 |
+
conference room
|
| 1425 |
+
temperature
|
| 1426 |
+
rugby
|
| 1427 |
+
mosquito
|
| 1428 |
+
physicist
|
| 1429 |
+
rock climber
|
| 1430 |
+
crash
|
| 1431 |
+
backdrop
|
| 1432 |
+
toilet seat
|
| 1433 |
+
sand castle
|
| 1434 |
+
water park
|
| 1435 |
+
toy car
|
| 1436 |
+
waste
|
| 1437 |
+
luxury
|
| 1438 |
+
hangar
|
| 1439 |
+
rv
|
| 1440 |
+
tree trunk
|
| 1441 |
+
board
|
| 1442 |
+
gold
|
| 1443 |
+
project picture
|
| 1444 |
+
cap
|
| 1445 |
+
cottage
|
| 1446 |
+
relief
|
| 1447 |
+
attire
|
| 1448 |
+
microscope
|
| 1449 |
+
battery
|
| 1450 |
+
roll
|
| 1451 |
+
line
|
| 1452 |
+
parking garage
|
| 1453 |
+
crystal
|
| 1454 |
+
broadcasting
|
| 1455 |
+
brick wall
|
| 1456 |
+
lab
|
| 1457 |
+
flooring
|
| 1458 |
+
meeting
|
| 1459 |
+
3d cg rendering
|
| 1460 |
+
desktop computer
|
| 1461 |
+
cowboy
|
| 1462 |
+
sailing ship
|
| 1463 |
+
junction
|
| 1464 |
+
hairstyle
|
| 1465 |
+
homework
|
| 1466 |
+
profile
|
| 1467 |
+
model
|
| 1468 |
+
flower pot
|
| 1469 |
+
street light
|
| 1470 |
+
salt lake
|
| 1471 |
+
maple
|
| 1472 |
+
space
|
| 1473 |
+
blizzard
|
| 1474 |
+
throw
|
| 1475 |
+
zebras
|
| 1476 |
+
brochure
|
| 1477 |
+
constellation
|
| 1478 |
+
beak
|
| 1479 |
+
kilt
|
| 1480 |
+
pond
|
| 1481 |
+
blue sky
|
| 1482 |
+
sneaker
|
| 1483 |
+
sand dune
|
| 1484 |
+
morning sun
|
| 1485 |
+
almond
|
| 1486 |
+
grill
|
| 1487 |
+
curl
|
| 1488 |
+
basketball girl game
|
| 1489 |
+
chameleon
|
| 1490 |
+
toilet bowl
|
| 1491 |
+
prince
|
| 1492 |
+
keyboard
|
| 1493 |
+
queen
|
| 1494 |
+
computer monitor
|
| 1495 |
+
writing
|
| 1496 |
+
crown
|
| 1497 |
+
basilica
|
| 1498 |
+
kiss
|
| 1499 |
+
house
|
| 1500 |
+
parking
|
| 1501 |
+
football competition
|
| 1502 |
+
shell
|
| 1503 |
+
sport equipment
|
| 1504 |
+
comedy
|
| 1505 |
+
baboon
|
| 1506 |
+
vendor
|
| 1507 |
+
rise building
|
| 1508 |
+
wrap
|
| 1509 |
+
food truck
|
| 1510 |
+
cat bed
|
| 1511 |
+
rickshaw
|
| 1512 |
+
flare
|
| 1513 |
+
teal
|
| 1514 |
+
nectar
|
| 1515 |
+
eclipse
|
| 1516 |
+
vehicle
|
| 1517 |
+
steam locomotive
|
| 1518 |
+
gorge
|
| 1519 |
+
cow
|
| 1520 |
+
christmas card
|
| 1521 |
+
demonstrator
|
| 1522 |
+
memorial
|
| 1523 |
+
towel
|
| 1524 |
+
jewellery
|
| 1525 |
+
train
|
| 1526 |
+
frisbee
|
| 1527 |
+
baseball game
|
| 1528 |
+
fur
|
| 1529 |
+
afternoon sun
|
| 1530 |
+
community
|
| 1531 |
+
sparkler
|
| 1532 |
+
bandage
|
| 1533 |
+
firework
|
| 1534 |
+
dollar
|
| 1535 |
+
pasture
|
| 1536 |
+
video
|
| 1537 |
+
bus
|
| 1538 |
+
tree house
|
| 1539 |
+
seashore
|
| 1540 |
+
field
|
| 1541 |
+
hamburger
|
| 1542 |
+
souvenir
|
| 1543 |
+
hedgehog
|
| 1544 |
+
worm
|
| 1545 |
+
pine cone
|
| 1546 |
+
osprey
|
| 1547 |
+
dinosaur
|
| 1548 |
+
vegetable
|
| 1549 |
+
junk
|
| 1550 |
+
poster
|
| 1551 |
+
army
|
| 1552 |
+
winger
|
| 1553 |
+
bundle
|
| 1554 |
+
stage
|
| 1555 |
+
growth
|
| 1556 |
+
wedding party
|
| 1557 |
+
service
|
| 1558 |
+
blanket
|
| 1559 |
+
ruler
|
| 1560 |
+
eye
|
| 1561 |
+
credit card
|
| 1562 |
+
castle
|
| 1563 |
+
diner
|
| 1564 |
+
hut
|
| 1565 |
+
elk
|
| 1566 |
+
hard rock artist
|
| 1567 |
+
nun
|
| 1568 |
+
dog breed
|
| 1569 |
+
nest
|
| 1570 |
+
drama film
|
| 1571 |
+
number icon
|
| 1572 |
+
water tank
|
| 1573 |
+
giraffe
|
| 1574 |
+
altar
|
| 1575 |
+
pavilion
|
| 1576 |
+
tv personality
|
| 1577 |
+
suv
|
| 1578 |
+
street vendor
|
| 1579 |
+
street sign
|
| 1580 |
+
ditch
|
| 1581 |
+
debris
|
| 1582 |
+
foam
|
| 1583 |
+
takeoff
|
| 1584 |
+
spice
|
| 1585 |
+
mountain lake
|
| 1586 |
+
tea
|
| 1587 |
+
orchestra
|
| 1588 |
+
spacecraft
|
| 1589 |
+
counter
|
| 1590 |
+
abbey
|
| 1591 |
+
mountain
|
| 1592 |
+
hydrangea
|
| 1593 |
+
racer
|
| 1594 |
+
orange tree
|
| 1595 |
+
tide
|
| 1596 |
+
cowboy hat
|
| 1597 |
+
rapid
|
| 1598 |
+
town
|
| 1599 |
+
wild
|
| 1600 |
+
herd
|
| 1601 |
+
vein
|
| 1602 |
+
driveway
|
| 1603 |
+
jar
|
| 1604 |
+
bark
|
| 1605 |
+
illustration
|
| 1606 |
+
horror film
|
| 1607 |
+
corn
|
| 1608 |
+
stroller
|
| 1609 |
+
industry
|
| 1610 |
+
mountain stream
|
| 1611 |
+
gym
|
| 1612 |
+
neckline
|
| 1613 |
+
pan
|
| 1614 |
+
client
|
| 1615 |
+
spectator
|
| 1616 |
+
eggplant
|
| 1617 |
+
camper
|
| 1618 |
+
fawn
|
| 1619 |
+
hoodie
|
| 1620 |
+
meat
|
| 1621 |
+
lemonade
|
| 1622 |
+
food market
|
| 1623 |
+
slum
|
| 1624 |
+
comic book character
|
| 1625 |
+
flower market
|
| 1626 |
+
love
|
| 1627 |
+
palace
|
| 1628 |
+
gun
|
| 1629 |
+
heel
|
| 1630 |
+
shopping street
|
| 1631 |
+
shooting basketball guard
|
| 1632 |
+
family photo
|
| 1633 |
+
rooftop
|
| 1634 |
+
laundry basket
|
| 1635 |
+
airport runway
|
| 1636 |
+
horn
|
| 1637 |
+
face mask
|
| 1638 |
+
flight
|
| 1639 |
+
appetizer
|
| 1640 |
+
violet
|
| 1641 |
+
country lane
|
| 1642 |
+
cement
|
| 1643 |
+
instrument
|
| 1644 |
+
tv actor
|
| 1645 |
+
spark
|
| 1646 |
+
celebrity
|
| 1647 |
+
award
|
| 1648 |
+
country house
|
| 1649 |
+
standing
|
| 1650 |
+
auction
|
| 1651 |
+
date
|
| 1652 |
+
engagement
|
| 1653 |
+
puck
|
| 1654 |
+
advertisement
|
| 1655 |
+
chair
|
| 1656 |
+
zebra
|
| 1657 |
+
driftwood
|
| 1658 |
+
bumblebee
|
| 1659 |
+
maple leaf
|
| 1660 |
+
bonnet
|
| 1661 |
+
orange
|
| 1662 |
+
water tower
|
| 1663 |
+
door
|
| 1664 |
+
singer
|
| 1665 |
+
floor plan
|
| 1666 |
+
discussion
|
| 1667 |
+
theatre
|
| 1668 |
+
pilgrim
|
| 1669 |
+
mug
|
| 1670 |
+
branch
|
| 1671 |
+
window sill
|
| 1672 |
+
baseball pitcher
|
| 1673 |
+
bakery
|
| 1674 |
+
lollipop
|
| 1675 |
+
basketball player
|
| 1676 |
+
toilet paper
|
| 1677 |
+
chalkboard
|
| 1678 |
+
cabin
|
| 1679 |
+
sign
|
| 1680 |
+
night sky
|
| 1681 |
+
cannon
|
| 1682 |
+
fishing net
|
| 1683 |
+
submarine
|
| 1684 |
+
suit
|
| 1685 |
+
fur coat
|
| 1686 |
+
wine bottle
|
| 1687 |
+
folder
|
| 1688 |
+
street art
|
| 1689 |
+
suspension bridge
|
| 1690 |
+
evening sky
|
| 1691 |
+
billboard
|
| 1692 |
+
postage stamp
|
| 1693 |
+
newspaper
|
| 1694 |
+
transportation
|
| 1695 |
+
surgeon
|
| 1696 |
+
light
|
| 1697 |
+
park
|
| 1698 |
+
horizon
|
| 1699 |
+
road
|
| 1700 |
+
sand bar
|
| 1701 |
+
trumpet
|
| 1702 |
+
lounge
|
| 1703 |
+
cloud forest
|
| 1704 |
+
birthday celebration
|
| 1705 |
+
balcony
|
| 1706 |
+
anime
|
| 1707 |
+
beehive
|
| 1708 |
+
umbrella
|
| 1709 |
+
goldfish
|
| 1710 |
+
baseball cap
|
| 1711 |
+
waterhole
|
| 1712 |
+
ceiling
|
| 1713 |
+
carousel
|
| 1714 |
+
backpack
|
| 1715 |
+
plant pot
|
| 1716 |
+
atmosphere
|
| 1717 |
+
sunflower field
|
| 1718 |
+
spire
|
| 1719 |
+
vision
|
| 1720 |
+
woodpecker
|
| 1721 |
+
chip
|
| 1722 |
+
pool table
|
| 1723 |
+
lotus flower
|
| 1724 |
+
cone
|
| 1725 |
+
humpback whale
|
| 1726 |
+
reservoir
|
| 1727 |
+
hunt
|
| 1728 |
+
piano
|
| 1729 |
+
plate
|
| 1730 |
+
dining area
|
| 1731 |
+
luggage
|
| 1732 |
+
skier
|
| 1733 |
+
dance floor
|
| 1734 |
+
crow
|
| 1735 |
+
stair
|
| 1736 |
+
overpass
|
| 1737 |
+
opera house
|
| 1738 |
+
bear
|
| 1739 |
+
jazz artist
|
| 1740 |
+
water
|
| 1741 |
+
vessel
|
| 1742 |
+
cast
|
| 1743 |
+
yard
|
| 1744 |
+
cathedral
|
| 1745 |
+
basketball hoop
|
| 1746 |
+
graveyard
|
| 1747 |
+
sound
|
| 1748 |
+
berry
|
| 1749 |
+
onlooker
|
| 1750 |
+
fauna
|
| 1751 |
+
birch tree
|
| 1752 |
+
retail
|
| 1753 |
+
hill
|
| 1754 |
+
skeleton
|
| 1755 |
+
journalist
|
| 1756 |
+
frost
|
| 1757 |
+
basket
|
tag_list.txt, continued. Entries 1758–3012 complete the object and scene vocabulary, running from nail, dusk, trash, dawn, clover, hen and volcano through heart shape and kayak. Entries 3013–3252 are action and relation tags, from stare, sit with, direct, read and photograph through cook, receive, celebrate and look. Entries 3253–3429 are attribute tags, from classic, bridal, indoor, industrial and teenage through square, ornamental, sandy and thin. The full 3,429-line vocabulary is added as tag2text/data/tag_list.txt.
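Each line of tag_list.txt is a single tag, and the indices in the delete_tag_index list used by inference.py below appear to index this file zero-based (index 2961 lands on "back", 3359 on "one"). A minimal sketch of building an index-to-tag lookup, assuming the file is read from its repository path:

# Sketch: load the tag vocabulary and resolve the indices excluded from captioning.
# Zero-based indexing is an assumption inferred from the delete_tag_index comment in inference.py.
with open("tag2text/data/tag_list.txt", encoding="utf-8") as f:
    tag_list = [line.rstrip("\n") for line in f]

delete_tag_index = [127, 2961, 3351, 3265, 3338, 3355, 3359]
for idx in delete_tag_index:
    print(idx, "->", tag_list[idx])  # tags the model skips when captioning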
tag2text/inference.py
ADDED
@@ -0,0 +1,102 @@
"""
* Tag2Text
* Written by Xinyu Huang
"""
import argparse
import random

import numpy as np
import torch
import torchvision.transforms as transforms
from models.tag2text import tag2text_caption
from PIL import Image

parser = argparse.ArgumentParser(
    description="Tag2Text inference for tagging and captioning"
)
parser.add_argument(
    "--image",
    metavar="DIR",
    help="path to dataset",
    default="images/1641173_2291260800.jpg",
)
parser.add_argument(
    "--pretrained",
    metavar="DIR",
    help="path to pretrained model",
    default="pretrained/tag2text_swin_14m.pth",
)
parser.add_argument(
    "--image-size",
    default=384,
    type=int,
    metavar="N",
    help="input image size (default: 384)",
)
parser.add_argument(
    "--thre", default=0.68, type=float, metavar="N", help="threshold value"
)
parser.add_argument(
    "--specified-tags", default="None", help="User input specified tags"
)


def inference(image, model, input_tag="None"):
    with torch.no_grad():
        caption, tag_predict = model.generate(
            image, tag_input=None, max_length=50, return_tag_predict=True
        )

    if input_tag == "" or input_tag == "none" or input_tag == "None":
        return tag_predict[0], None, caption[0]

    # If user input specified tags:
    else:
        input_tag_list = []
        input_tag_list.append(input_tag.replace(",", " | "))

        with torch.no_grad():
            caption, input_tag = model.generate(
                image, tag_input=input_tag_list, max_length=50, return_tag_predict=True
            )

        return tag_predict[0], input_tag[0], caption[0]


if __name__ == "__main__":
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )
    transform = transforms.Compose(
        [
            transforms.Resize((args.image_size, args.image_size)),
            transforms.ToTensor(),
            normalize,
        ]
    )

    # delete some tags that may disturb captioning
    # 127: "quarter"; 2961: "back", 3351: "two"; 3265: "three"; 3338: "four"; 3355: "five"; 3359: "one"
    delete_tag_index = [127, 2961, 3351, 3265, 3338, 3355, 3359]

    # load model
    model = tag2text_caption(
        pretrained=args.pretrained,
        image_size=args.image_size,
        vit="swin_b",
        delete_tag_index=delete_tag_index,
    )
    model.threshold = args.thre  # threshold for tagging
    model.eval()

    model = model.to(device)
    raw_image = Image.open(args.image).resize((args.image_size, args.image_size))
    image = transform(raw_image).unsqueeze(0).to(device)

    res = inference(image, model, args.specified_tags)
    print("Model Identified Tags: ", res[0])
    print("User Specified Tags: ", res[1])
    print("Image Caption: ", res[2])
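The same helper can be driven programmatically once the checkpoint is loaded. The sketch below mirrors the __main__ block above, assuming inference() is imported from this module; the image path is a placeholder and the other values match the script defaults.

# Sketch: reuse inference() outside the CLI. Paths are placeholders.
import torch
import torchvision.transforms as transforms
from PIL import Image
from models.tag2text import tag2text_caption

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transform = transforms.Compose(
    [
        transforms.Resize((384, 384)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
model = tag2text_caption(
    pretrained="pretrained/tag2text_swin_14m.pth",
    image_size=384,
    vit="swin_b",
    delete_tag_index=[127, 2961, 3351, 3265, 3338, 3355, 3359],
)
model.threshold = 0.68  # tagging threshold, as in the script above
model.eval()
model = model.to(device)

image = transform(Image.open("path/to/image.jpg").convert("RGB")).unsqueeze(0).to(device)
tags, user_tags, caption = inference(image, model, "dog,grass")
print(tags, caption)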
tag2text/models/bert.py
ADDED
@@ -0,0 +1,1157 @@
| 1 |
+
"""
|
| 2 |
+
* Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
* All rights reserved.
|
| 4 |
+
* SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
* By Junnan Li
|
| 7 |
+
* Based on huggingface code base
|
| 8 |
+
* https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
|
| 9 |
+
"""
|
| 10 |
+
import math
|
| 11 |
+
import os
|
| 12 |
+
import warnings
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
from typing import Optional
|
| 15 |
+
from typing import Tuple
|
| 16 |
+
|
| 17 |
+
import torch.nn.functional as F
|
| 18 |
+
import torch.utils.checkpoint
|
| 19 |
+
from torch import device
|
| 20 |
+
from torch import dtype
|
| 21 |
+
from torch import nn
|
| 22 |
+
from torch import Tensor
|
| 23 |
+
from torch.nn import CrossEntropyLoss
|
| 24 |
+
from transformers.activations import ACT2FN
|
| 25 |
+
from transformers.file_utils import (
|
| 26 |
+
ModelOutput,
|
| 27 |
+
)
|
| 28 |
+
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
|
| 29 |
+
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
|
| 30 |
+
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
|
| 31 |
+
from transformers.modeling_outputs import MaskedLMOutput
|
| 32 |
+
from transformers.modeling_outputs import MultipleChoiceModelOutput
|
| 33 |
+
from transformers.modeling_outputs import NextSentencePredictorOutput
|
| 34 |
+
from transformers.modeling_outputs import QuestionAnsweringModelOutput
|
| 35 |
+
from transformers.modeling_outputs import SequenceClassifierOutput
|
| 36 |
+
from transformers.modeling_outputs import TokenClassifierOutput
|
| 37 |
+
from transformers.modeling_utils import apply_chunking_to_forward
|
| 38 |
+
from transformers.modeling_utils import find_pruneable_heads_and_indices
|
| 39 |
+
from transformers.modeling_utils import PreTrainedModel
|
| 40 |
+
from transformers.modeling_utils import prune_linear_layer
|
| 41 |
+
from transformers.models.bert.configuration_bert import BertConfig
|
| 42 |
+
from transformers.utils import logging
|
| 43 |
+
|
| 44 |
+
logger = logging.get_logger(__name__)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class BertEmbeddings_nopos(nn.Module):
|
| 48 |
+
"""Construct the embeddings from word and position embeddings."""
|
| 49 |
+
|
| 50 |
+
def __init__(self, config):
|
| 51 |
+
super().__init__()
|
| 52 |
+
self.word_embeddings = nn.Embedding(
|
| 53 |
+
config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
|
| 54 |
+
)
|
| 55 |
+
# self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
| 56 |
+
|
| 57 |
+
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
|
| 58 |
+
# any TensorFlow checkpoint file
|
| 59 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 60 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 61 |
+
|
| 62 |
+
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
| 63 |
+
# self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
| 64 |
+
# self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
| 65 |
+
|
| 66 |
+
self.config = config
|
| 67 |
+
|
| 68 |
+
def forward(
|
| 69 |
+
self,
|
| 70 |
+
input_ids=None,
|
| 71 |
+
position_ids=None,
|
| 72 |
+
inputs_embeds=None,
|
| 73 |
+
past_key_values_length=0,
|
| 74 |
+
):
|
| 75 |
+
if input_ids is not None:
|
| 76 |
+
input_shape = input_ids.size()
|
| 77 |
+
else:
|
| 78 |
+
input_shape = inputs_embeds.size()[:-1]
|
| 79 |
+
|
| 80 |
+
seq_length = input_shape[1]
|
| 81 |
+
|
| 82 |
+
# if position_ids is None:
|
| 83 |
+
# position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
|
| 84 |
+
|
| 85 |
+
if inputs_embeds is None:
|
| 86 |
+
inputs_embeds = self.word_embeddings(input_ids)
|
| 87 |
+
|
| 88 |
+
embeddings = inputs_embeds
|
| 89 |
+
|
| 90 |
+
# if self.position_embedding_type == "absolute":
|
| 91 |
+
# position_embeddings = self.position_embeddings(position_ids)
|
| 92 |
+
# # print('add position_embeddings!!!!')
|
| 93 |
+
# embeddings += position_embeddings
|
| 94 |
+
embeddings = self.LayerNorm(embeddings)
|
| 95 |
+
embeddings = self.dropout(embeddings)
|
| 96 |
+
return embeddings
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class BertEmbeddings(nn.Module):
|
| 100 |
+
"""Construct the embeddings from word and position embeddings."""
|
| 101 |
+
|
| 102 |
+
def __init__(self, config):
|
| 103 |
+
super().__init__()
|
| 104 |
+
self.word_embeddings = nn.Embedding(
|
| 105 |
+
config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
|
| 106 |
+
)
|
| 107 |
+
self.position_embeddings = nn.Embedding(
|
| 108 |
+
config.max_position_embeddings, config.hidden_size
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
|
| 112 |
+
# any TensorFlow checkpoint file
|
| 113 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 114 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 115 |
+
|
| 116 |
+
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
| 117 |
+
self.register_buffer(
|
| 118 |
+
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))
|
| 119 |
+
)
|
| 120 |
+
self.position_embedding_type = getattr(
|
| 121 |
+
config, "position_embedding_type", "absolute"
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
self.config = config
|
| 125 |
+
|
| 126 |
+
def forward(
|
| 127 |
+
self,
|
| 128 |
+
input_ids=None,
|
| 129 |
+
position_ids=None,
|
| 130 |
+
inputs_embeds=None,
|
| 131 |
+
past_key_values_length=0,
|
| 132 |
+
):
|
| 133 |
+
if input_ids is not None:
|
| 134 |
+
input_shape = input_ids.size()
|
| 135 |
+
else:
|
| 136 |
+
input_shape = inputs_embeds.size()[:-1]
|
| 137 |
+
|
| 138 |
+
seq_length = input_shape[1]
|
| 139 |
+
|
| 140 |
+
if position_ids is None:
|
| 141 |
+
position_ids = self.position_ids[
|
| 142 |
+
:, past_key_values_length : seq_length + past_key_values_length
|
| 143 |
+
]
|
| 144 |
+
|
| 145 |
+
if inputs_embeds is None:
|
| 146 |
+
inputs_embeds = self.word_embeddings(input_ids)
|
| 147 |
+
|
| 148 |
+
embeddings = inputs_embeds
|
| 149 |
+
|
| 150 |
+
if self.position_embedding_type == "absolute":
|
| 151 |
+
position_embeddings = self.position_embeddings(position_ids)
|
| 152 |
+
# print('add position_embeddings!!!!')
|
| 153 |
+
embeddings += position_embeddings
|
| 154 |
+
embeddings = self.LayerNorm(embeddings)
|
| 155 |
+
embeddings = self.dropout(embeddings)
|
| 156 |
+
return embeddings
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class BertSelfAttention(nn.Module):
|
| 160 |
+
def __init__(self, config, is_cross_attention):
|
| 161 |
+
super().__init__()
|
| 162 |
+
self.config = config
|
| 163 |
+
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
|
| 164 |
+
config, "embedding_size"
|
| 165 |
+
):
|
| 166 |
+
raise ValueError(
|
| 167 |
+
"The hidden size (%d) is not a multiple of the number of attention "
|
| 168 |
+
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
self.num_attention_heads = config.num_attention_heads
|
| 172 |
+
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
|
| 173 |
+
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
| 174 |
+
|
| 175 |
+
self.query = nn.Linear(config.hidden_size, self.all_head_size)
|
| 176 |
+
if is_cross_attention:
|
| 177 |
+
self.key = nn.Linear(config.encoder_width, self.all_head_size)
|
| 178 |
+
self.value = nn.Linear(config.encoder_width, self.all_head_size)
|
| 179 |
+
else:
|
| 180 |
+
self.key = nn.Linear(config.hidden_size, self.all_head_size)
|
| 181 |
+
self.value = nn.Linear(config.hidden_size, self.all_head_size)
|
| 182 |
+
|
| 183 |
+
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
| 184 |
+
self.position_embedding_type = getattr(
|
| 185 |
+
config, "position_embedding_type", "absolute"
|
| 186 |
+
)
|
| 187 |
+
if (
|
| 188 |
+
self.position_embedding_type == "relative_key"
|
| 189 |
+
or self.position_embedding_type == "relative_key_query"
|
| 190 |
+
):
|
| 191 |
+
self.max_position_embeddings = config.max_position_embeddings
|
| 192 |
+
self.distance_embedding = nn.Embedding(
|
| 193 |
+
2 * config.max_position_embeddings - 1, self.attention_head_size
|
| 194 |
+
)
|
| 195 |
+
self.save_attention = False
|
| 196 |
+
|
| 197 |
+
def save_attn_gradients(self, attn_gradients):
|
| 198 |
+
self.attn_gradients = attn_gradients
|
| 199 |
+
|
| 200 |
+
def get_attn_gradients(self):
|
| 201 |
+
return self.attn_gradients
|
| 202 |
+
|
| 203 |
+
def save_attention_map(self, attention_map):
|
| 204 |
+
self.attention_map = attention_map
|
| 205 |
+
|
| 206 |
+
def get_attention_map(self):
|
| 207 |
+
return self.attention_map
|
| 208 |
+
|
| 209 |
+
def transpose_for_scores(self, x):
|
| 210 |
+
new_x_shape = x.size()[:-1] + (
|
| 211 |
+
self.num_attention_heads,
|
| 212 |
+
self.attention_head_size,
|
| 213 |
+
)
|
| 214 |
+
x = x.view(*new_x_shape)
|
| 215 |
+
return x.permute(0, 2, 1, 3)
|
| 216 |
+
|
| 217 |
+
def forward(
|
| 218 |
+
self,
|
| 219 |
+
hidden_states,
|
| 220 |
+
attention_mask=None,
|
| 221 |
+
head_mask=None,
|
| 222 |
+
encoder_hidden_states=None,
|
| 223 |
+
encoder_attention_mask=None,
|
| 224 |
+
past_key_value=None,
|
| 225 |
+
output_attentions=False,
|
| 226 |
+
):
|
| 227 |
+
mixed_query_layer = self.query(hidden_states)
|
| 228 |
+
|
| 229 |
+
# If this is instantiated as a cross-attention module, the keys
|
| 230 |
+
# and values come from an encoder; the attention mask needs to be
|
| 231 |
+
# such that the encoder's padding tokens are not attended to.
|
| 232 |
+
is_cross_attention = encoder_hidden_states is not None
|
| 233 |
+
|
| 234 |
+
if is_cross_attention:
|
| 235 |
+
# print(self.key.weight.shape)
|
| 236 |
+
key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
|
| 237 |
+
value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
|
| 238 |
+
attention_mask = encoder_attention_mask
|
| 239 |
+
elif past_key_value is not None:
|
| 240 |
+
key_layer = self.transpose_for_scores(self.key(hidden_states))
|
| 241 |
+
value_layer = self.transpose_for_scores(self.value(hidden_states))
|
| 242 |
+
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
|
| 243 |
+
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
|
| 244 |
+
else:
|
| 245 |
+
key_layer = self.transpose_for_scores(self.key(hidden_states))
|
| 246 |
+
value_layer = self.transpose_for_scores(self.value(hidden_states))
|
| 247 |
+
|
| 248 |
+
query_layer = self.transpose_for_scores(mixed_query_layer)
|
| 249 |
+
|
| 250 |
+
past_key_value = (key_layer, value_layer)
|
| 251 |
+
|
| 252 |
+
# compatible with higher versions of transformers
|
| 253 |
+
if key_layer.shape[0] > query_layer.shape[0]:
|
| 254 |
+
key_layer = key_layer[: query_layer.shape[0], :, :, :]
|
| 255 |
+
attention_mask = attention_mask[: query_layer.shape[0], :, :]
|
| 256 |
+
value_layer = value_layer[: query_layer.shape[0], :, :, :]
|
| 257 |
+
|
| 258 |
+
# Take the dot product between "query" and "key" to get the raw attention scores.
|
| 259 |
+
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
| 260 |
+
|
| 261 |
+
if (
|
| 262 |
+
self.position_embedding_type == "relative_key"
|
| 263 |
+
or self.position_embedding_type == "relative_key_query"
|
| 264 |
+
):
|
| 265 |
+
seq_length = hidden_states.size()[1]
|
| 266 |
+
position_ids_l = torch.arange(
|
| 267 |
+
seq_length, dtype=torch.long, device=hidden_states.device
|
| 268 |
+
).view(-1, 1)
|
| 269 |
+
position_ids_r = torch.arange(
|
| 270 |
+
seq_length, dtype=torch.long, device=hidden_states.device
|
| 271 |
+
).view(1, -1)
|
| 272 |
+
distance = position_ids_l - position_ids_r
|
| 273 |
+
positional_embedding = self.distance_embedding(
|
| 274 |
+
distance + self.max_position_embeddings - 1
|
| 275 |
+
)
|
| 276 |
+
positional_embedding = positional_embedding.to(
|
| 277 |
+
dtype=query_layer.dtype
|
| 278 |
+
) # fp16 compatibility
|
| 279 |
+
|
| 280 |
+
if self.position_embedding_type == "relative_key":
|
| 281 |
+
relative_position_scores = torch.einsum(
|
| 282 |
+
"bhld,lrd->bhlr", query_layer, positional_embedding
|
| 283 |
+
)
|
| 284 |
+
attention_scores = attention_scores + relative_position_scores
|
| 285 |
+
elif self.position_embedding_type == "relative_key_query":
|
| 286 |
+
relative_position_scores_query = torch.einsum(
|
| 287 |
+
"bhld,lrd->bhlr", query_layer, positional_embedding
|
| 288 |
+
)
|
| 289 |
+
relative_position_scores_key = torch.einsum(
|
| 290 |
+
"bhrd,lrd->bhlr", key_layer, positional_embedding
|
| 291 |
+
)
|
| 292 |
+
attention_scores = (
|
| 293 |
+
attention_scores
|
| 294 |
+
+ relative_position_scores_query
|
| 295 |
+
+ relative_position_scores_key
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
| 299 |
+
if attention_mask is not None:
|
| 300 |
+
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
| 301 |
+
attention_scores = attention_scores + attention_mask
|
| 302 |
+
|
| 303 |
+
# Normalize the attention scores to probabilities.
|
| 304 |
+
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
| 305 |
+
|
| 306 |
+
if is_cross_attention and self.save_attention:
|
| 307 |
+
self.save_attention_map(attention_probs)
|
| 308 |
+
attention_probs.register_hook(self.save_attn_gradients)
|
| 309 |
+
|
| 310 |
+
# This is actually dropping out entire tokens to attend to, which might
|
| 311 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
| 312 |
+
attention_probs_dropped = self.dropout(attention_probs)
|
| 313 |
+
|
| 314 |
+
# Mask heads if we want to
|
| 315 |
+
if head_mask is not None:
|
| 316 |
+
attention_probs_dropped = attention_probs_dropped * head_mask
|
| 317 |
+
|
| 318 |
+
context_layer = torch.matmul(attention_probs_dropped, value_layer)
|
| 319 |
+
|
| 320 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
| 321 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
| 322 |
+
context_layer = context_layer.view(*new_context_layer_shape)
|
| 323 |
+
|
| 324 |
+
outputs = (
|
| 325 |
+
(context_layer, attention_probs) if output_attentions else (context_layer,)
|
| 326 |
+
)
|
| 327 |
+
|
| 328 |
+
outputs = outputs + (past_key_value,)
|
| 329 |
+
return outputs
|
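For the `relative_key` / `relative_key_query` branch above, the einsum simply contracts the head dimension of the queries (or keys) with a (seq_len, seq_len, head_dim) table of distance embeddings. A shape-only sketch, standalone and purely illustrative (the sizes are made up):

import torch

B, num_heads, seq_len, head_dim = 2, 12, 5, 64
query_layer = torch.randn(B, num_heads, seq_len, head_dim)
positional_embedding = torch.randn(seq_len, seq_len, head_dim)   # one embedding per (query pos, key pos) distance
relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
print(relative_position_scores.shape)   # torch.Size([2, 12, 5, 5]) -- added onto the raw attention scores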
| 330 |
+
|
| 331 |
+
|
| 332 |
+
class BertSelfOutput(nn.Module):
|
| 333 |
+
def __init__(self, config):
|
| 334 |
+
super().__init__()
|
| 335 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
| 336 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 337 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 338 |
+
|
| 339 |
+
def forward(self, hidden_states, input_tensor):
|
| 340 |
+
hidden_states = self.dense(hidden_states)
|
| 341 |
+
hidden_states = self.dropout(hidden_states)
|
| 342 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
| 343 |
+
return hidden_states
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
class BertAttention(nn.Module):
|
| 347 |
+
def __init__(self, config, is_cross_attention=False):
|
| 348 |
+
super().__init__()
|
| 349 |
+
self.self = BertSelfAttention(config, is_cross_attention)
|
| 350 |
+
self.output = BertSelfOutput(config)
|
| 351 |
+
self.pruned_heads = set()
|
| 352 |
+
|
| 353 |
+
def prune_heads(self, heads):
|
| 354 |
+
if len(heads) == 0:
|
| 355 |
+
return
|
| 356 |
+
heads, index = find_pruneable_heads_and_indices(
|
| 357 |
+
heads,
|
| 358 |
+
self.self.num_attention_heads,
|
| 359 |
+
self.self.attention_head_size,
|
| 360 |
+
self.pruned_heads,
|
| 361 |
+
)
|
| 362 |
+
|
| 363 |
+
# Prune linear layers
|
| 364 |
+
self.self.query = prune_linear_layer(self.self.query, index)
|
| 365 |
+
self.self.key = prune_linear_layer(self.self.key, index)
|
| 366 |
+
self.self.value = prune_linear_layer(self.self.value, index)
|
| 367 |
+
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
|
| 368 |
+
|
| 369 |
+
# Update hyper params and store pruned heads
|
| 370 |
+
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
|
| 371 |
+
self.self.all_head_size = (
|
| 372 |
+
self.self.attention_head_size * self.self.num_attention_heads
|
| 373 |
+
)
|
| 374 |
+
self.pruned_heads = self.pruned_heads.union(heads)
|
| 375 |
+
|
| 376 |
+
def forward(
|
| 377 |
+
self,
|
| 378 |
+
hidden_states,
|
| 379 |
+
attention_mask=None,
|
| 380 |
+
head_mask=None,
|
| 381 |
+
encoder_hidden_states=None,
|
| 382 |
+
encoder_attention_mask=None,
|
| 383 |
+
past_key_value=None,
|
| 384 |
+
output_attentions=False,
|
| 385 |
+
):
|
| 386 |
+
self_outputs = self.self(
|
| 387 |
+
hidden_states,
|
| 388 |
+
attention_mask,
|
| 389 |
+
head_mask,
|
| 390 |
+
encoder_hidden_states,
|
| 391 |
+
encoder_attention_mask,
|
| 392 |
+
past_key_value,
|
| 393 |
+
output_attentions,
|
| 394 |
+
)
|
| 395 |
+
attention_output = self.output(self_outputs[0], hidden_states)
|
| 396 |
+
outputs = (attention_output,) + self_outputs[
|
| 397 |
+
1:
|
| 398 |
+
] # add attentions if we output them
|
| 399 |
+
return outputs
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
class BertIntermediate(nn.Module):
|
| 403 |
+
def __init__(self, config):
|
| 404 |
+
super().__init__()
|
| 405 |
+
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
| 406 |
+
if isinstance(config.hidden_act, str):
|
| 407 |
+
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
| 408 |
+
else:
|
| 409 |
+
self.intermediate_act_fn = config.hidden_act
|
| 410 |
+
|
| 411 |
+
def forward(self, hidden_states):
|
| 412 |
+
hidden_states = self.dense(hidden_states)
|
| 413 |
+
hidden_states = self.intermediate_act_fn(hidden_states)
|
| 414 |
+
return hidden_states
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
class BertOutput(nn.Module):
|
| 418 |
+
def __init__(self, config):
|
| 419 |
+
super().__init__()
|
| 420 |
+
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
| 421 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 422 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 423 |
+
|
| 424 |
+
def forward(self, hidden_states, input_tensor):
|
| 425 |
+
hidden_states = self.dense(hidden_states)
|
| 426 |
+
hidden_states = self.dropout(hidden_states)
|
| 427 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
| 428 |
+
return hidden_states
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
class BertLayer(nn.Module):
|
| 432 |
+
def __init__(self, config, layer_num):
|
| 433 |
+
super().__init__()
|
| 434 |
+
self.config = config
|
| 435 |
+
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
| 436 |
+
self.seq_len_dim = 1
|
| 437 |
+
self.attention = BertAttention(config)
|
| 438 |
+
self.layer_num = layer_num
|
| 439 |
+
if self.config.add_cross_attention:
|
| 440 |
+
self.crossattention = BertAttention(
|
| 441 |
+
config, is_cross_attention=self.config.add_cross_attention
|
| 442 |
+
)
|
| 443 |
+
self.intermediate = BertIntermediate(config)
|
| 444 |
+
self.output = BertOutput(config)
|
| 445 |
+
|
| 446 |
+
def forward(
|
| 447 |
+
self,
|
| 448 |
+
hidden_states,
|
| 449 |
+
attention_mask=None,
|
| 450 |
+
head_mask=None,
|
| 451 |
+
encoder_hidden_states=None,
|
| 452 |
+
encoder_attention_mask=None,
|
| 453 |
+
past_key_value=None,
|
| 454 |
+
output_attentions=False,
|
| 455 |
+
mode=None,
|
| 456 |
+
):
|
| 457 |
+
if mode == "tagging":
|
| 458 |
+
assert (
|
| 459 |
+
encoder_hidden_states is not None
|
| 460 |
+
), "encoder_hidden_states must be given for cross-attention layers"
|
| 461 |
+
|
| 462 |
+
cross_attention_outputs = self.crossattention(
|
| 463 |
+
hidden_states,
|
| 464 |
+
attention_mask,
|
| 465 |
+
head_mask,
|
| 466 |
+
encoder_hidden_states,
|
| 467 |
+
encoder_attention_mask,
|
| 468 |
+
output_attentions=output_attentions,
|
| 469 |
+
)
|
| 470 |
+
attention_output = cross_attention_outputs[0]
|
| 471 |
+
outputs = cross_attention_outputs[
|
| 472 |
+
1:-1
|
| 473 |
+
] # add cross attentions if we output attention weights
|
| 474 |
+
|
| 475 |
+
present_key_value = cross_attention_outputs[-1]
|
| 476 |
+
|
| 477 |
+
else:
|
| 478 |
+
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
|
| 479 |
+
self_attn_past_key_value = (
|
| 480 |
+
past_key_value[:2] if past_key_value is not None else None
|
| 481 |
+
)
|
| 482 |
+
self_attention_outputs = self.attention(
|
| 483 |
+
hidden_states,
|
| 484 |
+
attention_mask,
|
| 485 |
+
head_mask,
|
| 486 |
+
output_attentions=output_attentions,
|
| 487 |
+
past_key_value=self_attn_past_key_value,
|
| 488 |
+
)
|
| 489 |
+
attention_output = self_attention_outputs[0]
|
| 490 |
+
|
| 491 |
+
outputs = self_attention_outputs[1:-1]
|
| 492 |
+
present_key_value = self_attention_outputs[-1]
|
| 493 |
+
|
| 494 |
+
if mode == "multimodal":
|
| 495 |
+
assert (
|
| 496 |
+
encoder_hidden_states is not None
|
| 497 |
+
), "encoder_hidden_states must be given for cross-attention layers"
|
| 498 |
+
|
| 499 |
+
cross_attention_outputs = self.crossattention(
|
| 500 |
+
attention_output,
|
| 501 |
+
attention_mask,
|
| 502 |
+
head_mask,
|
| 503 |
+
encoder_hidden_states,
|
| 504 |
+
encoder_attention_mask,
|
| 505 |
+
output_attentions=output_attentions,
|
| 506 |
+
)
|
| 507 |
+
attention_output = cross_attention_outputs[0]
|
| 508 |
+
outputs = (
|
| 509 |
+
outputs + cross_attention_outputs[1:-1]
|
| 510 |
+
) # add cross attentions if we output attention weights
|
| 511 |
+
layer_output = apply_chunking_to_forward(
|
| 512 |
+
self.feed_forward_chunk,
|
| 513 |
+
self.chunk_size_feed_forward,
|
| 514 |
+
self.seq_len_dim,
|
| 515 |
+
attention_output,
|
| 516 |
+
)
|
| 517 |
+
outputs = (layer_output,) + outputs
|
| 518 |
+
|
| 519 |
+
outputs = outputs + (present_key_value,)
|
| 520 |
+
|
| 521 |
+
return outputs
|
| 522 |
+
|
| 523 |
+
def feed_forward_chunk(self, attention_output):
|
| 524 |
+
intermediate_output = self.intermediate(attention_output)
|
| 525 |
+
layer_output = self.output(intermediate_output, attention_output)
|
| 526 |
+
return layer_output
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
class BertEncoder(nn.Module):
|
| 530 |
+
def __init__(self, config):
|
| 531 |
+
super().__init__()
|
| 532 |
+
self.config = config
|
| 533 |
+
self.layer = nn.ModuleList(
|
| 534 |
+
[BertLayer(config, i) for i in range(config.num_hidden_layers)]
|
| 535 |
+
)
|
| 536 |
+
self.gradient_checkpointing = False
|
| 537 |
+
|
| 538 |
+
def forward(
|
| 539 |
+
self,
|
| 540 |
+
hidden_states,
|
| 541 |
+
attention_mask=None,
|
| 542 |
+
head_mask=None,
|
| 543 |
+
encoder_hidden_states=None,
|
| 544 |
+
encoder_attention_mask=None,
|
| 545 |
+
past_key_values=None,
|
| 546 |
+
use_cache=None,
|
| 547 |
+
output_attentions=False,
|
| 548 |
+
output_hidden_states=False,
|
| 549 |
+
return_dict=True,
|
| 550 |
+
mode="multimodal",
|
| 551 |
+
):
|
| 552 |
+
all_hidden_states = () if output_hidden_states else None
|
| 553 |
+
all_self_attentions = () if output_attentions else None
|
| 554 |
+
all_cross_attentions = (
|
| 555 |
+
() if output_attentions and self.config.add_cross_attention else None
|
| 556 |
+
)
|
| 557 |
+
|
| 558 |
+
next_decoder_cache = () if use_cache else None
|
| 559 |
+
|
| 560 |
+
for i in range(self.config.num_hidden_layers):
|
| 561 |
+
layer_module = self.layer[i]
|
| 562 |
+
if output_hidden_states:
|
| 563 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 564 |
+
|
| 565 |
+
layer_head_mask = head_mask[i] if head_mask is not None else None
|
| 566 |
+
past_key_value = past_key_values[i] if past_key_values is not None else None
|
| 567 |
+
|
| 568 |
+
if self.gradient_checkpointing and self.training:
|
| 569 |
+
if use_cache:
|
| 570 |
+
logger.warn(
|
| 571 |
+
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
| 572 |
+
)
|
| 573 |
+
use_cache = False
|
| 574 |
+
|
| 575 |
+
def create_custom_forward(module):
|
| 576 |
+
def custom_forward(*inputs):
|
| 577 |
+
return module(*inputs, past_key_value, output_attentions)
|
| 578 |
+
|
| 579 |
+
return custom_forward
|
| 580 |
+
|
| 581 |
+
layer_outputs = torch.utils.checkpoint.checkpoint(
|
| 582 |
+
create_custom_forward(layer_module),
|
| 583 |
+
hidden_states,
|
| 584 |
+
attention_mask,
|
| 585 |
+
layer_head_mask,
|
| 586 |
+
encoder_hidden_states,
|
| 587 |
+
encoder_attention_mask,
|
| 588 |
+
mode=mode,
|
| 589 |
+
)
|
| 590 |
+
else:
|
| 591 |
+
layer_outputs = layer_module(
|
| 592 |
+
hidden_states,
|
| 593 |
+
attention_mask,
|
| 594 |
+
layer_head_mask,
|
| 595 |
+
encoder_hidden_states,
|
| 596 |
+
encoder_attention_mask,
|
| 597 |
+
past_key_value,
|
| 598 |
+
output_attentions,
|
| 599 |
+
mode=mode,
|
| 600 |
+
)
|
| 601 |
+
|
| 602 |
+
hidden_states = layer_outputs[0]
|
| 603 |
+
if use_cache:
|
| 604 |
+
next_decoder_cache += (layer_outputs[-1],)
|
| 605 |
+
if output_attentions:
|
| 606 |
+
all_self_attentions = all_self_attentions + (layer_outputs[1],)
|
| 607 |
+
|
| 608 |
+
if output_hidden_states:
|
| 609 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 610 |
+
|
| 611 |
+
if not return_dict:
|
| 612 |
+
return tuple(
|
| 613 |
+
v
|
| 614 |
+
for v in [
|
| 615 |
+
hidden_states,
|
| 616 |
+
next_decoder_cache,
|
| 617 |
+
all_hidden_states,
|
| 618 |
+
all_self_attentions,
|
| 619 |
+
all_cross_attentions,
|
| 620 |
+
]
|
| 621 |
+
if v is not None
|
| 622 |
+
)
|
| 623 |
+
return BaseModelOutputWithPastAndCrossAttentions(
|
| 624 |
+
last_hidden_state=hidden_states,
|
| 625 |
+
past_key_values=next_decoder_cache,
|
| 626 |
+
hidden_states=all_hidden_states,
|
| 627 |
+
attentions=all_self_attentions,
|
| 628 |
+
cross_attentions=all_cross_attentions,
|
| 629 |
+
)
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
class BertPooler(nn.Module):
|
| 633 |
+
def __init__(self, config):
|
| 634 |
+
super().__init__()
|
| 635 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
| 636 |
+
self.activation = nn.Tanh()
|
| 637 |
+
|
| 638 |
+
def forward(self, hidden_states):
|
| 639 |
+
# We "pool" the model by simply taking the hidden state corresponding
|
| 640 |
+
# to the first token.
|
| 641 |
+
first_token_tensor = hidden_states[:, 0]
|
| 642 |
+
pooled_output = self.dense(first_token_tensor)
|
| 643 |
+
pooled_output = self.activation(pooled_output)
|
| 644 |
+
return pooled_output
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
class BertPredictionHeadTransform(nn.Module):
|
| 648 |
+
def __init__(self, config):
|
| 649 |
+
super().__init__()
|
| 650 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
| 651 |
+
if isinstance(config.hidden_act, str):
|
| 652 |
+
self.transform_act_fn = ACT2FN[config.hidden_act]
|
| 653 |
+
else:
|
| 654 |
+
self.transform_act_fn = config.hidden_act
|
| 655 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 656 |
+
|
| 657 |
+
def forward(self, hidden_states):
|
| 658 |
+
hidden_states = self.dense(hidden_states)
|
| 659 |
+
hidden_states = self.transform_act_fn(hidden_states)
|
| 660 |
+
hidden_states = self.LayerNorm(hidden_states)
|
| 661 |
+
return hidden_states
|
| 662 |
+
|
| 663 |
+
|
| 664 |
+
class BertLMPredictionHead(nn.Module):
|
| 665 |
+
def __init__(self, config):
|
| 666 |
+
super().__init__()
|
| 667 |
+
self.transform = BertPredictionHeadTransform(config)
|
| 668 |
+
|
| 669 |
+
# The output weights are the same as the input embeddings, but there is
|
| 670 |
+
# an output-only bias for each token.
|
| 671 |
+
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
| 672 |
+
|
| 673 |
+
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
| 674 |
+
|
| 675 |
+
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
| 676 |
+
self.decoder.bias = self.bias
|
| 677 |
+
|
| 678 |
+
def forward(self, hidden_states):
|
| 679 |
+
hidden_states = self.transform(hidden_states)
|
| 680 |
+
hidden_states = self.decoder(hidden_states)
|
| 681 |
+
return hidden_states
|
| 682 |
+
|
| 683 |
+
|
| 684 |
+
class BertOnlyMLMHead(nn.Module):
|
| 685 |
+
def __init__(self, config):
|
| 686 |
+
super().__init__()
|
| 687 |
+
self.predictions = BertLMPredictionHead(config)
|
| 688 |
+
|
| 689 |
+
def forward(self, sequence_output):
|
| 690 |
+
prediction_scores = self.predictions(sequence_output)
|
| 691 |
+
return prediction_scores
|
| 692 |
+
|
| 693 |
+
|
| 694 |
+
class BertPreTrainedModel(PreTrainedModel):
|
| 695 |
+
"""An abstract class to handle weights initialization and a simple interface for downloading and loading
|
| 696 |
+
pretrained models."""
|
| 697 |
+
|
| 698 |
+
config_class = BertConfig
|
| 699 |
+
base_model_prefix = "bert"
|
| 700 |
+
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
| 701 |
+
|
| 702 |
+
def _init_weights(self, module):
|
| 703 |
+
"""Initialize the weights."""
|
| 704 |
+
if isinstance(module, (nn.Linear, nn.Embedding)):
|
| 705 |
+
# Slightly different from the TF version which uses truncated_normal for initialization
|
| 706 |
+
# cf https://github.com/pytorch/pytorch/pull/5617
|
| 707 |
+
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
| 708 |
+
elif isinstance(module, nn.LayerNorm):
|
| 709 |
+
module.bias.data.zero_()
|
| 710 |
+
module.weight.data.fill_(1.0)
|
| 711 |
+
if isinstance(module, nn.Linear) and module.bias is not None:
|
| 712 |
+
module.bias.data.zero_()
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
class BertModel(BertPreTrainedModel):
|
| 716 |
+
"""The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
|
| 717 |
+
cross-attention is added between the self-attention layers, following the architecture described in `Attention
|
| 718 |
+
is all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob
|
| 719 |
+
Uszkoreit, Llion Jones, Aidan N.
|
| 720 |
+
|
| 721 |
+
Gomez, Lukasz Kaiser and Illia Polosukhin. To be used as a decoder, the model needs to be initialized with the :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an
|
| 722 |
+
:obj:`encoder_hidden_states` is then expected as an input to the forward pass.
|
| 723 |
+
"""
|
| 724 |
+
|
| 725 |
+
def __init__(self, config, add_pooling_layer=True):
|
| 726 |
+
super().__init__(config)
|
| 727 |
+
self.config = config
|
| 728 |
+
|
| 729 |
+
self.embeddings = BertEmbeddings(config)
|
| 730 |
+
|
| 731 |
+
self.encoder = BertEncoder(config)
|
| 732 |
+
|
| 733 |
+
self.pooler = BertPooler(config) if add_pooling_layer else None
|
| 734 |
+
|
| 735 |
+
self.init_weights()
|
| 736 |
+
|
| 737 |
+
def get_input_embeddings(self):
|
| 738 |
+
return self.embeddings.word_embeddings
|
| 739 |
+
|
| 740 |
+
def set_input_embeddings(self, value):
|
| 741 |
+
self.embeddings.word_embeddings = value
|
| 742 |
+
|
| 743 |
+
def _prune_heads(self, heads_to_prune):
|
| 744 |
+
"""Prunes heads of the model.
|
| 745 |
+
|
| 746 |
+
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
|
| 747 |
+
class PreTrainedModel
|
| 748 |
+
"""
|
| 749 |
+
for layer, heads in heads_to_prune.items():
|
| 750 |
+
self.encoder.layer[layer].attention.prune_heads(heads)
|
| 751 |
+
|
| 752 |
+
def get_extended_attention_mask(
|
| 753 |
+
self,
|
| 754 |
+
attention_mask: Tensor,
|
| 755 |
+
input_shape: Tuple[int],
|
| 756 |
+
device: device,
|
| 757 |
+
is_decoder: bool,
|
| 758 |
+
) -> Tensor:
|
| 759 |
+
"""Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
|
| 760 |
+
|
| 761 |
+
Arguments:
|
| 762 |
+
attention_mask (:obj:`torch.Tensor`):
|
| 763 |
+
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
|
| 764 |
+
input_shape (:obj:`Tuple[int]`):
|
| 765 |
+
The shape of the input to the model.
|
| 766 |
+
device: (:obj:`torch.device`):
|
| 767 |
+
The device of the input to the model.
|
| 768 |
+
|
| 769 |
+
Returns:
|
| 770 |
+
:obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
|
| 771 |
+
"""
|
| 772 |
+
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
| 773 |
+
# ourselves in which case we just need to make it broadcastable to all heads.
|
| 774 |
+
if attention_mask.dim() == 3:
|
| 775 |
+
extended_attention_mask = attention_mask[:, None, :, :]
|
| 776 |
+
elif attention_mask.dim() == 2:
|
| 777 |
+
# Provided a padding mask of dimensions [batch_size, seq_length]
|
| 778 |
+
# - if the model is a decoder, apply a causal mask in addition to the padding mask
|
| 779 |
+
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
| 780 |
+
if is_decoder:
|
| 781 |
+
batch_size, seq_length = input_shape
|
| 782 |
+
|
| 783 |
+
seq_ids = torch.arange(seq_length, device=device)
|
| 784 |
+
causal_mask = (
|
| 785 |
+
seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
|
| 786 |
+
<= seq_ids[None, :, None]
|
| 787 |
+
)
|
| 788 |
+
# in case past_key_values are used we need to add a prefix ones mask to the causal mask
|
| 789 |
+
# causal and attention masks must have same type with pytorch version < 1.3
|
| 790 |
+
causal_mask = causal_mask.to(attention_mask.dtype)
|
| 791 |
+
|
| 792 |
+
if causal_mask.shape[1] < attention_mask.shape[1]:
|
| 793 |
+
prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
|
| 794 |
+
causal_mask = torch.cat(
|
| 795 |
+
[
|
| 796 |
+
torch.ones(
|
| 797 |
+
(batch_size, seq_length, prefix_seq_len),
|
| 798 |
+
device=device,
|
| 799 |
+
dtype=causal_mask.dtype,
|
| 800 |
+
),
|
| 801 |
+
causal_mask,
|
| 802 |
+
],
|
| 803 |
+
axis=-1,
|
| 804 |
+
)
|
| 805 |
+
|
| 806 |
+
extended_attention_mask = (
|
| 807 |
+
causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
|
| 808 |
+
)
|
| 809 |
+
else:
|
| 810 |
+
extended_attention_mask = attention_mask[:, None, None, :]
|
| 811 |
+
else:
|
| 812 |
+
raise ValueError(
|
| 813 |
+
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
|
| 814 |
+
input_shape, attention_mask.shape
|
| 815 |
+
)
|
| 816 |
+
)
|
| 817 |
+
|
| 818 |
+
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
| 819 |
+
# masked positions, this operation will create a tensor which is 0.0 for
|
| 820 |
+
# positions we want to attend and -10000.0 for masked positions.
|
| 821 |
+
# Since we are adding it to the raw scores before the softmax, this is
|
| 822 |
+
# effectively the same as removing these entirely.
|
| 823 |
+
extended_attention_mask = extended_attention_mask.to(
|
| 824 |
+
dtype=self.dtype
|
| 825 |
+
) # fp16 compatibility
|
| 826 |
+
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
|
| 827 |
+
return extended_attention_mask
|
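A minimal sketch of what this final conversion does, assuming a plain 2-D padding mask and no causal masking (illustrative only, not part of the committed file):

import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])           # one sequence, last token is padding
extended = attention_mask[:, None, None, :].float()     # broadcastable to [batch, heads, query_len, key_len]
extended = (1.0 - extended) * -10000.0                   # 0.0 where attended, -10000.0 where masked
# adding `extended` to the raw attention scores drives masked positions to ~0 probability after the softmax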
| 828 |
+
|
| 829 |
+
def forward(
|
| 830 |
+
self,
|
| 831 |
+
input_ids=None,
|
| 832 |
+
attention_mask=None,
|
| 833 |
+
position_ids=None,
|
| 834 |
+
head_mask=None,
|
| 835 |
+
inputs_embeds=None,
|
| 836 |
+
encoder_embeds=None,
|
| 837 |
+
encoder_hidden_states=None,
|
| 838 |
+
encoder_attention_mask=None,
|
| 839 |
+
past_key_values=None,
|
| 840 |
+
use_cache=None,
|
| 841 |
+
output_attentions=None,
|
| 842 |
+
output_hidden_states=None,
|
| 843 |
+
return_dict=None,
|
| 844 |
+
is_decoder=False,
|
| 845 |
+
mode="multimodal",
|
| 846 |
+
):
|
| 847 |
+
r"""
|
| 848 |
+
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
|
| 849 |
+
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
|
| 850 |
+
the model is configured as a decoder.
|
| 851 |
+
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
| 852 |
+
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
|
| 853 |
+
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
|
| 854 |
+
- 1 for tokens that are **not masked**,
|
| 855 |
+
- 0 for tokens that are **masked**.
|
| 856 |
+
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
|
| 857 |
+
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
|
| 858 |
+
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
|
| 859 |
+
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
|
| 860 |
+
instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
|
| 861 |
+
use_cache (:obj:`bool`, `optional`):
|
| 862 |
+
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
|
| 863 |
+
decoding (see :obj:`past_key_values`).
|
| 864 |
+
"""
|
| 865 |
+
output_attentions = (
|
| 866 |
+
output_attentions
|
| 867 |
+
if output_attentions is not None
|
| 868 |
+
else self.config.output_attentions
|
| 869 |
+
)
|
| 870 |
+
output_hidden_states = (
|
| 871 |
+
output_hidden_states
|
| 872 |
+
if output_hidden_states is not None
|
| 873 |
+
else self.config.output_hidden_states
|
| 874 |
+
)
|
| 875 |
+
return_dict = (
|
| 876 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
| 877 |
+
)
|
| 878 |
+
|
| 879 |
+
if is_decoder:
|
| 880 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
| 881 |
+
else:
|
| 882 |
+
use_cache = False
|
| 883 |
+
|
| 884 |
+
if input_ids is not None and inputs_embeds is not None:
|
| 885 |
+
raise ValueError(
|
| 886 |
+
"You cannot specify both input_ids and inputs_embeds at the same time"
|
| 887 |
+
)
|
| 888 |
+
elif input_ids is not None:
|
| 889 |
+
input_shape = input_ids.size()
|
| 890 |
+
batch_size, seq_length = input_shape
|
| 891 |
+
device = input_ids.device
|
| 892 |
+
elif inputs_embeds is not None:
|
| 893 |
+
input_shape = inputs_embeds.size()[:-1]
|
| 894 |
+
batch_size, seq_length = input_shape
|
| 895 |
+
device = inputs_embeds.device
|
| 896 |
+
elif encoder_embeds is not None:
|
| 897 |
+
input_shape = encoder_embeds.size()[:-1]
|
| 898 |
+
batch_size, seq_length = input_shape
|
| 899 |
+
device = encoder_embeds.device
|
| 900 |
+
else:
|
| 901 |
+
raise ValueError(
|
| 902 |
+
"You have to specify either input_ids or inputs_embeds or encoder_embeds"
|
| 903 |
+
)
|
| 904 |
+
|
| 905 |
+
# past_key_values_length
|
| 906 |
+
past_key_values_length = (
|
| 907 |
+
past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
| 908 |
+
)
|
| 909 |
+
|
| 910 |
+
if attention_mask is None:
|
| 911 |
+
attention_mask = torch.ones(
|
| 912 |
+
((batch_size, seq_length + past_key_values_length)), device=device
|
| 913 |
+
)
|
| 914 |
+
|
| 915 |
+
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
| 916 |
+
# ourselves in which case we just need to make it broadcastable to all heads.
|
| 917 |
+
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
|
| 918 |
+
attention_mask, input_shape, device, is_decoder
|
| 919 |
+
)
|
| 920 |
+
|
| 921 |
+
# If a 2D or 3D attention mask is provided for the cross-attention
|
| 922 |
+
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
| 923 |
+
if encoder_hidden_states is not None:
|
| 924 |
+
if type(encoder_hidden_states) == list:
|
| 925 |
+
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[
|
| 926 |
+
0
|
| 927 |
+
].size()
|
| 928 |
+
else:
|
| 929 |
+
(
|
| 930 |
+
encoder_batch_size,
|
| 931 |
+
encoder_sequence_length,
|
| 932 |
+
_,
|
| 933 |
+
) = encoder_hidden_states.size()
|
| 934 |
+
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
| 935 |
+
|
| 936 |
+
if type(encoder_attention_mask) == list:
|
| 937 |
+
encoder_extended_attention_mask = [
|
| 938 |
+
self.invert_attention_mask(mask) for mask in encoder_attention_mask
|
| 939 |
+
]
|
| 940 |
+
elif encoder_attention_mask is None:
|
| 941 |
+
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
|
| 942 |
+
encoder_extended_attention_mask = self.invert_attention_mask(
|
| 943 |
+
encoder_attention_mask
|
| 944 |
+
)
|
| 945 |
+
else:
|
| 946 |
+
encoder_extended_attention_mask = self.invert_attention_mask(
|
| 947 |
+
encoder_attention_mask
|
| 948 |
+
)
|
| 949 |
+
else:
|
| 950 |
+
encoder_extended_attention_mask = None
|
| 951 |
+
|
| 952 |
+
# Prepare head mask if needed
|
| 953 |
+
# 1.0 in head_mask indicate we keep the head
|
| 954 |
+
# attention_probs has shape bsz x n_heads x N x N
|
| 955 |
+
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
|
| 956 |
+
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
|
| 957 |
+
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
|
| 958 |
+
|
| 959 |
+
if encoder_embeds is None:
|
| 960 |
+
embedding_output = self.embeddings(
|
| 961 |
+
input_ids=input_ids,
|
| 962 |
+
position_ids=position_ids,
|
| 963 |
+
inputs_embeds=inputs_embeds,
|
| 964 |
+
past_key_values_length=past_key_values_length,
|
| 965 |
+
)
|
| 966 |
+
else:
|
| 967 |
+
embedding_output = encoder_embeds
|
| 968 |
+
|
| 969 |
+
encoder_outputs = self.encoder(
|
| 970 |
+
embedding_output,
|
| 971 |
+
attention_mask=extended_attention_mask,
|
| 972 |
+
head_mask=head_mask,
|
| 973 |
+
encoder_hidden_states=encoder_hidden_states,
|
| 974 |
+
encoder_attention_mask=encoder_extended_attention_mask,
|
| 975 |
+
past_key_values=past_key_values,
|
| 976 |
+
use_cache=use_cache,
|
| 977 |
+
output_attentions=output_attentions,
|
| 978 |
+
output_hidden_states=output_hidden_states,
|
| 979 |
+
return_dict=return_dict,
|
| 980 |
+
mode=mode,
|
| 981 |
+
)
|
| 982 |
+
sequence_output = encoder_outputs[0]
|
| 983 |
+
pooled_output = (
|
| 984 |
+
self.pooler(sequence_output) if self.pooler is not None else None
|
| 985 |
+
)
|
| 986 |
+
|
| 987 |
+
if not return_dict:
|
| 988 |
+
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
| 989 |
+
|
| 990 |
+
return BaseModelOutputWithPoolingAndCrossAttentions(
|
| 991 |
+
last_hidden_state=sequence_output,
|
| 992 |
+
pooler_output=pooled_output,
|
| 993 |
+
past_key_values=encoder_outputs.past_key_values,
|
| 994 |
+
hidden_states=encoder_outputs.hidden_states,
|
| 995 |
+
attentions=encoder_outputs.attentions,
|
| 996 |
+
cross_attentions=encoder_outputs.cross_attentions,
|
| 997 |
+
)
|
| 998 |
+
|
| 999 |
+
|
| 1000 |
+
class BertLMHeadModel(BertPreTrainedModel):
|
| 1001 |
+
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
| 1002 |
+
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
| 1003 |
+
|
| 1004 |
+
def __init__(self, config):
|
| 1005 |
+
super().__init__(config)
|
| 1006 |
+
|
| 1007 |
+
self.bert = BertModel(config, add_pooling_layer=False)
|
| 1008 |
+
self.cls = BertOnlyMLMHead(config)
|
| 1009 |
+
|
| 1010 |
+
self.init_weights()
|
| 1011 |
+
|
| 1012 |
+
def get_output_embeddings(self):
|
| 1013 |
+
return self.cls.predictions.decoder
|
| 1014 |
+
|
| 1015 |
+
def set_output_embeddings(self, new_embeddings):
|
| 1016 |
+
self.cls.predictions.decoder = new_embeddings
|
| 1017 |
+
|
| 1018 |
+
def forward(
|
| 1019 |
+
self,
|
| 1020 |
+
input_ids=None,
|
| 1021 |
+
attention_mask=None,
|
| 1022 |
+
position_ids=None,
|
| 1023 |
+
head_mask=None,
|
| 1024 |
+
inputs_embeds=None,
|
| 1025 |
+
encoder_hidden_states=None,
|
| 1026 |
+
encoder_attention_mask=None,
|
| 1027 |
+
labels=None,
|
| 1028 |
+
past_key_values=None,
|
| 1029 |
+
use_cache=None,
|
| 1030 |
+
output_attentions=None,
|
| 1031 |
+
output_hidden_states=None,
|
| 1032 |
+
return_dict=None,
|
| 1033 |
+
return_logits=False,
|
| 1034 |
+
is_decoder=True,
|
| 1035 |
+
reduction="mean",
|
| 1036 |
+
mode="multimodal",
|
| 1037 |
+
):
|
| 1038 |
+
r"""
|
| 1039 |
+
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
|
| 1040 |
+
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
|
| 1041 |
+
the model is configured as a decoder.
|
| 1042 |
+
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
| 1043 |
+
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
|
| 1044 |
+
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
|
| 1045 |
+
- 1 for tokens that are **not masked**,
|
| 1046 |
+
- 0 for tokens that are **masked**.
|
| 1047 |
+
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
| 1048 |
+
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
|
| 1049 |
+
``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
|
| 1050 |
+
ignored (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
|
| 1051 |
+
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
|
| 1052 |
+
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
|
| 1053 |
+
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
|
| 1054 |
+
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
|
| 1055 |
+
instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
|
| 1056 |
+
use_cache (:obj:`bool`, `optional`):
|
| 1057 |
+
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
|
| 1058 |
+
decoding (see :obj:`past_key_values`).
|
| 1059 |
+
Returns:
|
| 1060 |
+
Example::
|
| 1061 |
+
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
|
| 1062 |
+
>>> import torch
|
| 1063 |
+
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
| 1064 |
+
>>> config = BertConfig.from_pretrained("bert-base-cased")
|
| 1065 |
+
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
|
| 1066 |
+
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
| 1067 |
+
>>> outputs = model(**inputs)
|
| 1068 |
+
>>> prediction_logits = outputs.logits
|
| 1069 |
+
"""
|
| 1070 |
+
return_dict = (
|
| 1071 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
| 1072 |
+
)
|
| 1073 |
+
if labels is not None:
|
| 1074 |
+
use_cache = False
|
| 1075 |
+
|
| 1076 |
+
outputs = self.bert(
|
| 1077 |
+
input_ids,
|
| 1078 |
+
attention_mask=attention_mask,
|
| 1079 |
+
position_ids=position_ids,
|
| 1080 |
+
head_mask=head_mask,
|
| 1081 |
+
inputs_embeds=inputs_embeds,
|
| 1082 |
+
encoder_hidden_states=encoder_hidden_states,
|
| 1083 |
+
encoder_attention_mask=encoder_attention_mask,
|
| 1084 |
+
past_key_values=past_key_values,
|
| 1085 |
+
use_cache=use_cache,
|
| 1086 |
+
output_attentions=output_attentions,
|
| 1087 |
+
output_hidden_states=output_hidden_states,
|
| 1088 |
+
return_dict=return_dict,
|
| 1089 |
+
is_decoder=is_decoder,
|
| 1090 |
+
mode=mode,
|
| 1091 |
+
)
|
| 1092 |
+
|
| 1093 |
+
sequence_output = outputs[0]
|
| 1094 |
+
prediction_scores = self.cls(sequence_output)
|
| 1095 |
+
# sequence_output.shape torch.Size([85, 30, 768])
|
| 1096 |
+
# prediction_scores.shape torch.Size([85, 30, 30524])
|
| 1097 |
+
# labels.shape torch.Size([85, 30])
|
| 1098 |
+
|
| 1099 |
+
if return_logits:
|
| 1100 |
+
return prediction_scores[:, :-1, :].contiguous()
|
| 1101 |
+
|
| 1102 |
+
lm_loss = None
|
| 1103 |
+
if labels is not None:
|
| 1104 |
+
# we are doing next-token prediction; shift prediction scores and input ids by one
|
| 1105 |
+
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
|
| 1106 |
+
labels = labels[:, 1:].contiguous()
|
| 1107 |
+
loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
|
| 1108 |
+
lm_loss = loss_fct(
|
| 1109 |
+
shifted_prediction_scores.view(-1, self.config.vocab_size),
|
| 1110 |
+
labels.view(-1),
|
| 1111 |
+
)
|
| 1112 |
+
if reduction == "none":
|
| 1113 |
+
lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
|
| 1114 |
+
|
| 1115 |
+
if not return_dict:
|
| 1116 |
+
output = (prediction_scores,) + outputs[2:]
|
| 1117 |
+
return ((lm_loss,) + output) if lm_loss is not None else output
|
| 1118 |
+
|
| 1119 |
+
return CausalLMOutputWithCrossAttentions(
|
| 1120 |
+
loss=lm_loss,
|
| 1121 |
+
logits=prediction_scores,
|
| 1122 |
+
past_key_values=outputs.past_key_values,
|
| 1123 |
+
hidden_states=outputs.hidden_states,
|
| 1124 |
+
attentions=outputs.attentions,
|
| 1125 |
+
cross_attentions=outputs.cross_attentions,
|
| 1126 |
+
)
|
| 1127 |
+
|
| 1128 |
+
def prepare_inputs_for_generation(
|
| 1129 |
+
self, input_ids, past=None, attention_mask=None, **model_kwargs
|
| 1130 |
+
):
|
| 1131 |
+
input_shape = input_ids.shape
|
| 1132 |
+
# if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
|
| 1133 |
+
if attention_mask is None:
|
| 1134 |
+
attention_mask = input_ids.new_ones(input_shape)
|
| 1135 |
+
|
| 1136 |
+
# cut decoder_input_ids if past is used
|
| 1137 |
+
if past is not None:
|
| 1138 |
+
input_ids = input_ids[:, -1:]
|
| 1139 |
+
|
| 1140 |
+
return {
|
| 1141 |
+
"input_ids": input_ids,
|
| 1142 |
+
"attention_mask": attention_mask,
|
| 1143 |
+
"past_key_values": past,
|
| 1144 |
+
"encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
|
| 1145 |
+
"encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
|
| 1146 |
+
"is_decoder": True,
|
| 1147 |
+
}
|
| 1148 |
+
|
| 1149 |
+
def _reorder_cache(self, past, beam_idx):
|
| 1150 |
+
reordered_past = ()
|
| 1151 |
+
for layer_past in past:
|
| 1152 |
+
reordered_past += (
|
| 1153 |
+
tuple(
|
| 1154 |
+
past_state.index_select(0, beam_idx) for past_state in layer_past
|
| 1155 |
+
),
|
| 1156 |
+
)
|
| 1157 |
+
return reordered_past
|
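A rough usage sketch for the decoder defined above, with randomly initialised weights and placeholder visual features. The token ids, the 197-token visual sequence, and the `encoder_width` value are illustrative assumptions (this file's cross-attention layers are assumed, BLIP-style, to read the visual feature width from `config.encoder_width`); they are not the settings shipped in this commit's config files:

import torch
# BertConfig comes from `transformers`; BertLMHeadModel is the class defined above.

config = BertConfig(add_cross_attention=True)    # other hyper-parameters left at their defaults
config.encoder_width = 768                       # width of the visual features fed to cross-attention (assumed)
decoder = BertLMHeadModel(config)

input_ids = torch.tensor([[101, 2023, 2003, 1037, 3231, 102]])   # toy token ids
image_embeds = torch.randn(1, 197, config.encoder_width)          # stand-in for an image encoder's output
image_atts = torch.ones(image_embeds.shape[:2], dtype=torch.long)

out = decoder(
    input_ids,
    encoder_hidden_states=image_embeds,
    encoder_attention_mask=image_atts,
    labels=input_ids,
    mode="multimodal",
)
print(out.loss, out.logits.shape)   # scalar LM loss, logits of shape [1, 6, vocab_size]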
tag2text/models/swin_transformer.py
ADDED
|
@@ -0,0 +1,831 @@
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# Swin Transformer
|
| 3 |
+
# Copyright (c) 2021 Microsoft
|
| 4 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 5 |
+
# Written by Ze Liu
|
| 6 |
+
# --------------------------------------------------------
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.utils.checkpoint as checkpoint
|
| 11 |
+
from scipy import interpolate
|
| 12 |
+
from timm.models.layers import DropPath
|
| 13 |
+
from timm.models.layers import to_2tuple
|
| 14 |
+
from timm.models.layers import trunc_normal_
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class Mlp(nn.Module):
|
| 18 |
+
def __init__(
|
| 19 |
+
self,
|
| 20 |
+
in_features,
|
| 21 |
+
hidden_features=None,
|
| 22 |
+
out_features=None,
|
| 23 |
+
act_layer=nn.GELU,
|
| 24 |
+
drop=0.0,
|
| 25 |
+
):
|
| 26 |
+
super().__init__()
|
| 27 |
+
out_features = out_features or in_features
|
| 28 |
+
hidden_features = hidden_features or in_features
|
| 29 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
| 30 |
+
self.act = act_layer()
|
| 31 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
| 32 |
+
self.drop = nn.Dropout(drop)
|
| 33 |
+
|
| 34 |
+
def forward(self, x):
|
| 35 |
+
x = self.fc1(x)
|
| 36 |
+
x = self.act(x)
|
| 37 |
+
x = self.drop(x)
|
| 38 |
+
x = self.fc2(x)
|
| 39 |
+
x = self.drop(x)
|
| 40 |
+
return x
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def window_partition(x, window_size):
|
| 44 |
+
"""
|
| 45 |
+
Args:
|
| 46 |
+
x: (B, H, W, C)
|
| 47 |
+
window_size (int): window size
|
| 48 |
+
|
| 49 |
+
Returns:
|
| 50 |
+
windows: (num_windows*B, window_size, window_size, C)
|
| 51 |
+
"""
|
| 52 |
+
B, H, W, C = x.shape
|
| 53 |
+
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
|
| 54 |
+
windows = (
|
| 55 |
+
x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
| 56 |
+
)
|
| 57 |
+
return windows
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def window_reverse(windows, window_size, H, W):
|
| 61 |
+
"""
|
| 62 |
+
Args:
|
| 63 |
+
windows: (num_windows*B, window_size, window_size, C)
|
| 64 |
+
window_size (int): Window size
|
| 65 |
+
H (int): Height of image
|
| 66 |
+
W (int): Width of image
|
| 67 |
+
|
| 68 |
+
Returns:
|
| 69 |
+
x: (B, H, W, C)
|
| 70 |
+
"""
|
| 71 |
+
B = int(windows.shape[0] / (H * W / window_size / window_size))
|
| 72 |
+
x = windows.view(
|
| 73 |
+
B, H // window_size, W // window_size, window_size, window_size, -1
|
| 74 |
+
)
|
| 75 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
| 76 |
+
return x
|
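window_partition and window_reverse above are exact inverses of each other; a quick round-trip sketch (shapes chosen so H and W are multiples of the window size):

import torch

x = torch.randn(2, 56, 56, 96)               # (B, H, W, C)
windows = window_partition(x, 7)              # -> (2 * 8 * 8, 7, 7, 96)
restored = window_reverse(windows, 7, 56, 56)
assert torch.equal(x, restored)               # partitioning then reversing recovers the input exactly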
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class WindowAttention(nn.Module):
|
| 80 |
+
r"""Window based multi-head self attention (W-MSA) module with relative position bias.
|
| 81 |
+
It supports both shifted and non-shifted windows.
|
| 82 |
+
|
| 83 |
+
Args:
|
| 84 |
+
dim (int): Number of input channels.
|
| 85 |
+
window_size (tuple[int]): The height and width of the window.
|
| 86 |
+
num_heads (int): Number of attention heads.
|
| 87 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
| 88 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
|
| 89 |
+
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
|
| 90 |
+
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
|
| 91 |
+
"""
|
| 92 |
+
|
| 93 |
+
def __init__(
|
| 94 |
+
self,
|
| 95 |
+
dim,
|
| 96 |
+
window_size,
|
| 97 |
+
num_heads,
|
| 98 |
+
qkv_bias=True,
|
| 99 |
+
qk_scale=None,
|
| 100 |
+
attn_drop=0.0,
|
| 101 |
+
proj_drop=0.0,
|
| 102 |
+
):
|
| 103 |
+
super().__init__()
|
| 104 |
+
self.dim = dim
|
| 105 |
+
self.window_size = window_size # Wh, Ww
|
| 106 |
+
self.num_heads = num_heads
|
| 107 |
+
head_dim = dim // num_heads
|
| 108 |
+
self.scale = qk_scale or head_dim**-0.5
|
| 109 |
+
|
| 110 |
+
# define a parameter table of relative position bias
|
| 111 |
+
self.relative_position_bias_table = nn.Parameter(
|
| 112 |
+
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
|
| 113 |
+
) # 2*Wh-1 * 2*Ww-1, nH
|
| 114 |
+
|
| 115 |
+
# get pair-wise relative position index for each token inside the window
|
| 116 |
+
coords_h = torch.arange(self.window_size[0])
|
| 117 |
+
coords_w = torch.arange(self.window_size[1])
|
| 118 |
+
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
|
| 119 |
+
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
|
| 120 |
+
relative_coords = (
|
| 121 |
+
coords_flatten[:, :, None] - coords_flatten[:, None, :]
|
| 122 |
+
) # 2, Wh*Ww, Wh*Ww
|
| 123 |
+
relative_coords = relative_coords.permute(
|
| 124 |
+
1, 2, 0
|
| 125 |
+
).contiguous() # Wh*Ww, Wh*Ww, 2
|
| 126 |
+
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
|
| 127 |
+
relative_coords[:, :, 1] += self.window_size[1] - 1
|
| 128 |
+
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
|
| 129 |
+
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
|
| 130 |
+
self.register_buffer("relative_position_index", relative_position_index)
|
| 131 |
+
|
| 132 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 133 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
| 134 |
+
self.proj = nn.Linear(dim, dim)
|
| 135 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
| 136 |
+
|
| 137 |
+
trunc_normal_(self.relative_position_bias_table, std=0.02)
|
| 138 |
+
self.softmax = nn.Softmax(dim=-1)
|
| 139 |
+
|
| 140 |
+
def forward(self, x, mask=None):
|
| 141 |
+
"""
|
| 142 |
+
Args:
|
| 143 |
+
x: input features with shape of (num_windows*B, N, C)
|
| 144 |
+
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
|
| 145 |
+
"""
|
| 146 |
+
B_, N, C = x.shape
|
| 147 |
+
qkv = (
|
| 148 |
+
self.qkv(x)
|
| 149 |
+
.reshape(B_, N, 3, self.num_heads, C // self.num_heads)
|
| 150 |
+
.permute(2, 0, 3, 1, 4)
|
| 151 |
+
)
|
| 152 |
+
q, k, v = (
|
| 153 |
+
qkv[0],
|
| 154 |
+
qkv[1],
|
| 155 |
+
qkv[2],
|
| 156 |
+
) # make torchscript happy (cannot use tensor as tuple)
|
| 157 |
+
|
| 158 |
+
q = q * self.scale
|
| 159 |
+
attn = q @ k.transpose(-2, -1)
|
| 160 |
+
|
| 161 |
+
relative_position_bias = self.relative_position_bias_table[
|
| 162 |
+
self.relative_position_index.view(-1)
|
| 163 |
+
].view(
|
| 164 |
+
self.window_size[0] * self.window_size[1],
|
| 165 |
+
self.window_size[0] * self.window_size[1],
|
| 166 |
+
-1,
|
| 167 |
+
) # Wh*Ww,Wh*Ww,nH
|
| 168 |
+
relative_position_bias = relative_position_bias.permute(
|
| 169 |
+
2, 0, 1
|
| 170 |
+
).contiguous() # nH, Wh*Ww, Wh*Ww
|
| 171 |
+
attn = attn + relative_position_bias.unsqueeze(0)
|
| 172 |
+
|
| 173 |
+
if mask is not None:
|
| 174 |
+
nW = mask.shape[0]
|
| 175 |
+
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
|
| 176 |
+
1
|
| 177 |
+
).unsqueeze(0)
|
| 178 |
+
attn = attn.view(-1, self.num_heads, N, N)
|
| 179 |
+
attn = self.softmax(attn)
|
| 180 |
+
else:
|
| 181 |
+
attn = self.softmax(attn)
|
| 182 |
+
|
| 183 |
+
attn = self.attn_drop(attn)
|
| 184 |
+
|
| 185 |
+
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
|
| 186 |
+
x = self.proj(x)
|
| 187 |
+
x = self.proj_drop(x)
|
| 188 |
+
return x
|
| 189 |
+
|
| 190 |
+
def extra_repr(self) -> str:
|
| 191 |
+
return f"dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}"
|
| 192 |
+
|
| 193 |
+
def flops(self, N):
|
| 194 |
+
# calculate flops for 1 window with token length of N
|
| 195 |
+
flops = 0
|
| 196 |
+
# qkv = self.qkv(x)
|
| 197 |
+
flops += N * self.dim * 3 * self.dim
|
| 198 |
+
# attn = (q @ k.transpose(-2, -1))
|
| 199 |
+
flops += self.num_heads * N * (self.dim // self.num_heads) * N
|
| 200 |
+
# x = (attn @ v)
|
| 201 |
+
flops += self.num_heads * N * N * (self.dim // self.num_heads)
|
| 202 |
+
# x = self.proj(x)
|
| 203 |
+
flops += N * self.dim * self.dim
|
| 204 |
+
return flops
|
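The relative_position_index buffer built in __init__ above maps every (query, key) pair inside a window to one of (2*Wh-1)*(2*Ww-1) bias-table entries. The same arithmetic for a tiny 2x2 window, as a standalone sketch:

import torch

Wh, Ww = 2, 2
coords = torch.stack(torch.meshgrid([torch.arange(Wh), torch.arange(Ww)]))  # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1)                                    # 2, Wh*Ww
rel = coords_flatten[:, :, None] - coords_flatten[:, None, :]                # 2, Wh*Ww, Wh*Ww
rel = rel.permute(1, 2, 0).contiguous()
rel[:, :, 0] += Wh - 1                      # shift row offsets to start from 0
rel[:, :, 1] += Ww - 1
rel[:, :, 0] *= 2 * Ww - 1
relative_position_index = rel.sum(-1)       # 4x4 matrix of indices into a (2*Wh-1)*(2*Ww-1) = 9 entry table
print(relative_position_index)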
| 205 |
+
|
| 206 |
+
|
| 207 |
+
class SwinTransformerBlock(nn.Module):
|
| 208 |
+
r"""Swin Transformer Block.
|
| 209 |
+
|
| 210 |
+
Args:
|
| 211 |
+
dim (int): Number of input channels.
|
| 212 |
+
input_resolution (tuple[int]): Input resolution.
|
| 213 |
+
num_heads (int): Number of attention heads.
|
| 214 |
+
window_size (int): Window size.
|
| 215 |
+
shift_size (int): Shift size for SW-MSA.
|
| 216 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
| 217 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
| 218 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
| 219 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
| 220 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
| 221 |
+
drop_path (float, optional): Stochastic depth rate. Default: 0.0
|
| 222 |
+
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
|
| 223 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
| 224 |
+
"""
|
| 225 |
+
|
| 226 |
+
def __init__(
|
| 227 |
+
self,
|
| 228 |
+
dim,
|
| 229 |
+
input_resolution,
|
| 230 |
+
num_heads,
|
| 231 |
+
window_size=7,
|
| 232 |
+
shift_size=0,
|
| 233 |
+
mlp_ratio=4.0,
|
| 234 |
+
qkv_bias=True,
|
| 235 |
+
qk_scale=None,
|
| 236 |
+
drop=0.0,
|
| 237 |
+
attn_drop=0.0,
|
| 238 |
+
drop_path=0.0,
|
| 239 |
+
act_layer=nn.GELU,
|
| 240 |
+
norm_layer=nn.LayerNorm,
|
| 241 |
+
):
|
| 242 |
+
super().__init__()
|
| 243 |
+
self.dim = dim
|
| 244 |
+
self.input_resolution = input_resolution
|
| 245 |
+
self.num_heads = num_heads
|
| 246 |
+
self.window_size = window_size
|
| 247 |
+
self.shift_size = shift_size
|
| 248 |
+
self.mlp_ratio = mlp_ratio
|
| 249 |
+
if min(self.input_resolution) <= self.window_size:
|
| 250 |
+
# if window size is larger than input resolution, we don't partition windows
|
| 251 |
+
self.shift_size = 0
|
| 252 |
+
self.window_size = min(self.input_resolution)
|
| 253 |
+
assert (
|
| 254 |
+
0 <= self.shift_size < self.window_size
|
| 255 |
+
), "shift_size must in 0-window_size"
|
| 256 |
+
|
| 257 |
+
self.norm1 = norm_layer(dim)
|
| 258 |
+
self.attn = WindowAttention(
|
| 259 |
+
dim,
|
| 260 |
+
window_size=to_2tuple(self.window_size),
|
| 261 |
+
num_heads=num_heads,
|
| 262 |
+
qkv_bias=qkv_bias,
|
| 263 |
+
qk_scale=qk_scale,
|
| 264 |
+
attn_drop=attn_drop,
|
| 265 |
+
proj_drop=drop,
|
| 266 |
+
)
|
| 267 |
+
|
| 268 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
| 269 |
+
self.norm2 = norm_layer(dim)
|
| 270 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
| 271 |
+
self.mlp = Mlp(
|
| 272 |
+
in_features=dim,
|
| 273 |
+
hidden_features=mlp_hidden_dim,
|
| 274 |
+
act_layer=act_layer,
|
| 275 |
+
drop=drop,
|
| 276 |
+
)
|
| 277 |
+
|
| 278 |
+
if self.shift_size > 0:
|
| 279 |
+
# calculate attention mask for SW-MSA
|
| 280 |
+
H, W = self.input_resolution
|
| 281 |
+
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
|
| 282 |
+
h_slices = (
|
| 283 |
+
slice(0, -self.window_size),
|
| 284 |
+
slice(-self.window_size, -self.shift_size),
|
| 285 |
+
slice(-self.shift_size, None),
|
| 286 |
+
)
|
| 287 |
+
w_slices = (
|
| 288 |
+
slice(0, -self.window_size),
|
| 289 |
+
slice(-self.window_size, -self.shift_size),
|
| 290 |
+
slice(-self.shift_size, None),
|
| 291 |
+
)
|
| 292 |
+
cnt = 0
|
| 293 |
+
for h in h_slices:
|
| 294 |
+
for w in w_slices:
|
| 295 |
+
img_mask[:, h, w, :] = cnt
|
| 296 |
+
cnt += 1
|
| 297 |
+
|
| 298 |
+
mask_windows = window_partition(
|
| 299 |
+
img_mask, self.window_size
|
| 300 |
+
) # nW, window_size, window_size, 1
|
| 301 |
+
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
| 302 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
| 303 |
+
attn_mask = attn_mask.masked_fill(
|
| 304 |
+
attn_mask != 0, float(-100.0)
|
| 305 |
+
).masked_fill(attn_mask == 0, float(0.0))
|
| 306 |
+
else:
|
| 307 |
+
attn_mask = None
|
| 308 |
+
|
| 309 |
+
self.register_buffer("attn_mask", attn_mask)
|
| 310 |
+
|
| 311 |
+
def forward(self, x):
|
| 312 |
+
H, W = self.input_resolution
|
| 313 |
+
B, L, C = x.shape
|
| 314 |
+
assert L == H * W, "input feature has wrong size"
|
| 315 |
+
|
| 316 |
+
shortcut = x
|
| 317 |
+
x = self.norm1(x)
|
| 318 |
+
x = x.view(B, H, W, C)
|
| 319 |
+
|
| 320 |
+
# cyclic shift
|
| 321 |
+
if self.shift_size > 0:
|
| 322 |
+
shifted_x = torch.roll(
|
| 323 |
+
x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)
|
| 324 |
+
)
|
| 325 |
+
else:
|
| 326 |
+
shifted_x = x
|
| 327 |
+
|
| 328 |
+
# partition windows
|
| 329 |
+
x_windows = window_partition(
|
| 330 |
+
shifted_x, self.window_size
|
| 331 |
+
) # nW*B, window_size, window_size, C
|
| 332 |
+
x_windows = x_windows.view(
|
| 333 |
+
-1, self.window_size * self.window_size, C
|
| 334 |
+
) # nW*B, window_size*window_size, C
|
| 335 |
+
|
| 336 |
+
# W-MSA/SW-MSA
|
| 337 |
+
attn_windows = self.attn(
|
| 338 |
+
x_windows, mask=self.attn_mask
|
| 339 |
+
) # nW*B, window_size*window_size, C
|
| 340 |
+
|
| 341 |
+
# merge windows
|
| 342 |
+
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
|
| 343 |
+
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
|
| 344 |
+
|
| 345 |
+
# reverse cyclic shift
|
| 346 |
+
if self.shift_size > 0:
|
| 347 |
+
x = torch.roll(
|
| 348 |
+
shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)
|
| 349 |
+
)
|
| 350 |
+
else:
|
| 351 |
+
x = shifted_x
|
| 352 |
+
x = x.view(B, H * W, C)
|
| 353 |
+
|
| 354 |
+
# FFN
|
| 355 |
+
x = shortcut + self.drop_path(x)
|
| 356 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
| 357 |
+
|
| 358 |
+
return x
|
| 359 |
+
|
| 360 |
+
def extra_repr(self) -> str:
|
| 361 |
+
return (
|
| 362 |
+
f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, "
|
| 363 |
+
f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
|
| 364 |
+
)
|
| 365 |
+
|
| 366 |
+
def flops(self):
|
| 367 |
+
flops = 0
|
| 368 |
+
H, W = self.input_resolution
|
| 369 |
+
# norm1
|
| 370 |
+
flops += self.dim * H * W
|
| 371 |
+
# W-MSA/SW-MSA
|
| 372 |
+
nW = H * W / self.window_size / self.window_size
|
| 373 |
+
flops += nW * self.attn.flops(self.window_size * self.window_size)
|
| 374 |
+
# mlp
|
| 375 |
+
flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
|
| 376 |
+
# norm2
|
| 377 |
+
flops += self.dim * H * W
|
| 378 |
+
return flops
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
class PatchMerging(nn.Module):
|
| 382 |
+
r"""Patch Merging Layer.
|
| 383 |
+
|
| 384 |
+
Args:
|
| 385 |
+
input_resolution (tuple[int]): Resolution of input feature.
|
| 386 |
+
dim (int): Number of input channels.
|
| 387 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
| 388 |
+
"""
|
| 389 |
+
|
| 390 |
+
def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
|
| 391 |
+
super().__init__()
|
| 392 |
+
self.input_resolution = input_resolution
|
| 393 |
+
self.dim = dim
|
| 394 |
+
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
|
| 395 |
+
self.norm = norm_layer(4 * dim)
|
| 396 |
+
|
| 397 |
+
def forward(self, x):
|
| 398 |
+
"""
|
| 399 |
+
x: B, H*W, C
|
| 400 |
+
"""
|
| 401 |
+
H, W = self.input_resolution
|
| 402 |
+
B, L, C = x.shape
|
| 403 |
+
assert L == H * W, "input feature has wrong size"
|
| 404 |
+
assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
|
| 405 |
+
|
| 406 |
+
x = x.view(B, H, W, C)
|
| 407 |
+
|
| 408 |
+
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
|
| 409 |
+
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
|
| 410 |
+
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
|
| 411 |
+
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
|
| 412 |
+
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
|
| 413 |
+
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
|
| 414 |
+
|
| 415 |
+
x = self.norm(x)
|
| 416 |
+
x = self.reduction(x)
|
| 417 |
+
|
| 418 |
+
return x
|
| 419 |
+
|
| 420 |
+
def extra_repr(self) -> str:
|
| 421 |
+
return f"input_resolution={self.input_resolution}, dim={self.dim}"
|
| 422 |
+
|
| 423 |
+
def flops(self):
|
| 424 |
+
H, W = self.input_resolution
|
| 425 |
+
flops = H * W * self.dim
|
| 426 |
+
flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
|
| 427 |
+
return flops
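A hedged usage sketch (not in the diff) of the PatchMerging module defined above; the resolution and channel count are illustrative:

import torch

merge = PatchMerging(input_resolution=(56, 56), dim=96)
x = torch.randn(2, 56 * 56, 96)  # B, H*W, C
y = merge(x)
print(y.shape)  # torch.Size([2, 784, 192]) -> each spatial side halved, channels doubled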
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
class BasicLayer(nn.Module):
|
| 431 |
+
"""A basic Swin Transformer layer for one stage.
|
| 432 |
+
|
| 433 |
+
Args:
|
| 434 |
+
dim (int): Number of input channels.
|
| 435 |
+
input_resolution (tuple[int]): Input resolution.
|
| 436 |
+
depth (int): Number of blocks.
|
| 437 |
+
num_heads (int): Number of attention heads.
|
| 438 |
+
window_size (int): Local window size.
|
| 439 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
| 440 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
| 441 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
| 442 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
| 443 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
| 444 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
|
| 445 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
| 446 |
+
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
|
| 447 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
| 448 |
+
"""
|
| 449 |
+
|
| 450 |
+
def __init__(
|
| 451 |
+
self,
|
| 452 |
+
dim,
|
| 453 |
+
input_resolution,
|
| 454 |
+
depth,
|
| 455 |
+
num_heads,
|
| 456 |
+
window_size,
|
| 457 |
+
mlp_ratio=4.0,
|
| 458 |
+
qkv_bias=True,
|
| 459 |
+
qk_scale=None,
|
| 460 |
+
drop=0.0,
|
| 461 |
+
attn_drop=0.0,
|
| 462 |
+
drop_path=0.0,
|
| 463 |
+
norm_layer=nn.LayerNorm,
|
| 464 |
+
downsample=None,
|
| 465 |
+
use_checkpoint=False,
|
| 466 |
+
):
|
| 467 |
+
super().__init__()
|
| 468 |
+
self.dim = dim
|
| 469 |
+
self.input_resolution = input_resolution
|
| 470 |
+
self.depth = depth
|
| 471 |
+
self.use_checkpoint = use_checkpoint
|
| 472 |
+
|
| 473 |
+
# build blocks
|
| 474 |
+
self.blocks = nn.ModuleList(
|
| 475 |
+
[
|
| 476 |
+
SwinTransformerBlock(
|
| 477 |
+
dim=dim,
|
| 478 |
+
input_resolution=input_resolution,
|
| 479 |
+
num_heads=num_heads,
|
| 480 |
+
window_size=window_size,
|
| 481 |
+
shift_size=0 if (i % 2 == 0) else window_size // 2,
|
| 482 |
+
mlp_ratio=mlp_ratio,
|
| 483 |
+
qkv_bias=qkv_bias,
|
| 484 |
+
qk_scale=qk_scale,
|
| 485 |
+
drop=drop,
|
| 486 |
+
attn_drop=attn_drop,
|
| 487 |
+
drop_path=drop_path[i]
|
| 488 |
+
if isinstance(drop_path, list)
|
| 489 |
+
else drop_path,
|
| 490 |
+
norm_layer=norm_layer,
|
| 491 |
+
)
|
| 492 |
+
for i in range(depth)
|
| 493 |
+
]
|
| 494 |
+
)
|
| 495 |
+
|
| 496 |
+
# patch merging layer
|
| 497 |
+
if downsample is not None:
|
| 498 |
+
self.downsample = downsample(
|
| 499 |
+
input_resolution, dim=dim, norm_layer=norm_layer
|
| 500 |
+
)
|
| 501 |
+
else:
|
| 502 |
+
self.downsample = None
|
| 503 |
+
|
| 504 |
+
def forward(self, x):
|
| 505 |
+
for blk in self.blocks:
|
| 506 |
+
if self.use_checkpoint:
|
| 507 |
+
x = checkpoint.checkpoint(blk, x)
|
| 508 |
+
else:
|
| 509 |
+
x = blk(x)
|
| 510 |
+
if self.downsample is not None:
|
| 511 |
+
x = self.downsample(x)
|
| 512 |
+
return x
|
| 513 |
+
|
| 514 |
+
def extra_repr(self) -> str:
|
| 515 |
+
return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
|
| 516 |
+
|
| 517 |
+
def flops(self):
|
| 518 |
+
flops = 0
|
| 519 |
+
for blk in self.blocks:
|
| 520 |
+
flops += blk.flops()
|
| 521 |
+
if self.downsample is not None:
|
| 522 |
+
flops += self.downsample.flops()
|
| 523 |
+
return flops
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
class PatchEmbed(nn.Module):
|
| 527 |
+
r"""Image to Patch Embedding
|
| 528 |
+
|
| 529 |
+
Args:
|
| 530 |
+
img_size (int): Image size. Default: 224.
|
| 531 |
+
patch_size (int): Patch token size. Default: 4.
|
| 532 |
+
in_chans (int): Number of input image channels. Default: 3.
|
| 533 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
| 534 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: None
|
| 535 |
+
"""
|
| 536 |
+
|
| 537 |
+
def __init__(
|
| 538 |
+
self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None
|
| 539 |
+
):
|
| 540 |
+
super().__init__()
|
| 541 |
+
img_size = to_2tuple(img_size)
|
| 542 |
+
patch_size = to_2tuple(patch_size)
|
| 543 |
+
patches_resolution = [
|
| 544 |
+
img_size[0] // patch_size[0],
|
| 545 |
+
img_size[1] // patch_size[1],
|
| 546 |
+
]
|
| 547 |
+
self.img_size = img_size
|
| 548 |
+
self.patch_size = patch_size
|
| 549 |
+
self.patches_resolution = patches_resolution
|
| 550 |
+
self.num_patches = patches_resolution[0] * patches_resolution[1]
|
| 551 |
+
|
| 552 |
+
self.in_chans = in_chans
|
| 553 |
+
self.embed_dim = embed_dim
|
| 554 |
+
|
| 555 |
+
self.proj = nn.Conv2d(
|
| 556 |
+
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
|
| 557 |
+
)
|
| 558 |
+
if norm_layer is not None:
|
| 559 |
+
self.norm = norm_layer(embed_dim)
|
| 560 |
+
else:
|
| 561 |
+
self.norm = None
|
| 562 |
+
|
| 563 |
+
def forward(self, x):
|
| 564 |
+
B, C, H, W = x.shape
|
| 565 |
+
# FIXME look at relaxing size constraints
|
| 566 |
+
assert (
|
| 567 |
+
H == self.img_size[0] and W == self.img_size[1]
|
| 568 |
+
), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
|
| 569 |
+
x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C
|
| 570 |
+
if self.norm is not None:
|
| 571 |
+
x = self.norm(x)
|
| 572 |
+
return x
|
| 573 |
+
|
| 574 |
+
def flops(self):
|
| 575 |
+
Ho, Wo = self.patches_resolution
|
| 576 |
+
flops = (
|
| 577 |
+
Ho
|
| 578 |
+
* Wo
|
| 579 |
+
* self.embed_dim
|
| 580 |
+
* self.in_chans
|
| 581 |
+
* (self.patch_size[0] * self.patch_size[1])
|
| 582 |
+
)
|
| 583 |
+
if self.norm is not None:
|
| 584 |
+
flops += Ho * Wo * self.embed_dim
|
| 585 |
+
return flops
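A hedged usage sketch (not in the diff) of the PatchEmbed module above; with img_size=224 and patch_size=4 (illustrative values) it yields 56*56 patch tokens:

import torch

embed = PatchEmbed(img_size=224, patch_size=4, in_chans=3, embed_dim=96)
tokens = embed(torch.randn(1, 3, 224, 224))
print(embed.patches_resolution, tokens.shape)  # [56, 56] torch.Size([1, 3136, 96])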
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
class SwinTransformer(nn.Module):
|
| 589 |
+
r"""Swin Transformer
|
| 590 |
+
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
|
| 591 |
+
https://arxiv.org/pdf/2103.14030
|
| 592 |
+
|
| 593 |
+
Args:
|
| 594 |
+
img_size (int | tuple(int)): Input image size. Default 224
|
| 595 |
+
patch_size (int | tuple(int)): Patch size. Default: 4
|
| 596 |
+
in_chans (int): Number of input image channels. Default: 3
|
| 597 |
+
num_classes (int): Number of classes for classification head. Default: 1000
|
| 598 |
+
embed_dim (int): Patch embedding dimension. Default: 96
|
| 599 |
+
depths (tuple(int)): Depth of each Swin Transformer layer.
|
| 600 |
+
num_heads (tuple(int)): Number of attention heads in different layers.
|
| 601 |
+
window_size (int): Window size. Default: 7
|
| 602 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
|
| 603 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
|
| 604 |
+
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
|
| 605 |
+
drop_rate (float): Dropout rate. Default: 0
|
| 606 |
+
attn_drop_rate (float): Attention dropout rate. Default: 0
|
| 607 |
+
drop_path_rate (float): Stochastic depth rate. Default: 0.1
|
| 608 |
+
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
|
| 609 |
+
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
|
| 610 |
+
patch_norm (bool): If True, add normalization after patch embedding. Default: True
|
| 611 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
|
| 612 |
+
"""
|
| 613 |
+
|
| 614 |
+
def __init__(
|
| 615 |
+
self,
|
| 616 |
+
img_size=224,
|
| 617 |
+
patch_size=4,
|
| 618 |
+
in_chans=3,
|
| 619 |
+
num_classes=1000,
|
| 620 |
+
embed_dim=96,
|
| 621 |
+
depths=[2, 2, 6, 2],
|
| 622 |
+
num_heads=[3, 6, 12, 24],
|
| 623 |
+
window_size=7,
|
| 624 |
+
mlp_ratio=4.0,
|
| 625 |
+
qkv_bias=True,
|
| 626 |
+
qk_scale=None,
|
| 627 |
+
drop_rate=0.0,
|
| 628 |
+
attn_drop_rate=0.0,
|
| 629 |
+
drop_path_rate=0.1,
|
| 630 |
+
norm_layer=nn.LayerNorm,
|
| 631 |
+
ape=False,
|
| 632 |
+
patch_norm=True,
|
| 633 |
+
use_checkpoint=False,
|
| 634 |
+
**kwargs,
|
| 635 |
+
):
|
| 636 |
+
super().__init__()
|
| 637 |
+
|
| 638 |
+
self.num_classes = num_classes
|
| 639 |
+
self.num_layers = len(depths)
|
| 640 |
+
self.embed_dim = embed_dim
|
| 641 |
+
self.ape = ape
|
| 642 |
+
self.patch_norm = patch_norm
|
| 643 |
+
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
|
| 644 |
+
self.mlp_ratio = mlp_ratio
|
| 645 |
+
|
| 646 |
+
# split image into non-overlapping patches
|
| 647 |
+
self.patch_embed = PatchEmbed(
|
| 648 |
+
img_size=img_size,
|
| 649 |
+
patch_size=patch_size,
|
| 650 |
+
in_chans=in_chans,
|
| 651 |
+
embed_dim=embed_dim,
|
| 652 |
+
norm_layer=norm_layer if self.patch_norm else None,
|
| 653 |
+
)
|
| 654 |
+
num_patches = self.patch_embed.num_patches
|
| 655 |
+
patches_resolution = self.patch_embed.patches_resolution
|
| 656 |
+
self.patches_resolution = patches_resolution
|
| 657 |
+
|
| 658 |
+
# absolute position embedding
|
| 659 |
+
if self.ape:
|
| 660 |
+
self.absolute_pos_embed = nn.Parameter(
|
| 661 |
+
torch.zeros(1, num_patches, embed_dim)
|
| 662 |
+
)
|
| 663 |
+
trunc_normal_(self.absolute_pos_embed, std=0.02)
|
| 664 |
+
|
| 665 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
| 666 |
+
|
| 667 |
+
# stochastic depth
|
| 668 |
+
dpr = [
|
| 669 |
+
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
|
| 670 |
+
] # stochastic depth decay rule
|
| 671 |
+
|
| 672 |
+
# build layers
|
| 673 |
+
self.layers = nn.ModuleList()
|
| 674 |
+
for i_layer in range(self.num_layers):
|
| 675 |
+
layer = BasicLayer(
|
| 676 |
+
dim=int(embed_dim * 2**i_layer),
|
| 677 |
+
input_resolution=(
|
| 678 |
+
patches_resolution[0] // (2**i_layer),
|
| 679 |
+
patches_resolution[1] // (2**i_layer),
|
| 680 |
+
),
|
| 681 |
+
depth=depths[i_layer],
|
| 682 |
+
num_heads=num_heads[i_layer],
|
| 683 |
+
window_size=window_size,
|
| 684 |
+
mlp_ratio=self.mlp_ratio,
|
| 685 |
+
qkv_bias=qkv_bias,
|
| 686 |
+
qk_scale=qk_scale,
|
| 687 |
+
drop=drop_rate,
|
| 688 |
+
attn_drop=attn_drop_rate,
|
| 689 |
+
drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
|
| 690 |
+
norm_layer=norm_layer,
|
| 691 |
+
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
|
| 692 |
+
use_checkpoint=use_checkpoint,
|
| 693 |
+
)
|
| 694 |
+
self.layers.append(layer)
|
| 695 |
+
|
| 696 |
+
self.norm = norm_layer(self.num_features)
|
| 697 |
+
self.avgpool = nn.AdaptiveAvgPool1d(1)
|
| 698 |
+
# self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
|
| 699 |
+
|
| 700 |
+
self.apply(self._init_weights)
|
| 701 |
+
|
| 702 |
+
def _init_weights(self, m):
|
| 703 |
+
if isinstance(m, nn.Linear):
|
| 704 |
+
trunc_normal_(m.weight, std=0.02)
|
| 705 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
| 706 |
+
nn.init.constant_(m.bias, 0)
|
| 707 |
+
elif isinstance(m, nn.LayerNorm):
|
| 708 |
+
nn.init.constant_(m.bias, 0)
|
| 709 |
+
nn.init.constant_(m.weight, 1.0)
|
| 710 |
+
|
| 711 |
+
@torch.jit.ignore
|
| 712 |
+
def no_weight_decay(self):
|
| 713 |
+
return {"absolute_pos_embed"}
|
| 714 |
+
|
| 715 |
+
@torch.jit.ignore
|
| 716 |
+
def no_weight_decay_keywords(self):
|
| 717 |
+
return {"relative_position_bias_table"}
|
| 718 |
+
|
| 719 |
+
def forward(self, x, idx_to_group_img=None, image_atts=None, **kwargs):
|
| 720 |
+
x = self.patch_embed(x)
|
| 721 |
+
if self.ape:
|
| 722 |
+
x = x + self.absolute_pos_embed
|
| 723 |
+
x = self.pos_drop(x)
|
| 724 |
+
|
| 725 |
+
for layer in self.layers:
|
| 726 |
+
x = layer(x)
|
| 727 |
+
|
| 728 |
+
x = self.norm(x) # B L C
|
| 729 |
+
|
| 730 |
+
x_cls = self.avgpool(x.transpose(1, 2)) # B C 1
|
| 731 |
+
|
| 732 |
+
if idx_to_group_img is None:
|
| 733 |
+
return torch.cat([x_cls.transpose(1, 2), x], dim=1)
|
| 734 |
+
else:
|
| 735 |
+
x_bs = torch.gather(
|
| 736 |
+
x,
|
| 737 |
+
dim=0,
|
| 738 |
+
index=idx_to_group_img.view(-1, 1, 1).expand(
|
| 739 |
+
-1, x.shape[1], x.shape[2]
|
| 740 |
+
),
|
| 741 |
+
)
|
| 742 |
+
weights = image_atts[:, 1:].unsqueeze(2) # B L 1
|
| 743 |
+
x_bs_cls = torch.sum(
|
| 744 |
+
(weights * x_bs).transpose(1, 2), dim=-1, keepdim=True
|
| 745 |
+
) # B C 1
|
| 746 |
+
x_bs_cls = x_bs_cls / torch.sum(
|
| 747 |
+
weights.transpose(1, 2), dim=-1, keepdim=True
|
| 748 |
+
) # avgpool
|
| 749 |
+
|
| 750 |
+
return torch.cat([x_bs_cls.transpose(1, 2), x_bs], dim=1), torch.cat(
|
| 751 |
+
[x_cls.transpose(1, 2), x], dim=1
|
| 752 |
+
)
|
| 753 |
+
|
| 754 |
+
def flops(self):
|
| 755 |
+
flops = 0
|
| 756 |
+
flops += self.patch_embed.flops()
|
| 757 |
+
for i, layer in enumerate(self.layers):
|
| 758 |
+
flops += layer.flops()
|
| 759 |
+
flops += (
|
| 760 |
+
self.num_features
|
| 761 |
+
* self.patches_resolution[0]
|
| 762 |
+
* self.patches_resolution[1]
|
| 763 |
+
// (2**self.num_layers)
|
| 764 |
+
)
|
| 765 |
+
flops += self.num_features * self.num_classes
|
| 766 |
+
return flops
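A hedged usage sketch (not in the diff): with the default depths, the forward above returns an average-pooled token concatenated in front of the 7x7 patch tokens; the input size here is an illustrative assumption:

import torch

model = SwinTransformer(img_size=224, window_size=7)  # defaults: embed_dim=96, depths=[2, 2, 6, 2]
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 50, 768]) -> 1 pooled token + 49 patch tokens, C = 96 * 2**3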
|
| 767 |
+
|
| 768 |
+
|
| 769 |
+
def interpolate_relative_pos_embed(rel_pos_bias, dst_num_pos, param_name=""):
|
| 770 |
+
# from: https://github.com/microsoft/unilm/blob/8a0a1c1f4e7326938ea7580a00d56d7f17d65612/beit/run_class_finetuning.py#L348
|
| 771 |
+
|
| 772 |
+
# rel_pos_bias: relative_position_bias_table
|
| 773 |
+
src_num_pos, num_attn_heads = rel_pos_bias.size()
|
| 774 |
+
|
| 775 |
+
num_extra_tokens = 0
|
| 776 |
+
src_size = int((src_num_pos - num_extra_tokens) ** 0.5)
|
| 777 |
+
dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5)
|
| 778 |
+
if src_size != dst_size:
|
| 779 |
+
print(
|
| 780 |
+
"Position interpolate %s from %dx%d to %dx%d"
|
| 781 |
+
% (param_name, src_size, src_size, dst_size, dst_size)
|
| 782 |
+
)
|
| 783 |
+
|
| 784 |
+
# extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
|
| 785 |
+
# rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]
|
| 786 |
+
|
| 787 |
+
def geometric_progression(a, r, n):
|
| 788 |
+
return a * (1.0 - r**n) / (1.0 - r)
|
| 789 |
+
|
| 790 |
+
left, right = 1.01, 1.5
|
| 791 |
+
while right - left > 1e-6:
|
| 792 |
+
q = (left + right) / 2.0
|
| 793 |
+
gp = geometric_progression(1, q, src_size // 2)
|
| 794 |
+
if gp > dst_size // 2:
|
| 795 |
+
right = q
|
| 796 |
+
else:
|
| 797 |
+
left = q
|
| 798 |
+
|
| 799 |
+
# if q > 1.090307:
|
| 800 |
+
# q = 1.090307
|
| 801 |
+
|
| 802 |
+
dis = []
|
| 803 |
+
cur = 1
|
| 804 |
+
for i in range(src_size // 2):
|
| 805 |
+
dis.append(cur)
|
| 806 |
+
cur += q ** (i + 1)
|
| 807 |
+
|
| 808 |
+
r_ids = [-_ for _ in reversed(dis)]
|
| 809 |
+
|
| 810 |
+
x = r_ids + [0] + dis
|
| 811 |
+
y = r_ids + [0] + dis
|
| 812 |
+
|
| 813 |
+
t = dst_size // 2.0
|
| 814 |
+
dx = np.arange(-t, t + 0.1, 1.0)
|
| 815 |
+
dy = np.arange(-t, t + 0.1, 1.0)
|
| 816 |
+
|
| 817 |
+
# print("Original positions = %s" % str(x))
|
| 818 |
+
# print("Target positions = %s" % str(dx))
|
| 819 |
+
|
| 820 |
+
all_rel_pos_bias = []
|
| 821 |
+
|
| 822 |
+
for i in range(num_attn_heads):
|
| 823 |
+
z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy()
|
| 824 |
+
f = interpolate.interp2d(x, y, z, kind="cubic")
|
| 825 |
+
all_rel_pos_bias.append(
|
| 826 |
+
torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device)
|
| 827 |
+
)
|
| 828 |
+
|
| 829 |
+
rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)
|
| 830 |
+
|
| 831 |
+
return rel_pos_bias
|
tag2text/models/tag2text.py
ADDED
|
@@ -0,0 +1,274 @@
| 1 |
+
"""
|
| 2 |
+
* Tag2Text
|
| 3 |
+
* Written by Xinyu Huang
|
| 4 |
+
"""
|
| 5 |
+
import json
|
| 6 |
+
import warnings
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
from models.bert import BertConfig
|
| 11 |
+
from models.bert import BertLMHeadModel
|
| 12 |
+
from models.bert import BertModel
|
| 13 |
+
from models.swin_transformer import SwinTransformer
|
| 14 |
+
from models.utils import *
|
| 15 |
+
from models.vit import VisionTransformer
|
| 16 |
+
from torch import nn
|
| 17 |
+
|
| 18 |
+
warnings.filterwarnings("ignore")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class Tag2Text_Caption(nn.Module):
|
| 22 |
+
def __init__(
|
| 23 |
+
self,
|
| 24 |
+
med_config=f"{CONFIG_PATH}/configs/med_config.json",
|
| 25 |
+
image_size=384,
|
| 26 |
+
vit="base",
|
| 27 |
+
vit_grad_ckpt=False,
|
| 28 |
+
vit_ckpt_layer=0,
|
| 29 |
+
prompt="a picture of ",
|
| 30 |
+
threshold=0.68,
|
| 31 |
+
delete_tag_index=[],
|
| 32 |
+
tag_list=f"{CONFIG_PATH}/data/tag_list.txt",
|
| 33 |
+
):
|
| 34 |
+
r"""Tag2Text inference module, both captioning and tagging are included.
|
| 35 |
+
Tag2Text is an efficient and controllable vision-language pre-training framework.
|
| 36 |
+
Described in the paper "Tag2Text: Guiding Vision-Language Model via Image Tagging" https://arxiv.org/abs/2303.05657
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
med_config (str): path for the mixture of encoder-decoder model's configuration file
|
| 40 |
+
image_size (int): input image size
|
| 41 |
+
vit (str): model size of vision transformer
|
| 42 |
+
threshold (float): tagging threshold
|
| 43 |
+
delete_tag_index (list): delete some tags that may disturb captioning
|
| 44 |
+
"""
|
| 45 |
+
super().__init__()
|
| 46 |
+
|
| 47 |
+
# create image encoder
|
| 48 |
+
if vit == "swin_b":
|
| 49 |
+
if image_size == 224:
|
| 50 |
+
vision_config_path = f"{CONFIG_PATH}/configs/swin/config_swinB_224.json"
|
| 51 |
+
elif image_size == 384:
|
| 52 |
+
vision_config_path = f"{CONFIG_PATH}/configs/swin/config_swinB_384.json"
|
| 53 |
+
vision_config = read_json(vision_config_path)
|
| 54 |
+
assert image_size == vision_config["image_res"]
|
| 55 |
+
# assert config['patch_size'] == 32
|
| 56 |
+
vision_width = vision_config["vision_width"]
|
| 57 |
+
|
| 58 |
+
self.visual_encoder = SwinTransformer(
|
| 59 |
+
img_size=vision_config["image_res"],
|
| 60 |
+
patch_size=4,
|
| 61 |
+
in_chans=3,
|
| 62 |
+
embed_dim=vision_config["embed_dim"],
|
| 63 |
+
depths=vision_config["depths"],
|
| 64 |
+
num_heads=vision_config["num_heads"],
|
| 65 |
+
window_size=vision_config["window_size"],
|
| 66 |
+
mlp_ratio=4.0,
|
| 67 |
+
qkv_bias=True,
|
| 68 |
+
drop_rate=0.0,
|
| 69 |
+
drop_path_rate=0.1,
|
| 70 |
+
ape=False,
|
| 71 |
+
patch_norm=True,
|
| 72 |
+
use_checkpoint=False,
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
else:
|
| 76 |
+
self.visual_encoder, vision_width = create_vit(
|
| 77 |
+
vit, image_size, vit_grad_ckpt, vit_ckpt_layer
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
# create tokenizer
|
| 81 |
+
self.tokenizer = init_tokenizer()
|
| 82 |
+
|
| 83 |
+
# Tag2Text employ encoder-decoder architecture for image-tag-text generation: image-tag interaction encoder and image-tag-text decoder
|
| 84 |
+
# create image-tag interaction encoder
|
| 85 |
+
encoder_config = BertConfig.from_json_file(med_config)
|
| 86 |
+
encoder_config.encoder_width = vision_width
|
| 87 |
+
self.tag_encoder = BertModel(config=encoder_config, add_pooling_layer=False)
|
| 88 |
+
|
| 89 |
+
# create image-tag-text decoder
|
| 90 |
+
decoder_config = BertConfig.from_json_file(med_config)
|
| 91 |
+
self.text_decoder = BertLMHeadModel(config=decoder_config)
|
| 92 |
+
|
| 93 |
+
self.delete_tag_index = delete_tag_index
|
| 94 |
+
self.prompt = prompt
|
| 95 |
+
self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1
|
| 96 |
+
|
| 97 |
+
# load tag list
|
| 98 |
+
self.tag_list = self.load_tag_list(tag_list)
|
| 99 |
+
|
| 100 |
+
# create image-tag recognition decoder
|
| 101 |
+
self.threshold = threshold
|
| 102 |
+
self.num_class = len(self.tag_list)
|
| 103 |
+
q2l_config = BertConfig.from_json_file(f"{CONFIG_PATH}/configs/q2l_config.json")
|
| 104 |
+
q2l_config.encoder_width = vision_width
|
| 105 |
+
self.tagging_head = BertModel(config=q2l_config, add_pooling_layer=False)
|
| 106 |
+
self.tagging_head.resize_token_embeddings(len(self.tokenizer))
|
| 107 |
+
self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size)
|
| 108 |
+
self.fc = GroupWiseLinear(self.num_class, q2l_config.hidden_size, bias=True)
|
| 109 |
+
self.del_selfattention()
|
| 110 |
+
|
| 111 |
+
# share the weights of the lowest 2 layers of the "image-tag interaction encoder" with the "image-tag recognition decoder"
|
| 112 |
+
tie_encoder_decoder_weights(self.tag_encoder, self.tagging_head, "", " ")
|
| 113 |
+
|
| 114 |
+
def load_tag_list(self, tag_list_file):
|
| 115 |
+
with open(tag_list_file) as f:
|
| 116 |
+
tag_list = f.read().splitlines()
|
| 117 |
+
tag_list = np.array(tag_list)
|
| 118 |
+
return tag_list
|
| 119 |
+
|
| 120 |
+
# delete the self-attention layers of the image-tag recognition decoder to reduce computation, following Query2Label
|
| 121 |
+
def del_selfattention(self):
|
| 122 |
+
del self.tagging_head.embeddings
|
| 123 |
+
for layer in self.tagging_head.encoder.layer:
|
| 124 |
+
del layer.attention
|
| 125 |
+
|
| 126 |
+
def generate(
|
| 127 |
+
self,
|
| 128 |
+
image,
|
| 129 |
+
sample=False,
|
| 130 |
+
num_beams=3,
|
| 131 |
+
max_length=30,
|
| 132 |
+
min_length=10,
|
| 133 |
+
top_p=0.9,
|
| 134 |
+
repetition_penalty=1.0,
|
| 135 |
+
tag_input=None,
|
| 136 |
+
return_tag_predict=False,
|
| 137 |
+
):
|
| 138 |
+
image_embeds = self.visual_encoder(image)
|
| 139 |
+
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
|
| 140 |
+
image.device
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
# if the user did not specify tags, recognize image tags with the image-tag recognition decoder
|
| 144 |
+
if tag_input is None:
|
| 145 |
+
image_cls_embeds = image_embeds[:, 0, :]
|
| 146 |
+
image_spatial_embeds = image_embeds[:, 1:, :]
|
| 147 |
+
|
| 148 |
+
bs = image_spatial_embeds.shape[0]
|
| 149 |
+
label_embed = self.label_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
|
| 150 |
+
tagging_embed = self.tagging_head(
|
| 151 |
+
encoder_embeds=label_embed,
|
| 152 |
+
encoder_hidden_states=image_embeds,
|
| 153 |
+
encoder_attention_mask=image_atts,
|
| 154 |
+
return_dict=False,
|
| 155 |
+
mode="tagging",
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
logits = self.fc(tagging_embed[0])
|
| 159 |
+
|
| 160 |
+
targets = torch.where(
|
| 161 |
+
torch.sigmoid(logits) > self.threshold,
|
| 162 |
+
torch.tensor(1.0).to(image.device),
|
| 163 |
+
torch.zeros(self.num_class).to(image.device),
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
tag = targets.cpu().numpy()
|
| 167 |
+
|
| 168 |
+
# delete some tags that may disturb captioning
|
| 169 |
+
tag[:, self.delete_tag_index] = 0
|
| 170 |
+
|
| 171 |
+
tag_input = []
|
| 172 |
+
for b in range(bs):
|
| 173 |
+
index = np.argwhere(tag[b] == 1)
|
| 174 |
+
token = self.tag_list[index].squeeze(axis=1)
|
| 175 |
+
tag_input.append(" | ".join(token))
|
| 176 |
+
|
| 177 |
+
tag_output = tag_input
|
| 178 |
+
|
| 179 |
+
# beam search for text generation (default)
|
| 180 |
+
if not sample:
|
| 181 |
+
image_embeds = image_embeds.repeat_interleave(num_beams, dim=0)
|
| 182 |
+
tag_input_temp = []
|
| 183 |
+
for tag in tag_input:
|
| 184 |
+
for i in range(num_beams):
|
| 185 |
+
tag_input_temp.append(tag)
|
| 186 |
+
tag_input = tag_input_temp
|
| 187 |
+
|
| 188 |
+
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
|
| 189 |
+
image.device
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
# tokenize the input tags
|
| 193 |
+
tag_input_tokenzier = self.tokenizer(
|
| 194 |
+
tag_input,
|
| 195 |
+
padding="max_length",
|
| 196 |
+
truncation=True,
|
| 197 |
+
max_length=40,
|
| 198 |
+
return_tensors="pt",
|
| 199 |
+
).to(image.device)
|
| 200 |
+
encoder_input_ids = tag_input_tokenzier.input_ids
|
| 201 |
+
encoder_input_ids[:, 0] = self.tokenizer.enc_token_id
|
| 202 |
+
|
| 203 |
+
# feed the input tags into the image-tag interaction encoder to interact with the image embeddings
|
| 204 |
+
output_tagembedding = self.tag_encoder(
|
| 205 |
+
encoder_input_ids,
|
| 206 |
+
attention_mask=tag_input_tokenzier.attention_mask,
|
| 207 |
+
encoder_hidden_states=image_embeds,
|
| 208 |
+
encoder_attention_mask=image_atts,
|
| 209 |
+
return_dict=True,
|
| 210 |
+
)
|
| 211 |
+
|
| 212 |
+
# prompt trick for better captioning, following BLIP
|
| 213 |
+
prompt = [self.prompt] * image.size(0)
|
| 214 |
+
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(
|
| 215 |
+
image.device
|
| 216 |
+
)
|
| 217 |
+
input_ids[:, 0] = self.tokenizer.bos_token_id
|
| 218 |
+
input_ids = input_ids[:, :-1]
|
| 219 |
+
|
| 220 |
+
if sample:
|
| 221 |
+
# nucleus sampling
|
| 222 |
+
model_kwargs = {
|
| 223 |
+
"encoder_hidden_states": output_tagembedding.last_hidden_state,
|
| 224 |
+
"encoder_attention_mask": None,
|
| 225 |
+
}
|
| 226 |
+
outputs = self.text_decoder.generate(
|
| 227 |
+
input_ids=input_ids,
|
| 228 |
+
max_length=max_length,
|
| 229 |
+
min_length=min_length,
|
| 230 |
+
do_sample=True,
|
| 231 |
+
top_p=top_p,
|
| 232 |
+
num_return_sequences=1,
|
| 233 |
+
eos_token_id=self.tokenizer.sep_token_id,
|
| 234 |
+
pad_token_id=self.tokenizer.pad_token_id,
|
| 235 |
+
repetition_penalty=1.1,
|
| 236 |
+
**model_kwargs,
|
| 237 |
+
)
|
| 238 |
+
else:
|
| 239 |
+
# beam search (default)
|
| 240 |
+
model_kwargs = {
|
| 241 |
+
"encoder_hidden_states": output_tagembedding.last_hidden_state,
|
| 242 |
+
"encoder_attention_mask": None,
|
| 243 |
+
}
|
| 244 |
+
outputs = self.text_decoder.generate(
|
| 245 |
+
input_ids=input_ids,
|
| 246 |
+
max_length=max_length,
|
| 247 |
+
min_length=min_length,
|
| 248 |
+
num_beams=num_beams,
|
| 249 |
+
eos_token_id=self.tokenizer.sep_token_id,
|
| 250 |
+
pad_token_id=self.tokenizer.pad_token_id,
|
| 251 |
+
repetition_penalty=repetition_penalty,
|
| 252 |
+
**model_kwargs,
|
| 253 |
+
)
|
| 254 |
+
|
| 255 |
+
captions = []
|
| 256 |
+
for output in outputs:
|
| 257 |
+
caption = self.tokenizer.decode(output, skip_special_tokens=True)
|
| 258 |
+
captions.append(caption[len(self.prompt) :])
|
| 259 |
+
if return_tag_predict:
|
| 260 |
+
return captions, tag_output
|
| 261 |
+
return captions
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# load pretrained model parameters
|
| 265 |
+
def tag2text_caption(pretrained="", **kwargs):
|
| 266 |
+
model = Tag2Text_Caption(**kwargs)
|
| 267 |
+
if pretrained:
|
| 268 |
+
if kwargs["vit"] == "swin_b":
|
| 269 |
+
model, msg = load_checkpoint_swinbase(model, pretrained, kwargs)
|
| 270 |
+
else:
|
| 271 |
+
model, msg = load_checkpoint(model, pretrained)
|
| 272 |
+
print("vit:", kwargs["vit"])
|
| 273 |
+
print("msg", msg)
|
| 274 |
+
return model
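A hedged usage sketch (not in the diff); the checkpoint path is a placeholder and the preprocessing statistics are an assumption, not something this commit defines:

import torch
from PIL import Image
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    # illustrative ImageNet statistics; the repo's own inference code may use different values
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])
model = tag2text_caption(pretrained="path/to/tag2text_checkpoint.pth", image_size=384, vit="swin_b")
model.eval()
image = transform(Image.open("examples/horses.jpg").convert("RGB")).unsqueeze(0)
with torch.no_grad():
    captions, tags = model.generate(image, return_tag_predict=True)
print(tags[0], "->", captions[0])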
|
tag2text/models/utils.py
ADDED
|
@@ -0,0 +1,241 @@
| 1 |
+
import json
|
| 2 |
+
import math
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import List
|
| 6 |
+
from urllib.parse import urlparse
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
from models.swin_transformer import interpolate_relative_pos_embed
|
| 10 |
+
from models.vit import interpolate_pos_embed
|
| 11 |
+
from timm.models.hub import download_cached_file
|
| 12 |
+
from torch import nn
|
| 13 |
+
from transformers import BertTokenizer
|
| 14 |
+
|
| 15 |
+
CONFIG_PATH = Path(__file__).resolve().parents[1]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def read_json(rpath):
|
| 19 |
+
with open(rpath) as f:
|
| 20 |
+
return json.load(f)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def tie_encoder_decoder_weights(
|
| 24 |
+
encoder: nn.Module, decoder: nn.Module, base_model_prefix: str, skip_key: str
|
| 25 |
+
):
|
| 26 |
+
uninitialized_encoder_weights: List[str] = []
|
| 27 |
+
if decoder.__class__ != encoder.__class__:
|
| 28 |
+
print(  # note: no logger is defined in this module, so print the warning instead
|
| 29 |
+
f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
def tie_encoder_to_decoder_recursively(
|
| 33 |
+
decoder_pointer: nn.Module,
|
| 34 |
+
encoder_pointer: nn.Module,
|
| 35 |
+
module_name: str,
|
| 36 |
+
uninitialized_encoder_weights: List[str],
|
| 37 |
+
skip_key: str,
|
| 38 |
+
depth=0,
|
| 39 |
+
):
|
| 40 |
+
assert isinstance(decoder_pointer, nn.Module) and isinstance(
|
| 41 |
+
encoder_pointer, nn.Module
|
| 42 |
+
), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
|
| 43 |
+
if hasattr(decoder_pointer, "weight") and skip_key not in module_name:
|
| 44 |
+
assert hasattr(encoder_pointer, "weight")
|
| 45 |
+
encoder_pointer.weight = decoder_pointer.weight
|
| 46 |
+
if hasattr(decoder_pointer, "bias"):
|
| 47 |
+
assert hasattr(encoder_pointer, "bias")
|
| 48 |
+
encoder_pointer.bias = decoder_pointer.bias
|
| 49 |
+
print(module_name + " is tied")
|
| 50 |
+
return
|
| 51 |
+
|
| 52 |
+
encoder_modules = encoder_pointer._modules
|
| 53 |
+
decoder_modules = decoder_pointer._modules
|
| 54 |
+
if len(decoder_modules) > 0:
|
| 55 |
+
assert (
|
| 56 |
+
len(encoder_modules) > 0
|
| 57 |
+
), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
|
| 58 |
+
|
| 59 |
+
all_encoder_weights = {
|
| 60 |
+
module_name + "/" + sub_name for sub_name in encoder_modules.keys()
|
| 61 |
+
}
|
| 62 |
+
encoder_layer_pos = 0
|
| 63 |
+
for name, module in decoder_modules.items():
|
| 64 |
+
if name.isdigit():
|
| 65 |
+
encoder_name = str(int(name) + encoder_layer_pos)
|
| 66 |
+
decoder_name = name
|
| 67 |
+
if not isinstance(
|
| 68 |
+
decoder_modules[decoder_name],
|
| 69 |
+
type(encoder_modules[encoder_name]),
|
| 70 |
+
) and len(encoder_modules) != len(decoder_modules):
|
| 71 |
+
# this can happen if the name corresponds to the position in a ModuleList of layers
|
| 72 |
+
# in this case the decoder has added a cross-attention that the encoder does not have
|
| 73 |
+
# thus skip this step and subtract one layer pos from encoder
|
| 74 |
+
encoder_layer_pos -= 1
|
| 75 |
+
continue
|
| 76 |
+
elif name not in encoder_modules:
|
| 77 |
+
continue
|
| 78 |
+
elif depth > 500:
|
| 79 |
+
raise ValueError(
|
| 80 |
+
"Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model."
|
| 81 |
+
)
|
| 82 |
+
else:
|
| 83 |
+
decoder_name = encoder_name = name
|
| 84 |
+
tie_encoder_to_decoder_recursively(
|
| 85 |
+
decoder_modules[decoder_name],
|
| 86 |
+
encoder_modules[encoder_name],
|
| 87 |
+
module_name + "/" + name,
|
| 88 |
+
uninitialized_encoder_weights,
|
| 89 |
+
skip_key,
|
| 90 |
+
depth=depth + 1,
|
| 91 |
+
)
|
| 92 |
+
all_encoder_weights.remove(module_name + "/" + encoder_name)
|
| 93 |
+
|
| 94 |
+
uninitialized_encoder_weights += list(all_encoder_weights)
|
| 95 |
+
|
| 96 |
+
# tie weights recursively
|
| 97 |
+
tie_encoder_to_decoder_recursively(
|
| 98 |
+
decoder, encoder, base_model_prefix, uninitialized_encoder_weights, skip_key
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class GroupWiseLinear(nn.Module):
|
| 103 |
+
# could be changed to:
|
| 104 |
+
# output = torch.einsum('ijk,zjk->ij', x, self.W)
|
| 105 |
+
# or output = torch.einsum('ijk,jk->ij', x, self.W[0])
|
| 106 |
+
def __init__(self, num_class, hidden_dim, bias=True):
|
| 107 |
+
super().__init__()
|
| 108 |
+
self.num_class = num_class
|
| 109 |
+
self.hidden_dim = hidden_dim
|
| 110 |
+
self.bias = bias
|
| 111 |
+
|
| 112 |
+
self.W = nn.Parameter(torch.Tensor(1, num_class, hidden_dim))
|
| 113 |
+
if bias:
|
| 114 |
+
self.b = nn.Parameter(torch.Tensor(1, num_class))
|
| 115 |
+
self.reset_parameters()
|
| 116 |
+
|
| 117 |
+
def reset_parameters(self):
|
| 118 |
+
stdv = 1.0 / math.sqrt(self.W.size(2))
|
| 119 |
+
for i in range(self.num_class):
|
| 120 |
+
self.W[0][i].data.uniform_(-stdv, stdv)
|
| 121 |
+
if self.bias:
|
| 122 |
+
for i in range(self.num_class):
|
| 123 |
+
self.b[0][i].data.uniform_(-stdv, stdv)
|
| 124 |
+
|
| 125 |
+
def forward(self, x):
|
| 126 |
+
# x: B,K,d
|
| 127 |
+
x = (self.W * x).sum(-1)
|
| 128 |
+
if self.bias:
|
| 129 |
+
x = x + self.b
|
| 130 |
+
return x
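A quick check (not in the diff) of the einsum equivalence mentioned in the class comment above, with illustrative sizes:

import torch

layer = GroupWiseLinear(num_class=5, hidden_dim=8, bias=True)
x = torch.randn(2, 5, 8)  # B, K, d
out_forward = layer(x)  # (W * x).sum(-1) + b
out_einsum = torch.einsum("ijk,zjk->ij", x, layer.W) + layer.b
print(torch.allclose(out_forward, out_einsum))  # True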
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def init_tokenizer():
|
| 134 |
+
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
| 135 |
+
tokenizer.add_special_tokens({"bos_token": "[DEC]"})
|
| 136 |
+
tokenizer.add_special_tokens({"additional_special_tokens": ["[ENC]"]})
|
| 137 |
+
tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0]
|
| 138 |
+
return tokenizer
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def create_vit(
|
| 142 |
+
vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop_path_rate=0
|
| 143 |
+
):
|
| 144 |
+
assert vit in ["base", "large"], "vit parameter must be base or large"
|
| 145 |
+
if vit == "base":
|
| 146 |
+
vision_width = 768
|
| 147 |
+
visual_encoder = VisionTransformer(
|
| 148 |
+
img_size=image_size,
|
| 149 |
+
patch_size=16,
|
| 150 |
+
embed_dim=vision_width,
|
| 151 |
+
depth=12,
|
| 152 |
+
num_heads=12,
|
| 153 |
+
use_grad_checkpointing=use_grad_checkpointing,
|
| 154 |
+
ckpt_layer=ckpt_layer,
|
| 155 |
+
drop_path_rate=0 or drop_path_rate,
|
| 156 |
+
)
|
| 157 |
+
elif vit == "large":
|
| 158 |
+
vision_width = 1024
|
| 159 |
+
visual_encoder = VisionTransformer(
|
| 160 |
+
img_size=image_size,
|
| 161 |
+
patch_size=16,
|
| 162 |
+
embed_dim=vision_width,
|
| 163 |
+
depth=24,
|
| 164 |
+
num_heads=16,
|
| 165 |
+
use_grad_checkpointing=use_grad_checkpointing,
|
| 166 |
+
ckpt_layer=ckpt_layer,
|
| 167 |
+
drop_path_rate=0.1 or drop_path_rate,
|
| 168 |
+
)
|
| 169 |
+
return visual_encoder, vision_width
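A hedged usage sketch (not in the diff): create_vit returns the encoder together with its width so callers can size the text-side modules to match; the input size is an illustrative assumption:

import torch

encoder, width = create_vit("base", image_size=224)
print(width)  # 768
print(encoder(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 197, 768]) -> 196 patches + 1 cls token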
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def is_url(url_or_filename):
|
| 173 |
+
parsed = urlparse(url_or_filename)
|
| 174 |
+
return parsed.scheme in ("http", "https")
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def load_checkpoint(model, url_or_filename):
|
| 178 |
+
if is_url(url_or_filename):
|
| 179 |
+
cached_file = download_cached_file(
|
| 180 |
+
url_or_filename, check_hash=False, progress=True
|
| 181 |
+
)
|
| 182 |
+
checkpoint = torch.load(cached_file, map_location="cpu")
|
| 183 |
+
elif os.path.isfile(url_or_filename):
|
| 184 |
+
checkpoint = torch.load(url_or_filename, map_location="cpu")
|
| 185 |
+
else:
|
| 186 |
+
raise RuntimeError("checkpoint url or path is invalid")
|
| 187 |
+
|
| 188 |
+
state_dict = checkpoint["model"]
|
| 189 |
+
|
| 190 |
+
state_dict["visual_encoder.pos_embed"] = interpolate_pos_embed(
|
| 191 |
+
state_dict["visual_encoder.pos_embed"], model.visual_encoder
|
| 192 |
+
)
|
| 193 |
+
if "visual_encoder_m.pos_embed" in model.state_dict().keys():
|
| 194 |
+
state_dict["visual_encoder_m.pos_embed"] = interpolate_pos_embed(
|
| 195 |
+
state_dict["visual_encoder_m.pos_embed"], model.visual_encoder_m
|
| 196 |
+
)
|
| 197 |
+
for key in model.state_dict().keys():
|
| 198 |
+
if key in state_dict.keys():
|
| 199 |
+
if state_dict[key].shape != model.state_dict()[key].shape:
|
| 200 |
+
del state_dict[key]
|
| 201 |
+
|
| 202 |
+
msg = model.load_state_dict(state_dict, strict=False)
|
| 203 |
+
print("load checkpoint from %s" % url_or_filename)
|
| 204 |
+
return model, msg
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def load_checkpoint_swinbase(model, url_or_filename, kwargs):
|
| 208 |
+
if kwargs["image_size"] == 224:
|
| 209 |
+
vision_config_path = f"{CONFIG_PATH}/configs/swin/config_swinB_224.json"
|
| 210 |
+
elif kwargs["image_size"] == 384:
|
| 211 |
+
vision_config_path = f"{CONFIG_PATH}/configs/swin/config_swinB_384.json"
|
| 212 |
+
window_size = read_json(vision_config_path)["window_size"]
|
| 213 |
+
print("--------------")
|
| 214 |
+
print(url_or_filename)
|
| 215 |
+
print("--------------")
|
| 216 |
+
if is_url(url_or_filename):
|
| 217 |
+
cached_file = download_cached_file(
|
| 218 |
+
url_or_filename, check_hash=False, progress=True
|
| 219 |
+
)
|
| 220 |
+
checkpoint = torch.load(cached_file, map_location="cpu")
|
| 221 |
+
elif os.path.isfile(url_or_filename):
|
| 222 |
+
checkpoint = torch.load(url_or_filename, map_location="cpu")
|
| 223 |
+
else:
|
| 224 |
+
raise RuntimeError("checkpoint url or path is invalid")
|
| 225 |
+
|
| 226 |
+
state_dict = checkpoint["model"]
|
| 227 |
+
|
| 228 |
+
for k in list(state_dict.keys()):
|
| 229 |
+
if "relative_position_bias_table" in k:
|
| 230 |
+
dst_num_pos = (2 * window_size - 1) ** 2
|
| 231 |
+
state_dict[k] = interpolate_relative_pos_embed(
|
| 232 |
+
state_dict[k], dst_num_pos, param_name=k
|
| 233 |
+
)
|
| 234 |
+
elif ("relative_position_index" in k) or ("attn_mask" in k):
|
| 235 |
+
del state_dict[k]
|
| 236 |
+
elif "vision_multi" in k:
|
| 237 |
+
state_dict[k.replace("vision_multi", "tagging_head")] = state_dict.pop(k)
|
| 238 |
+
|
| 239 |
+
msg = model.load_state_dict(state_dict, strict=False)
|
| 240 |
+
print("load checkpoint from %s" % url_or_filename)
|
| 241 |
+
return model, msg
|
tag2text/models/vit.py
ADDED
|
@@ -0,0 +1,430 @@
| 1 |
+
"""
|
| 2 |
+
* Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
* All rights reserved.
|
| 4 |
+
* SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
* By Junnan Li
|
| 7 |
+
* Based on timm code base
|
| 8 |
+
* https://github.com/rwightman/pytorch-image-models/tree/master/timm
|
| 9 |
+
"""
|
| 10 |
+
from functools import partial
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torch.nn as nn
|
| 14 |
+
import torch.nn.functional as F
|
| 15 |
+
from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper
|
| 16 |
+
from timm.models.helpers import adapt_input_conv
|
| 17 |
+
from timm.models.helpers import named_apply
|
| 18 |
+
from timm.models.layers import DropPath
|
| 19 |
+
from timm.models.layers import trunc_normal_
|
| 20 |
+
from timm.models.registry import register_model
|
| 21 |
+
from timm.models.vision_transformer import _cfg
|
| 22 |
+
from timm.models.vision_transformer import PatchEmbed
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class Mlp(nn.Module):
|
| 26 |
+
"""MLP as used in Vision Transformer, MLP-Mixer and related networks."""
|
| 27 |
+
|
| 28 |
+
def __init__(
|
| 29 |
+
self,
|
| 30 |
+
in_features,
|
| 31 |
+
hidden_features=None,
|
| 32 |
+
out_features=None,
|
| 33 |
+
act_layer=nn.GELU,
|
| 34 |
+
drop=0.0,
|
| 35 |
+
):
|
| 36 |
+
super().__init__()
|
| 37 |
+
out_features = out_features or in_features
|
| 38 |
+
hidden_features = hidden_features or in_features
|
| 39 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
| 40 |
+
self.act = act_layer()
|
| 41 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
| 42 |
+
self.drop = nn.Dropout(drop)
|
| 43 |
+
|
| 44 |
+
def forward(self, x):
|
| 45 |
+
x = self.fc1(x)
|
| 46 |
+
x = self.act(x)
|
| 47 |
+
x = self.drop(x)
|
| 48 |
+
x = self.fc2(x)
|
| 49 |
+
x = self.drop(x)
|
| 50 |
+
return x
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class Attention(nn.Module):
|
| 54 |
+
def __init__(
|
| 55 |
+
self,
|
| 56 |
+
dim,
|
| 57 |
+
num_heads=8,
|
| 58 |
+
qkv_bias=False,
|
| 59 |
+
qk_scale=None,
|
| 60 |
+
attn_drop=0.0,
|
| 61 |
+
proj_drop=0.0,
|
| 62 |
+
):
|
| 63 |
+
super().__init__()
|
| 64 |
+
self.num_heads = num_heads
|
| 65 |
+
head_dim = dim // num_heads
|
| 66 |
+
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
|
| 67 |
+
self.scale = qk_scale or head_dim**-0.5
|
| 68 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 69 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
| 70 |
+
self.proj = nn.Linear(dim, dim)
|
| 71 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
| 72 |
+
self.attn_gradients = None
|
| 73 |
+
self.attention_map = None
|
| 74 |
+
|
| 75 |
+
def save_attn_gradients(self, attn_gradients):
|
| 76 |
+
self.attn_gradients = attn_gradients
|
| 77 |
+
|
| 78 |
+
def get_attn_gradients(self):
|
| 79 |
+
return self.attn_gradients
|
| 80 |
+
|
| 81 |
+
def save_attention_map(self, attention_map):
|
| 82 |
+
self.attention_map = attention_map
|
| 83 |
+
|
| 84 |
+
def get_attention_map(self):
|
| 85 |
+
return self.attention_map
|
| 86 |
+
|
| 87 |
+
def forward(self, x, register_hook=False):
|
| 88 |
+
B, N, C = x.shape
|
| 89 |
+
qkv = (
|
| 90 |
+
self.qkv(x)
|
| 91 |
+
.reshape(B, N, 3, self.num_heads, C // self.num_heads)
|
| 92 |
+
.permute(2, 0, 3, 1, 4)
|
| 93 |
+
)
|
| 94 |
+
q, k, v = (
|
| 95 |
+
qkv[0],
|
| 96 |
+
qkv[1],
|
| 97 |
+
qkv[2],
|
| 98 |
+
) # make torchscript happy (cannot use tensor as tuple)
|
| 99 |
+
|
| 100 |
+
attn = (q @ k.transpose(-2, -1)) * self.scale
|
| 101 |
+
attn = attn.softmax(dim=-1)
|
| 102 |
+
attn = self.attn_drop(attn)
|
| 103 |
+
|
| 104 |
+
if register_hook:
|
| 105 |
+
self.save_attention_map(attn)
|
| 106 |
+
attn.register_hook(self.save_attn_gradients)
|
| 107 |
+
|
| 108 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
| 109 |
+
x = self.proj(x)
|
| 110 |
+
x = self.proj_drop(x)
|
| 111 |
+
return x
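A hedged shape check (not in the diff) for the attention forward above; dimensions are illustrative ViT-Base values:

import torch

attn = Attention(dim=768, num_heads=12, qkv_bias=True)
x = torch.randn(2, 197, 768)  # B, N, C (196 patches + 1 cls token)
print(attn(x).shape)  # torch.Size([2, 197, 768]); per-head dim = 768 // 12 = 64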
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class Block(nn.Module):
|
| 115 |
+
def __init__(
|
| 116 |
+
self,
|
| 117 |
+
dim,
|
| 118 |
+
num_heads,
|
| 119 |
+
mlp_ratio=4.0,
|
| 120 |
+
qkv_bias=False,
|
| 121 |
+
qk_scale=None,
|
| 122 |
+
drop=0.0,
|
| 123 |
+
attn_drop=0.0,
|
| 124 |
+
drop_path=0.0,
|
| 125 |
+
act_layer=nn.GELU,
|
| 126 |
+
norm_layer=nn.LayerNorm,
|
| 127 |
+
use_grad_checkpointing=False,
|
| 128 |
+
):
|
| 129 |
+
super().__init__()
|
| 130 |
+
self.norm1 = norm_layer(dim)
|
| 131 |
+
self.attn = Attention(
|
| 132 |
+
dim,
|
| 133 |
+
num_heads=num_heads,
|
| 134 |
+
qkv_bias=qkv_bias,
|
| 135 |
+
qk_scale=qk_scale,
|
| 136 |
+
attn_drop=attn_drop,
|
| 137 |
+
proj_drop=drop,
|
| 138 |
+
)
|
| 139 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
| 140 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
| 141 |
+
self.norm2 = norm_layer(dim)
|
| 142 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
| 143 |
+
self.mlp = Mlp(
|
| 144 |
+
in_features=dim,
|
| 145 |
+
hidden_features=mlp_hidden_dim,
|
| 146 |
+
act_layer=act_layer,
|
| 147 |
+
drop=drop,
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
if use_grad_checkpointing:
|
| 151 |
+
self.attn = checkpoint_wrapper(self.attn)
|
| 152 |
+
self.mlp = checkpoint_wrapper(self.mlp)
|
| 153 |
+
|
| 154 |
+
def forward(self, x, register_hook=False):
|
| 155 |
+
x = x + self.drop_path(self.attn(self.norm1(x), register_hook=register_hook))
|
| 156 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
| 157 |
+
return x
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
class VisionTransformer(nn.Module):
|
| 161 |
+
"""Vision Transformer
|
| 162 |
+
A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
|
| 163 |
+
https://arxiv.org/abs/2010.11929
|
| 164 |
+
"""
|
| 165 |
+
|
| 166 |
+
def __init__(
|
| 167 |
+
self,
|
| 168 |
+
img_size=224,
|
| 169 |
+
patch_size=16,
|
| 170 |
+
in_chans=3,
|
| 171 |
+
num_classes=1000,
|
| 172 |
+
embed_dim=768,
|
| 173 |
+
depth=12,
|
| 174 |
+
num_heads=12,
|
| 175 |
+
mlp_ratio=4.0,
|
| 176 |
+
qkv_bias=True,
|
| 177 |
+
qk_scale=None,
|
| 178 |
+
representation_size=None,
|
| 179 |
+
drop_rate=0.0,
|
| 180 |
+
attn_drop_rate=0.0,
|
| 181 |
+
drop_path_rate=0.0,
|
| 182 |
+
norm_layer=None,
|
| 183 |
+
use_grad_checkpointing=False,
|
| 184 |
+
ckpt_layer=0,
|
| 185 |
+
):
|
| 186 |
+
"""
|
| 187 |
+
Args:
|
| 188 |
+
img_size (int, tuple): input image size
|
| 189 |
+
patch_size (int, tuple): patch size
|
| 190 |
+
in_chans (int): number of input channels
|
| 191 |
+
num_classes (int): number of classes for classification head
|
| 192 |
+
embed_dim (int): embedding dimension
|
| 193 |
+
depth (int): depth of transformer
|
| 194 |
+
num_heads (int): number of attention heads
|
| 195 |
+
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
|
| 196 |
+
qkv_bias (bool): enable bias for qkv if True
|
| 197 |
+
qk_scale (float): override default qk scale of head_dim ** -0.5 if set
|
| 198 |
+
representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
|
| 199 |
+
drop_rate (float): dropout rate
|
| 200 |
+
attn_drop_rate (float): attention dropout rate
|
| 201 |
+
drop_path_rate (float): stochastic depth rate
|
| 202 |
+
norm_layer: (nn.Module): normalization layer
|
| 203 |
+
"""
|
| 204 |
+
super().__init__()
|
| 205 |
+
self.num_features = (
|
| 206 |
+
self.embed_dim
|
| 207 |
+
) = embed_dim # num_features for consistency with other models
|
| 208 |
+
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
|
| 209 |
+
|
| 210 |
+
self.patch_embed = PatchEmbed(
|
| 211 |
+
img_size=img_size,
|
| 212 |
+
patch_size=patch_size,
|
| 213 |
+
in_chans=in_chans,
|
| 214 |
+
embed_dim=embed_dim,
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
num_patches = self.patch_embed.num_patches
|
| 218 |
+
|
| 219 |
+
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
| 220 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
|
| 221 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
| 222 |
+
|
| 223 |
+
dpr = [
|
| 224 |
+
x.item() for x in torch.linspace(0, drop_path_rate, depth)
|
| 225 |
+
] # stochastic depth decay rule
|
| 226 |
+
self.blocks = nn.ModuleList(
|
| 227 |
+
[
|
| 228 |
+
Block(
|
| 229 |
+
dim=embed_dim,
|
| 230 |
+
num_heads=num_heads,
|
| 231 |
+
mlp_ratio=mlp_ratio,
|
| 232 |
+
qkv_bias=qkv_bias,
|
| 233 |
+
qk_scale=qk_scale,
|
| 234 |
+
drop=drop_rate,
|
| 235 |
+
attn_drop=attn_drop_rate,
|
| 236 |
+
drop_path=dpr[i],
|
| 237 |
+
norm_layer=norm_layer,
|
| 238 |
+
use_grad_checkpointing=(
|
| 239 |
+
use_grad_checkpointing and i >= depth - ckpt_layer
|
| 240 |
+
),
|
| 241 |
+
)
|
| 242 |
+
for i in range(depth)
|
| 243 |
+
]
|
| 244 |
+
)
|
| 245 |
+
self.norm = norm_layer(embed_dim)
|
| 246 |
+
|
| 247 |
+
trunc_normal_(self.pos_embed, std=0.02)
|
| 248 |
+
trunc_normal_(self.cls_token, std=0.02)
|
| 249 |
+
self.apply(self._init_weights)
|
| 250 |
+
|
| 251 |
+
def _init_weights(self, m):
|
| 252 |
+
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {"pos_embed", "cls_token"}

    def forward(self, x, register_blk=-1):
        B = x.shape[0]
        x = self.patch_embed(x)

        cls_tokens = self.cls_token.expand(
            B, -1, -1
        )  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)

        x = x + self.pos_embed[:, : x.size(1), :]
        x = self.pos_drop(x)

        for i, blk in enumerate(self.blocks):
            x = blk(x, register_blk == i)
        x = self.norm(x)

        return x

    @torch.jit.ignore()
    def load_pretrained(self, checkpoint_path, prefix=""):
        _load_weights(self, checkpoint_path, prefix)


@torch.no_grad()
def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ""):
    """Load weights from .npz checkpoints for official Google Brain Flax implementation."""
    import numpy as np

    def _n2p(w, t=True):
        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
            w = w.flatten()
        if t:
            if w.ndim == 4:
                w = w.transpose([3, 2, 0, 1])
            elif w.ndim == 3:
                w = w.transpose([2, 0, 1])
            elif w.ndim == 2:
                w = w.transpose([1, 0])
        return torch.from_numpy(w)

    w = np.load(checkpoint_path)
    if not prefix and "opt/target/embedding/kernel" in w:
        prefix = "opt/target/"

    if hasattr(model.patch_embed, "backbone"):
        # hybrid
        backbone = model.patch_embed.backbone
        stem_only = not hasattr(backbone, "stem")
        stem = backbone if stem_only else backbone.stem
        stem.conv.weight.copy_(
            adapt_input_conv(
                stem.conv.weight.shape[1], _n2p(w[f"{prefix}conv_root/kernel"])
            )
        )
        stem.norm.weight.copy_(_n2p(w[f"{prefix}gn_root/scale"]))
        stem.norm.bias.copy_(_n2p(w[f"{prefix}gn_root/bias"]))
        if not stem_only:
            for i, stage in enumerate(backbone.stages):
                for j, block in enumerate(stage.blocks):
                    bp = f"{prefix}block{i + 1}/unit{j + 1}/"
                    for r in range(3):
                        getattr(block, f"conv{r + 1}").weight.copy_(
                            _n2p(w[f"{bp}conv{r + 1}/kernel"])
                        )
                        getattr(block, f"norm{r + 1}").weight.copy_(
                            _n2p(w[f"{bp}gn{r + 1}/scale"])
                        )
                        getattr(block, f"norm{r + 1}").bias.copy_(
                            _n2p(w[f"{bp}gn{r + 1}/bias"])
                        )
                    if block.downsample is not None:
                        block.downsample.conv.weight.copy_(
                            _n2p(w[f"{bp}conv_proj/kernel"])
                        )
                        block.downsample.norm.weight.copy_(
                            _n2p(w[f"{bp}gn_proj/scale"])
                        )
                        block.downsample.norm.bias.copy_(_n2p(w[f"{bp}gn_proj/bias"]))
        embed_conv_w = _n2p(w[f"{prefix}embedding/kernel"])
    else:
        embed_conv_w = adapt_input_conv(
            model.patch_embed.proj.weight.shape[1], _n2p(w[f"{prefix}embedding/kernel"])
        )
    model.patch_embed.proj.weight.copy_(embed_conv_w)
    model.patch_embed.proj.bias.copy_(_n2p(w[f"{prefix}embedding/bias"]))
    model.cls_token.copy_(_n2p(w[f"{prefix}cls"], t=False))
    pos_embed_w = _n2p(w[f"{prefix}Transformer/posembed_input/pos_embedding"], t=False)
    if pos_embed_w.shape != model.pos_embed.shape:
        pos_embed_w = resize_pos_embed(  # resize pos embedding when different size from pretrained weights
            pos_embed_w,
            model.pos_embed,
            getattr(model, "num_tokens", 1),
            model.patch_embed.grid_size,
        )
    model.pos_embed.copy_(pos_embed_w)
    model.norm.weight.copy_(_n2p(w[f"{prefix}Transformer/encoder_norm/scale"]))
    model.norm.bias.copy_(_n2p(w[f"{prefix}Transformer/encoder_norm/bias"]))
    # if isinstance(model.head, nn.Linear) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
    #     model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
    #     model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
    # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
    #     model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
    #     model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
    for i, block in enumerate(model.blocks.children()):
        block_prefix = f"{prefix}Transformer/encoderblock_{i}/"
        mha_prefix = block_prefix + "MultiHeadDotProductAttention_1/"
        block.norm1.weight.copy_(_n2p(w[f"{block_prefix}LayerNorm_0/scale"]))
        block.norm1.bias.copy_(_n2p(w[f"{block_prefix}LayerNorm_0/bias"]))
        block.attn.qkv.weight.copy_(
            torch.cat(
                [
                    _n2p(w[f"{mha_prefix}{n}/kernel"], t=False).flatten(1).T
                    for n in ("query", "key", "value")
                ]
            )
        )
        block.attn.qkv.bias.copy_(
            torch.cat(
                [
                    _n2p(w[f"{mha_prefix}{n}/bias"], t=False).reshape(-1)
                    for n in ("query", "key", "value")
                ]
            )
        )
        block.attn.proj.weight.copy_(_n2p(w[f"{mha_prefix}out/kernel"]).flatten(1))
        block.attn.proj.bias.copy_(_n2p(w[f"{mha_prefix}out/bias"]))
        for r in range(2):
            getattr(block.mlp, f"fc{r + 1}").weight.copy_(
                _n2p(w[f"{block_prefix}MlpBlock_3/Dense_{r}/kernel"])
            )
            getattr(block.mlp, f"fc{r + 1}").bias.copy_(
                _n2p(w[f"{block_prefix}MlpBlock_3/Dense_{r}/bias"])
            )
        block.norm2.weight.copy_(_n2p(w[f"{block_prefix}LayerNorm_2/scale"]))
        block.norm2.bias.copy_(_n2p(w[f"{block_prefix}LayerNorm_2/bias"]))


def interpolate_pos_embed(pos_embed_checkpoint, visual_encoder):
    # interpolate position embedding
    embedding_size = pos_embed_checkpoint.shape[-1]
    num_patches = visual_encoder.patch_embed.num_patches
    num_extra_tokens = visual_encoder.pos_embed.shape[-2] - num_patches
    # height (== width) for the checkpoint position embedding
    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
    # height (== width) for the new position embedding
    new_size = int(num_patches**0.5)

    if orig_size != new_size:
        # class_token and dist_token are kept unchanged
        extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
        # only the position tokens are interpolated
        pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
        pos_tokens = pos_tokens.reshape(
            -1, orig_size, orig_size, embedding_size
        ).permute(0, 3, 1, 2)
        pos_tokens = torch.nn.functional.interpolate(
            pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False
        )
        pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
        new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
        print(
            "reshape position embedding from %d to %d" % (orig_size**2, new_size**2)
        )

        return new_pos_embed
    else:
        return pos_embed_checkpoint
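The file closes with interpolate_pos_embed, which bicubically resizes a checkpoint's patch position embeddings when the target encoder was built for a different input resolution, keeping the class/distillation tokens unchanged. Below is a minimal self-contained sketch of its behaviour; the SimpleNamespace stand-in and the 224px-to-384px shapes are illustrative only and are not part of the committed files.

# Illustrative sketch (not part of the committed files): resizing a 224x224
# checkpoint pos_embed to match an encoder built for 384x384 input.
import types

import torch

# Fake checkpoint pos_embed for a 224x224, patch-16 ViT: 1 cls token + 14*14 patches, dim 768.
ckpt_pos_embed = torch.randn(1, 1 + 14 * 14, 768)

# Stand-in for a visual encoder built for 384x384 input: 1 cls token + 24*24 patches.
visual_encoder = types.SimpleNamespace(
    patch_embed=types.SimpleNamespace(num_patches=24 * 24),
    pos_embed=torch.zeros(1, 1 + 24 * 24, 768),
)

new_pos_embed = interpolate_pos_embed(ckpt_pos_embed, visual_encoder)
print(new_pos_embed.shape)  # torch.Size([1, 577, 768])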
tag2text/requirements.txt
ADDED
@@ -0,0 +1,8 @@
timm==0.4.12
transformers==4.15.0
fairscale==0.4.4
pycocoevalcap
torch
torchvision
Pillow
scipy
utils.py
ADDED
@@ -0,0 +1,263 @@
import random
import sys
from typing import Dict
from typing import List

import numpy as np
import supervision as sv
import torch
import torchvision
import torchvision.transforms as T
from groundingdino.models import build_model
from groundingdino.util.inference import Model as DinoModel
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
from huggingface_hub import hf_hub_download
from PIL import Image
from segment_anything import SamPredictor

# segment anything

sys.path.append("tag2text")

from tag2text.inference import inference as tag2text_inference


def load_model_hf(repo_id, filename, ckpt_config_filename, device="cpu"):
    cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)

    args = SLConfig.fromfile(cache_config_file)
    args.device = device
    model = build_model(args)

    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
    checkpoint = torch.load(cache_file, map_location=device)
    model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    model.eval()
    return model


def download_file_hf(repo_id, filename, cache_dir="./cache"):
    cache_file = hf_hub_download(
        repo_id=repo_id, filename=filename, force_filename=filename, cache_dir=cache_dir
    )
    return cache_file


def transform_image_tag2text(image_pil: Image) -> torch.Tensor:
    transform = T.Compose(
        [
            T.Resize((384, 384)),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image = transform(image_pil)  # 3, h, w
    return image


def show_anns_sam(anns: List[Dict]):
    """Extracts the mask annotations from the Segment Anything model output and plots them.
    https://github.com/facebookresearch/segment-anything.

    Arguments:
        anns (List[Dict]): Segment Anything model output.

    Returns:
        (np.ndarray): Masked image.
        (np.ndarray): annotation encoding from https://github.com/LUSSeg/ImageNet-S
    """
    if len(anns) == 0:
        return
    sorted_anns = sorted(anns, key=(lambda x: x["area"]), reverse=True)
    full_img = None

    # for ann in sorted_anns:
    for i in range(len(sorted_anns)):
        ann = anns[i]
        m = ann["segmentation"]
        if full_img is None:
            full_img = np.zeros((m.shape[0], m.shape[1], 3))
            map = np.zeros((m.shape[0], m.shape[1]), dtype=np.uint16)
        map[m != 0] = i + 1
        color_mask = np.random.random((1, 3)).tolist()[0]
        full_img[m != 0] = color_mask
    full_img = full_img * 255

    # anno encoding from https://github.com/LUSSeg/ImageNet-S
    res = np.zeros((map.shape[0], map.shape[1], 3))
    res[:, :, 0] = map % 256
    res[:, :, 1] = map // 256
    res.astype(np.float32)
    full_img = np.uint8(full_img)
    return full_img, res


def show_anns_sv(detections: sv.Detections):
    """Extracts the mask annotations from the Supervision Detections object.
    https://roboflow.github.io/supervision/detection/core/.

    Arguments:
        detections (sv.Detections): Containing information about the detections.

    Returns:
        (np.ndarray): Masked image.
        (np.ndarray): annotation encoding from https://github.com/LUSSeg/ImageNet-S
    """
    if detections.mask is None:
        return
    full_img = None

    for i in np.flip(np.argsort(detections.area)):
        m = detections.mask[i]
        if full_img is None:
            full_img = np.zeros((m.shape[0], m.shape[1], 3))
            map = np.zeros((m.shape[0], m.shape[1]), dtype=np.uint16)
        map[m != 0] = i + 1
        color_mask = np.random.random((1, 3)).tolist()[0]
        full_img[m != 0] = color_mask
    full_img = full_img * 255

    # anno encoding from https://github.com/LUSSeg/ImageNet-S
    res = np.zeros((map.shape[0], map.shape[1], 3))
    res[:, :, 0] = map % 256
    res[:, :, 1] = map // 256
    res.astype(np.float32)
    full_img = np.uint8(full_img)
    return full_img, res


def generate_tags(tag2text_model, image, specified_tags, device="cpu"):
    """Generate image tags and caption using Tag2Text model.

    Arguments:
        tag2text_model (nn.Module): Tag2Text model to use for prediction.
        image (np.ndarray): The image for calculating. Expects an
            image in HWC uint8 format, with pixel values in [0, 255].
        specified_tags (str): User-specified input tags

    Returns:
        (List[str]): Predicted image tags.
        (str): Predicted image caption
    """
    image = transform_image_tag2text(image).unsqueeze(0).to(device)
    res = tag2text_inference(image, tag2text_model, specified_tags)
    tags = res[0].split(" | ")
    caption = res[2]
    return tags, caption


def detect(
    grounding_dino_model: DinoModel,
    image: np.ndarray,
    caption: str,
    box_threshold: float = 0.3,
    text_threshold: float = 0.25,
    iou_threshold: float = 0.5,
    post_process: bool = True,
):
    """Detect bounding boxes for the given image, using the input caption.

    Arguments:
        grounding_dino_model (DinoModel): The model to use for detection.
        image (np.ndarray): The image for calculating masks. Expects an
            image in HWC uint8 format, with pixel values in [0, 255].
        caption (str): Input caption containing the object names to detect. To detect multiple objects, separate each name with '.', like this: cat . dog . chair
        box_threshold (float): Box confidence threshold
        text_threshold (float): Text confidence threshold
        iou_threshold (float): IOU score threshold for post-processing
        post_process (bool): If True, run the NMS algorithm to remove duplicate segments.

    Returns:
        (sv.Detections): Containing information about the detections in the image.
        (str): Predicted phrases.
        (List[str]): Predicted classes.
    """
    detections, phrases = grounding_dino_model.predict_with_caption(
        image=image,
        caption=caption,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
    )
    classes = list(map(lambda x: x.strip(), caption.split(".")))
    detections.class_id = DinoModel.phrases2classes(phrases=phrases, classes=classes)

    # NMS post process
    if post_process:
        # print(f"Before NMS: {len(detections.xyxy)} boxes")
        nms_idx = (
            torchvision.ops.nms(
                torch.from_numpy(detections.xyxy),
                torch.from_numpy(detections.confidence),
                iou_threshold,
            )
            .numpy()
            .tolist()
        )

        phrases = [phrases[idx] for idx in nms_idx]
        detections.xyxy = detections.xyxy[nms_idx]
        detections.confidence = detections.confidence[nms_idx]
        detections.class_id = detections.class_id[nms_idx]

        # print(f"After NMS: {len(detections.xyxy)} boxes")

    return detections, phrases, classes


def segment(sam_model: SamPredictor, image: np.ndarray, boxes: np.ndarray):
    """Predict masks for the given input boxes, using the currently set image.

    Arguments:
        sam_model (SamPredictor): The model to use for mask prediction.
        image (np.ndarray): The image for calculating masks. Expects an
            image in HWC uint8 format, with pixel values in [0, 255].
        boxes (np.ndarray or None): A Bx4 array given a box prompt to the
            model, in XYXY format.
        return_logits (bool): If true, returns un-thresholded masks logits
            instead of a binary mask.

    Returns:
        (torch.Tensor): The output masks in BxCxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
        (torch.Tensor): An array of shape BxC containing the model's
            predictions for the quality of each mask.
        (torch.Tensor): An array of shape BxCxHxW, where C is the number
            of masks and H=W=256. These low res logits can be passed to
            a subsequent iteration as mask input.
    """
    sam_model.set_image(image)
    transformed_boxes = None
    if boxes is not None:
        boxes = torch.from_numpy(boxes)

        transformed_boxes = sam_model.transform.apply_boxes_torch(
            boxes.to(sam_model.device), image.shape[:2]
        )

    masks, scores, _ = sam_model.predict_torch(
        point_coords=None,
        point_labels=None,
        boxes=transformed_boxes,
        multimask_output=False,
    )
    masks = masks[:, 0, :, :]
    scores = scores[:, 0]
    return masks.cpu().numpy(), scores.cpu().numpy()


def draw_mask(mask, draw, random_color=False):
    if random_color:
        color = (
            random.randint(0, 255),
            random.randint(0, 255),
            random.randint(0, 255),
            153,
        )
    else:
        color = (30, 144, 255, 153)

    nonzero_coords = np.transpose(np.nonzero(mask))

    for coord in nonzero_coords:
        draw.point(coord[::-1], fill=color)
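Taken together, these helpers make up the annotate-anything pipeline: Tag2Text proposes tags and a caption, Grounding DINO grounds the tags as boxes, and SAM turns the boxes into per-instance masks. The following rough sketch shows how they compose; it assumes that tag2text_model, grounding_dino_model (a GroundingDINO Model) and sam_predictor (a SamPredictor) have already been loaded elsewhere (e.g. in app.py), and the image path and variable names are illustrative only.

# Illustrative usage sketch (not part of the committed files); models assumed loaded.
import numpy as np
from PIL import Image

image_pil = Image.open("examples/dog.png").convert("RGB")
image_np = np.asarray(image_pil)

# 1. Tag2Text proposes tags and a caption ("None" means no user-specified tags).
tags, caption = generate_tags(tag2text_model, image_pil, specified_tags="None")

# 2. Grounding DINO grounds the tags as boxes; names are joined with " . " as the prompt.
detections, phrases, classes = detect(
    grounding_dino_model, image_np, caption=" . ".join(tags)
)

# 3. SAM converts the surviving boxes into per-instance masks.
masks, scores = segment(sam_predictor, image_np, boxes=detections.xyxy)
detections.mask = masks

# 4. Render the masks as a colored overlay plus an ImageNet-S style encoding.
overlay, encoding = show_anns_sv(detections)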