Commit 5b31094
Parent(s): 78f7941
Add application files
- .gitignore +200 -0
- LICENSE +21 -0
- annotate_anything.py +384 -0
- app.py +277 -0
- config.py +68 -0
- examples/dog.png +0 -0
- examples/eiffel.jpg +0 -0
- examples/eiffel.png +0 -0
- examples/girl.png +0 -0
- examples/horse.png +0 -0
- examples/horses.jpg +0 -0
- examples/traffic.jpg +0 -0
- requirements.txt +31 -0
- style.css +11 -0
- tag2text/LICENSE +21 -0
- tag2text/README.md +101 -0
- tag2text/configs/med_config.json +21 -0
- tag2text/configs/q2l_config.json +22 -0
- tag2text/configs/swin/config_swinB_384.json +9 -0
- tag2text/data/tag_list.txt +3429 -0
- tag2text/inference.py +102 -0
- tag2text/models/bert.py +1157 -0
- tag2text/models/swin_transformer.py +831 -0
- tag2text/models/tag2text.py +274 -0
- tag2text/models/utils.py +241 -0
- tag2text/models/vit.py +430 -0
- tag2text/requirements.txt +8 -0
- utils.py +263 -0
.gitignore
ADDED
@@ -0,0 +1,200 @@
+# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,metals
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,metals
+
+### Metals ###
+.metals/
+.bloop/
+project/**/metals.sbt
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,metals
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Binh Le
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
annotate_anything.py
ADDED
@@ -0,0 +1,384 @@
+import argparse
+import json
+import os
+import sys
+import tempfile
+
+import numpy as np
+import supervision as sv
+from groundingdino.util.inference import Model as DinoModel
+from imutils import paths
+from PIL import Image
+from segment_anything import sam_model_registry
+from segment_anything import SamAutomaticMaskGenerator
+from segment_anything import SamPredictor
+from supervision.detection.utils import xywh_to_xyxy
+from tqdm import tqdm
+
+sys.path.append("tag2text")
+
+from tag2text.models import tag2text
+from config import *
+from utils import detect, download_file_hf, segment, generate_tags, show_anns_sv
+
+
+def process(
+    tag2text_model,
+    grounding_dino_model,
+    sam_predictor,
+    sam_automask_generator,
+    image_path,
+    task,
+    prompt,
+    box_threshold,
+    text_threshold,
+    iou_threshold,
+    device,
+    output_dir=None,
+    save_mask=False,
+):
+    detections = None
+    metadata = {"image": {}, "annotations": [], "assets": {}}
+
+    if save_mask:
+        metadata["assets"]["intermediate_mask"] = []
+
+    try:
+        # Load image
+        image = Image.open(image_path)
+        image_pil = image.convert("RGB")
+        image = np.array(image_pil)
+
+        # Extract image metadata
+        filename = os.path.basename(image_path)
+        basename = os.path.splitext(filename)[0]
+        h, w = image.shape[:2]
+        metadata["image"]["file_name"] = filename
+        metadata["image"]["width"] = w
+        metadata["image"]["height"] = h
+
+        # Generate tags
+        if task in ["auto", "detection"] and prompt == "":
+            tags, caption = generate_tags(tag2text_model, image_pil, "None", device)
+            prompt = " . ".join(tags)
+            # print(f"Caption: {caption}")
+            # print(f"Tags: {tags}")
+
+            # ToDo: Extract metadata
+            metadata["image"]["caption"] = caption
+            metadata["image"]["tags"] = tags
+
+        if prompt:
+            metadata["prompt"] = prompt
+
+        # Detect boxes
+        if prompt != "":
+            detections, _, classes = detect(
+                grounding_dino_model,
+                image,
+                caption=prompt,
+                box_threshold=box_threshold,
+                text_threshold=text_threshold,
+                iou_threshold=iou_threshold,
+                post_process=True,
+            )
+
+            # Save detection image
+            if output_dir:
+                # Draw boxes
+                box_annotator = sv.BoxAnnotator()
+                labels = [
+                    f"{classes[class_id] if class_id else 'Unkown'} {confidence:0.2f}"
+                    for _, _, confidence, class_id, _ in detections
+                ]
+                box_image = box_annotator.annotate(
+                    scene=image, detections=detections, labels=labels
+                )
+                box_image_path = os.path.join(output_dir, basename + "_detect.png")
+                metadata["assets"]["detection"] = box_image_path
+                Image.fromarray(box_image).save(box_image_path)
+
+        # Segmentation
+        if task in ["auto", "segment"]:
+            if detections:
+                masks, scores = segment(
+                    sam_predictor, image=image, boxes=detections.xyxy
+                )
+                detections.mask = masks
+            else:
+                masks = sam_automask_generator.generate(image)
+                sorted_generated_masks = sorted(
+                    masks, key=lambda x: x["area"], reverse=True
+                )
+
+                xywh = np.array([mask["bbox"] for mask in sorted_generated_masks])
+                mask = np.array(
+                    [mask["segmentation"] for mask in sorted_generated_masks]
+                )
+                scores = np.array(
+                    [mask["predicted_iou"] for mask in sorted_generated_masks]
+                )
+                detections = sv.Detections(
+                    xyxy=xywh_to_xyxy(boxes_xywh=xywh), mask=mask
+                )
+
+            # Save annotated image
+            if output_dir:
+                mask_annotator = sv.MaskAnnotator()
+                mask_image, res = show_anns_sv(detections)
+                annotated_image = mask_annotator.annotate(image, detections=detections)
+
+                mask_image_path = os.path.join(output_dir, basename + "_mask.png")
+                metadata["assets"]["mask"] = mask_image_path
+                Image.fromarray(mask_image).save(mask_image_path)
+
+                # Save annotation encoding from https://github.com/LUSSeg/ImageNet-S
+                mask_enc_path = os.path.join(output_dir, basename + "_mask_enc.npy")
+                np.save(mask_enc_path, res)
+                metadata["assets"]["mask_enc"] = mask_enc_path
+
+                annotated_image_path = os.path.join(
+                    output_dir, basename + "_annotate.png"
+                )
+                metadata["assets"]["annotate"] = annotated_image_path
+                Image.fromarray(annotated_image).save(annotated_image_path)
+
+        # ToDo: Extract metadata
+        if detections:
+            id = 1
+            for (xyxy, mask, confidence, class_id, _), area, box_area, score in zip(
+                detections, detections.area, detections.box_area, scores
+            ):
+                annotation = {
+                    "id": id,
+                    "bbox": [int(x) for x in xyxy],
+                    "box_area": float(box_area),
+                }
+                if class_id:
+                    annotation["box_confidence"] = float(confidence)
+                    annotation["label"] = classes[class_id] if class_id else "Unkown"
+                if mask is not None:
+                    annotation["area"] = int(area)
+                    annotation["predicted_iou"] = float(score)
+                metadata["annotations"].append(annotation)
+
+                if output_dir and save_mask:
+                    mask_image_path = os.path.join(
+                        output_dir, f"{basename}_mask_{id}.png"
+                    )
+                    metadata["assets"]["intermediate_mask"].append(mask_image_path)
+                    Image.fromarray(mask * 255).save(mask_image_path)
+
+                id += 1
+
+        if output_dir:
+            meta_file_path = os.path.join(output_dir, basename + "_meta.json")
+            with open(meta_file_path, "w") as fp:
+                json.dump(metadata, fp)
+        else:
+            meta_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
+            meta_file_path = meta_file.name
+
+        return meta_file_path
+    except Exception as error:
+        raise ValueError(f"global exception: {error}")
+
+
+def main(args: argparse.Namespace) -> None:
+    device = args.device
+    prompt = args.prompt
+    task = args.task
+
+    tag2text_model = None
+    grounding_dino_model = None
+    sam_predictor = None
+    sam_automask_generator = None
+
+    box_threshold = args.box_threshold
+    text_threshold = args.text_threshold
+    iou_threshold = args.iou_threshold
+    save_mask = args.save_mask
+
+    # load model
+    if task in ["auto", "detection"] and prompt == "":
+        print("Loading Tag2Text model...")
+        tag2text_type = args.tag2text
+        tag2text_checkpoint = os.path.join(
+            abs_weight_dir, tag2text_dict[tag2text_type]["checkpoint_file"]
+        )
+        if not os.path.exists(tag2text_checkpoint):
+            print(f"Downloading weights for Tag2Text {tag2text_type} model")
+            os.system(
+                f"wget {tag2text_dict[tag2text_type]['checkpoint_url']} -O {tag2text_checkpoint}"
+            )
+        tag2text_model = tag2text.tag2text_caption(
+            pretrained=tag2text_checkpoint,
+            image_size=384,
+            vit="swin_b",
+            delete_tag_index=delete_tag_index,
+        )
+        # threshold for tagging
+        # we reduce the threshold to obtain more tags
+        tag2text_model.threshold = 0.64
+        tag2text_model.to(device)
+        tag2text_model.eval()
+
+    if task in ["auto", "detection"] or prompt != "":
+        print("Loading Grounding Dino model...")
+        dino_type = args.dino
+        dino_checkpoint = os.path.join(
+            abs_weight_dir, dino_dict[dino_type]["checkpoint_file"]
+        )
+        dino_config_file = os.path.join(
+            abs_weight_dir, dino_dict[dino_type]["config_file"]
+        )
+        if not os.path.exists(dino_checkpoint):
+            print(f"Downloading weights for Grounding Dino {dino_type} model")
+            dino_repo_id = dino_dict[dino_type]["repo_id"]
+            download_file_hf(
+                repo_id=dino_repo_id,
+                filename=dino_dict[dino_type]["checkpoint_file"],
+                cache_dir=weight_dir,
+            )
+            download_file_hf(
+                repo_id=dino_repo_id,
+                filename=dino_dict[dino_type]["checkpoint_file"],
+                cache_dir=weight_dir,
+            )
+        grounding_dino_model = DinoModel(
+            model_config_path=dino_config_file, model_checkpoint_path=dino_checkpoint
+        )
+
+    if task in ["auto", "segment"]:
+        print("Loading SAM...")
+        sam_type = args.sam
+        sam_checkpoint = os.path.join(
+            abs_weight_dir, sam_dict[sam_type]["checkpoint_file"]
+        )
+        if not os.path.exists(sam_checkpoint):
+            print(f"Downloading weights for SAM {sam_type}")
+            os.system(
+                f"wget {sam_dict[sam_type]['checkpoint_url']} -O {sam_checkpoint}"
+            )
+        sam = sam_model_registry[sam_type](checkpoint=sam_checkpoint)
+        sam.to(device=device)
+        sam_predictor = SamPredictor(sam)
+        sam_automask_generator = SamAutomaticMaskGenerator(sam)
+
+    if not os.path.exists(args.input):
+        raise ValueError("The input directory doesn't exist!")
+    elif not os.path.isdir(args.input):
+        image_paths = [args.input]
+    else:
+        image_paths = paths.list_images(args.input)
+
+    os.makedirs(args.output, exist_ok=True)
+
+    with tqdm(image_paths) as pbar:
+        for image_path in pbar:
+            pbar.set_postfix_str(f"Processing {image_path}")
+            process(
+                tag2text_model=tag2text_model,
+                grounding_dino_model=grounding_dino_model,
+                sam_predictor=sam_predictor,
+                sam_automask_generator=sam_automask_generator,
+                image_path=image_path,
+                task=task,
+                prompt=prompt,
+                box_threshold=box_threshold,
+                text_threshold=text_threshold,
+                iou_threshold=iou_threshold,
+                device=device,
+                output_dir=args.output,
+                save_mask=save_mask,
+            )
+
+
+if __name__ == "__main__":
+    if not os.path.exists(abs_weight_dir):
+        os.makedirs(abs_weight_dir, exist_ok=True)
+
+    parser = argparse.ArgumentParser(
+        description=(
+            "Runs automatic detection and mask generation on an input image or directory of images"
+        )
+    )
+
+    parser.add_argument(
+        "--input",
+        "-i",
+        type=str,
+        required=True,
+        help="Path to either a single input image or folder of images.",
+    )
+
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=str,
+        required=True,
+        help=(
+            "Path to the directory where masks will be output. Output will be either a folder "
+            "of PNGs per image or a single json with COCO-style masks."
+        ),
+    )
+
+    parser.add_argument(
+        "--sam",
+        type=str,
+        default=default_sam,
+        choices=sam_dict.keys(),
+        help="The type of SA model to load",
+    )
+
+    parser.add_argument(
+        "--tag2text",
+        type=str,
+        default=default_tag2text,
+        choices=tag2text_dict.keys(),
+        help="The path to the Tag2Text checkpoint to use for tags and caption generation.",
+    )
+
+    parser.add_argument(
+        "--dino",
+        type=str,
+        default=default_dino,
+        choices=dino_dict.keys(),
+        help="The config file of Grounding Dino model to load",
+    )
+
+    parser.add_argument(
+        "--task",
+        help="Task to run",
+        default="auto",
+        choices=["auto", "detect", "segment"],
+        type=str,
+    )
+    parser.add_argument(
+        "--prompt",
+        help="Detection prompt",
+        default="",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--box-threshold", type=float, default=0.25, help="box threshold"
+    )
+    parser.add_argument(
+        "--text-threshold", type=float, default=0.2, help="text threshold"
+    )
+    parser.add_argument(
+        "--iou-threshold", type=float, default=0.5, help="iou threshold"
+    )
+
+    parser.add_argument(
+        "--save-mask",
+        action="store_true",
+        default=False,
+        help="If True, save all intermidiate masks.",
+    )
+    parser.add_argument(
+        "--device", type=str, default="cuda", help="The device to run generation on."
+    )
+    args = parser.parse_args()
+    main(args)
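For reference, here is a minimal sketch of driving the script above programmatically rather than from the command line. The field names mirror the argparse flags defined in annotate_anything.py; the input/output paths are illustrative assumptions, not part of the commit.

```python
# Hypothetical programmatic invocation of annotate_anything.py (illustrative only);
# the script is normally run via its argparse command-line interface.
import argparse

from annotate_anything import main

args = argparse.Namespace(
    input="examples/dog.png",   # single image or a folder of images (assumed path)
    output="outputs",           # directory for detections, masks and *_meta.json (assumed path)
    sam="default",              # key into sam_dict in config.py
    tag2text="swin_14m",        # key into tag2text_dict
    dino="swint_ogc",           # key into dino_dict
    task="auto",                # "auto", "detect", or "segment"
    prompt="",                  # empty prompt -> tags are generated by Tag2Text
    box_threshold=0.25,
    text_threshold=0.2,
    iou_threshold=0.5,
    save_mask=False,
    device="cuda",
)
main(args)
```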
app.py
ADDED
@@ -0,0 +1,277 @@
+import json
+import os
+import sys
+import tempfile
+
+import gradio as gr
+import numpy as np
+import supervision as sv
+import torch
+from groundingdino.util.inference import Model as DinoModel
+from PIL import Image
+from segment_anything import build_sam
+from segment_anything import SamAutomaticMaskGenerator
+from segment_anything import SamPredictor
+from supervision.detection.utils import mask_to_polygons
+from supervision.detection.utils import xywh_to_xyxy
+
+# segment anything
+# Grounding DINO
+
+sys.path.append("tag2text")
+
+from tag2text.models import tag2text
+from config import *
+from utils import download_file_hf, detect, segment, show_anns, generate_tags
+
+if not os.path.exists(abs_weight_dir):
+    os.makedirs(abs_weight_dir, exist_ok=True)
+
+sam_checkpoint = os.path.join(abs_weight_dir, sam_dict[default_sam]["checkpoint_file"])
+if not os.path.exists(sam_checkpoint):
+    os.system(f"wget {sam_dict[default_sam]['checkpoint_url']} -O {sam_checkpoint}")
+
+tag2text_checkpoint = os.path.join(
+    abs_weight_dir, tag2text_dict[default_tag2text]["checkpoint_file"]
+)
+if not os.path.exists(tag2text_checkpoint):
+    os.system(
+        f"wget {tag2text_dict[default_tag2text]['checkpoint_url']} -O {tag2text_checkpoint}"
+    )
+
+dino_checkpoint = os.path.join(
+    abs_weight_dir, dino_dict[default_dino]["checkpoint_file"]
+)
+dino_config_file = os.path.join(abs_weight_dir, dino_dict[default_dino]["config_file"])
+if not os.path.exists(dino_checkpoint):
+    dino_repo_id = dino_dict[default_dino]["repo_id"]
+    download_file_hf(
+        repo_id=dino_repo_id,
+        filename=dino_dict[default_dino]["config_file"],
+        cache_dir=weight_dir,
+    )
+    download_file_hf(
+        repo_id=dino_repo_id,
+        filename=dino_dict[default_dino]["checkpoint_file"],
+        cache_dir=weight_dir,
+    )
+
+# load model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+tag2text_model = tag2text.tag2text_caption(
+    pretrained=tag2text_checkpoint,
+    image_size=384,
+    vit="swin_b",
+    delete_tag_index=delete_tag_index,
+)
+# threshold for tagging
+# we reduce the threshold to obtain more tags
+tag2text_model.threshold = 0.64
+tag2text_model.to(device)
+tag2text_model.eval()
+
+
+sam = build_sam(checkpoint=sam_checkpoint)
+sam.to(device=device)
+sam_predictor = SamPredictor(sam)
+sam_automask_generator = SamAutomaticMaskGenerator(sam)
+
+grounding_dino_model = DinoModel(
+    model_config_path=dino_config_file, model_checkpoint_path=dino_checkpoint
+)
+
+
+def process(image_path, task, prompt, box_threshold, text_threshold, iou_threshold):
+    global tag2text_model, sam_predictor, sam_automask_generator, grounding_dino_model, device
+    output_gallery = []
+    detections = None
+    metadata = {"image": {}, "annotations": []}
+
+    try:
+        # Load image
+        image = Image.open(image_path)
+        image_pil = image.convert("RGB")
+        image = np.array(image_pil)
+
+        # Extract image metadata
+        filename = os.path.basename(image_path)
+        h, w = image.shape[:2]
+        metadata["image"]["file_name"] = filename
+        metadata["image"]["width"] = w
+        metadata["image"]["height"] = h
+
+        # Generate tags
+        if task in ["auto", "detection"] and prompt == "":
+            tags, caption = generate_tags(tag2text_model, image_pil, "None", device)
+            prompt = " . ".join(tags)
+            print(f"Caption: {caption}")
+            print(f"Tags: {tags}")
+
+            # ToDo: Extract metadata
+            metadata["image"]["caption"] = caption
+            metadata["image"]["tags"] = tags
+
+        if prompt:
+            metadata["prompt"] = prompt
+            print(f"Prompt: {prompt}")
+
+        # Detect boxes
+        if prompt != "":
+            detections, phrases, classes = detect(
+                grounding_dino_model,
+                image,
+                caption=prompt,
+                box_threshold=box_threshold,
+                text_threshold=text_threshold,
+                iou_threshold=iou_threshold,
+                post_process=True,
+            )
+
+            # Draw boxes
+            box_annotator = sv.BoxAnnotator()
+            labels = [
+                f"{classes[class_id] if class_id else 'Unkown'} {confidence:0.2f}"
+                for _, _, confidence, class_id, _ in detections
+            ]
+            image = box_annotator.annotate(
+                scene=image, detections=detections, labels=labels
+            )
+            output_gallery.append(image)
+
+        # Segmentation
+        if task in ["auto", "segment"]:
+            if detections:
+                masks, scores = segment(
+                    sam_predictor, image=image, boxes=detections.xyxy
+                )
+                detections.mask = masks
+            else:
+                masks = sam_automask_generator.generate(image)
+                sorted_generated_masks = sorted(
+                    masks, key=lambda x: x["area"], reverse=True
+                )
+
+                xywh = np.array([mask["bbox"] for mask in sorted_generated_masks])
+                mask = np.array(
+                    [mask["segmentation"] for mask in sorted_generated_masks]
+                )
+                scores = np.array(
+                    [mask["predicted_iou"] for mask in sorted_generated_masks]
+                )
+                detections = sv.Detections(
+                    xyxy=xywh_to_xyxy(boxes_xywh=xywh), mask=mask
+                )
+            # opacity = 0.4
+            # mask_image, _ = show_anns_sam(masks)
+            # annotated_image = np.uint8(mask_image * opacity + image * (1 - opacity))
+
+            mask_annotator = sv.MaskAnnotator()
+            mask_image = np.zeros_like(image, dtype=np.uint8)
+            mask_image = mask_annotator.annotate(
+                mask_image, detections=detections, opacity=1
+            )
+            annotated_image = mask_annotator.annotate(image, detections=detections)
+            output_gallery.append(mask_image)
+            output_gallery.append(annotated_image)
+
+        # ToDo: Extract metadata
+        if detections:
+            id = 1
+            for (xyxy, mask, confidence, class_id, _), area, box_area, score in zip(
+                detections, detections.area, detections.box_area, scores
+            ):
+                annotation = {
+                    "id": id,
+                    "bbox": [int(x) for x in xyxy],
+                    "box_area": float(box_area),
+                }
+                if class_id:
+                    annotation["box_confidence"] = float(confidence)
+                    annotation["label"] = classes[class_id] if class_id else "Unkown"
+                if mask is not None:
+                    # annotation["segmentation"] = mask_to_polygons(mask)
+                    annotation["area"] = int(area)
+                    annotation["predicted_iou"] = float(score)
+                metadata["annotations"].append(annotation)
+                id += 1
+
+        meta_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
+        meta_file_path = meta_file.name
+        with open(meta_file_path, "w") as fp:
+            json.dump(metadata, fp)
+
+        return output_gallery, meta_file_path
+    except Exception as error:
+        raise gr.Error(f"global exception: {error}")
+
+
+title = "Annotate Anything"
+
+with gr.Blocks(css="style.css", title=title) as demo:
+    with gr.Row(elem_classes=["container"]):
+        with gr.Column(scale=1):
+            input_image = gr.Image(type="filepath", label="Input")
+            task = gr.Dropdown(
+                ["detect", "segment", "auto"], value="auto", label="task_type"
+            )
+            text_prompt = gr.Textbox(label="Detection Prompt")
+            with gr.Accordion("Advanced parameters", open=False):
+                box_threshold = gr.Slider(
+                    minimum=0,
+                    maximum=1,
+                    value=0.3,
+                    step=0.05,
+                    label="Box threshold",
+                    info="Hash size to use for image hashing",
+                )
+                text_threshold = gr.Slider(
+                    minimum=0,
+                    maximum=1,
+                    value=0.25,
+                    step=0.05,
+                    label="Text threshold",
+                    info="Number of history images used to find out duplicate image",
+                )
+                iou_threshold = gr.Slider(
+                    minimum=0,
+                    maximum=1,
+                    value=0.5,
+                    step=0.05,
+                    label="IOU threshold",
+                    info="Minimum similarity threshold (in percent) to consider 2 images to be similar",
+                )
+            run_button = gr.Button(label="Run")
+
+        with gr.Column(scale=2):
+            gallery = gr.Gallery(
+                label="Generated images", show_label=False, elem_id="gallery"
+            ).style(preview=True, grid=2, object_fit="scale-down")
+            meta_file = gr.File(label="Metadata file")
+
+    with gr.Row(elem_classes=["container"]):
+        gr.Examples(
+            [
+                ["examples/dog.png", "auto", ""],
+                ["examples/eiffel.png", "auto", ""],
+                ["examples/eiffel.png", "segment", ""],
+                ["examples/girl.png", "auto", "girl . face"],
+                ["examples/horse.png", "detect", "horse"],
+                ["examples/horses.jpg", "auto", "horse"],
+                ["examples/traffic.jpg", "auto", ""],
+            ],
+            [input_image, task, text_prompt],
+        )
+    run_button.click(
+        fn=process,
+        inputs=[
+            input_image,
+            task,
+            text_prompt,
+            box_threshold,
+            text_threshold,
+            iou_threshold,
+        ],
+        outputs=[gallery, meta_file],
+    )
+
+demo.queue(concurrency_count=2).launch()
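The Gradio app returns the image gallery plus a metadata JSON file for download. A small sketch of reading that file back, assuming the structure built in `process()` above (the file name used here is a placeholder; in practice Gradio hands out a temporary path):

```python
# Illustrative reader for the metadata JSON produced by process(); "meta.json" is a placeholder path.
import json

with open("meta.json") as fp:
    meta = json.load(fp)

print(meta["image"]["file_name"], meta["image"]["width"], meta["image"]["height"])
for ann in meta["annotations"]:
    # "label" and "box_confidence" are only present for Grounding DINO detections
    print(ann["id"], ann.get("label", "n/a"), ann["bbox"], ann["box_area"])
```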
config.py
ADDED
@@ -0,0 +1,68 @@
+import os
+
+# Configurations
+tag2text_dict = {
+    "swin_14m": {
+        "checkpoint_url": "https://huggingface.co/spaces/xinyu1205/Tag2Text/resolve/main/tag2text_swin_14m.pth",
+        "checkpoint_file": "tag2text_swin_14m.pth",
+    }
+}
+
+sam_dict = {
+    "default": {
+        "checkpoint_url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
+        "checkpoint_file": "sam_vit_h_4b8939.pth",
+    },
+    "vit_h": {
+        "checkpoint_url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
+        "checkpoint_file": "sam_vit_h_4b8939.pth",
+    },
+    "vit_l": {
+        "checkpoint_url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
+        "checkpoint_file": "sam_vit_l_0b3195.pth",
+    },
+    "vit_b": {
+        "checkpoint_url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth",
+        "checkpoint_file": "sam_vit_b_01ec64.pth",
+    },
+}
+
+dino_dict = {
+    "swinb": {
+        "repo_id": "ShilongLiu/GroundingDINO",
+        "config_file": "GroundingDINO_SwinB.cfg.py",
+        "checkpoint_file": "groundingdino_swinb_cogcoor.pth",
+    },
+    "swint_ogc": {
+        "repo_id": "ShilongLiu/GroundingDINO",
+        "config_file": "GroundingDINO_SwinT_OGC.cfg.py",
+        "checkpoint_file": "groundingdino_swint_ogc.pth",
+    },
+}
+
+default_sam = "default"
+default_tag2text = "swin_14m"
+default_dino = "swint_ogc"
+
+root_dir = os.path.dirname(os.path.abspath(__file__))
+weight_dir = "weights"
+abs_weight_dir = os.path.join(root_dir, weight_dir)
+
+tag2text_checkpoint = "tag2text_swin_14m.pth"
+tag2text_url = "https://huggingface.co/spaces/xinyu1205/Tag2Text/resolve/main/tag2text_swin_14m.pth"
+sam_checkpoint = "sam_vit_h_4b8939.pth"
+sam_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
+output_dir = "outputs"
+
+dino_config_file = "GroundingDINO_SwinB.cfg.py"
+dino_repo_id = "ShilongLiu/GroundingDINO"
+dino_checkpoint = "groundingdino_swinb_cogcoor.pth"
+
+iou_threshold = 0.5
+box_threshold = 0.3
+text_threshold = 0.25
+
+# filter out attributes and action categories which are difficult to grounding
+delete_tag_index = []
+for i in range(3012, 3429):
+    delete_tag_index.append(i)
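As a quick illustration of how the dictionaries above are meant to be consumed (the same pattern appears in app.py and annotate_anything.py), resolving a local checkpoint path looks roughly like this; the snippet is a sketch, not part of the commit:

```python
# Sketch: resolve a local checkpoint path from the config dictionaries.
import os

from config import abs_weight_dir, default_sam, sam_dict

entry = sam_dict[default_sam]  # {"checkpoint_url": ..., "checkpoint_file": ...}
ckpt_path = os.path.join(abs_weight_dir, entry["checkpoint_file"])
print(ckpt_path)  # e.g. <repo>/weights/sam_vit_h_4b8939.pth
```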
examples/dog.png
ADDED
examples/eiffel.jpg
ADDED
examples/eiffel.png
ADDED
examples/girl.png
ADDED
examples/horse.png
ADDED
examples/horses.jpg
ADDED
examples/traffic.jpg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,31 @@
+accelerate
+addict
+gradio
+huggingface_hub
+matplotlib
+numpy
+onnxruntime
+opencv_python
+Pillow
+pycocotools
+pycocoevalcap
+PyYAML
+requests
+setuptools
+supervision
+termcolor
+timm
+torch
+torchvision
+transformers
+yapf
+numba
+scipy
+safetensors
+pynvml
+fairscale
+imutils
+argparse
+tqdm
+git+https://github.com/facebookresearch/segment-anything.git
+git+https://github.com/IDEA-Research/GroundingDINO
style.css
ADDED
@@ -0,0 +1,11 @@
+.container {
+  max-width: 1368px;
+  margin-left: auto;
+  margin-right: auto;
+}
+
+#row-flex {
+  display: flex;
+  align-items: center;
+  justify-content: center;
+}
tag2text/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 OPPO LLC
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
tag2text/README.md
ADDED
@@ -0,0 +1,101 @@
+# :label: Tag2Text: Guiding Vision-Language Model via Image Tagging
+
+Official PyTorch implementation of <a href="https://arxiv.org/abs/2303.05657">Tag2Text</a>, an efficient and controllable vision-language model with tagging guidance. Code is available now!
+
+Welcome to try out the [Tag2Text Web demo🤗](https://huggingface.co/spaces/xinyu1205/Tag2Text)! Both tagging and captioning are included.
+
+Tag2Text is now combined with [Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything), which can automatically recognize, detect, and segment an image! Tag2Text showcases powerful image recognition capabilities:
+
+
+## :fire: News
+
+- **`2023/05/20`**: Tag2Text is combined with [VideoChat](https://github.com/OpenGVLab/Ask-Anything), providing powerful tagging and captioning capabilities as a fundamental component!
+- **`2023/04/20`**: We marry [Tag2Text with Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything) to provide powerful image recognition capabilities!
+- **`2023/04/10`**: Code and checkpoint are available now!
+- **`2023/03/14`**: The [Tag2Text web demo 🤗](https://huggingface.co/spaces/xinyu1205/Tag2Text) is available on Hugging Face Space!
+
+## :bulb: Highlight
+
+- **Tagging.** Without manual annotations, Tag2Text achieves **superior** image tag recognition ability over [**3,429**](./data/tag_list.txt) commonly human-used categories.
+- **Efficient.** Tagging guidance effectively enhances the performance of vision-language models on both **generation-based** and **alignment-based** tasks.
+- **Controllable.** Tag2Text permits users to input **desired tags**, providing the flexibility to compose corresponding texts based on the input tags.
+
+<p align="center">
+  <table class="tg">
+    <tr>
+      <td class="tg-c3ow"><img src="images/tag2text_framework.png" align="center" width="800" ></td>
+    </tr>
+  </table>
+</p>
+
+## :writing_hand: TODO
+
+- [x] Release demo.
+- [x] Release checkpoints.
+- [x] Release inference code.
+- [ ] Release training codes.
+- [ ] Release training datasets.
+
+## :toolbox: Checkpoints
+
+<!-- insert a table -->
+
+<table>
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>name</th>
+      <th>backbone</th>
+      <th>Data</th>
+      <th>Illustration</th>
+      <th>Checkpoint</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>1</th>
+      <td>Tag2Text-Swin</td>
+      <td>Swin-Base</td>
+      <td>COCO, VG, SBU, CC-3M, CC-12M</td>
+      <td>Demo version with comprehensive captions.</td>
+      <td><a href="https://huggingface.co/spaces/xinyu1205/Tag2Text/blob/main/tag2text_swin_14m.pth">Download link</a></td>
+    </tr>
+  </tbody>
+</table>
+
+## :running: Model Inference
+
+1. Install the dependencies, run:
+
+<pre/>pip install -r requirements.txt</pre>
+
+2. Download Tag2Text pretrained checkpoints.
+
+3. Get the tagging and captioning results:
+<pre/>
+python inference.py --image images/1641173_2291260800.jpg \
+--pretrained pretrained/tag2text_swin_14m.pth
+</pre>
+Or get the tagging and specified captioning results (optional):
+<pre/>python inference.py --image images/1641173_2291260800.jpg \
+--pretrained pretrained/tag2text_swin_14m.pth \
+--specified-tags "cloud,sky"</pre>
+
+## :black_nib: Citation
+
+If you find our work useful for your research, please consider citing.
+
+```
+@article{huang2023tag2text,
+  title={Tag2Text: Guiding Vision-Language Model via Image Tagging},
+  author={Huang, Xinyu and Zhang, Youcai and Ma, Jinyu and Tian, Weiwei and Feng, Rui and Zhang, Yuejie and Li, Yaqian and Guo, Yandong and Zhang, Lei},
+  journal={arXiv preprint arXiv:2303.05657},
+  year={2023}
+}
+```
+
+## :hearts: Acknowledgements
+
+This work is done with the help of the amazing code base of [BLIP](https://github.com/salesforce/BLIP), thanks very much!
+
+We also want to thank @Cheng Rui @Shilong Liu @Ren Tianhe for their help in [marrying Tag2Text with Grounded-SAM](https://github.com/IDEA-Research/Grounded-Segment-Anything).
tag2text/configs/med_config.json
ADDED
@@ -0,0 +1,21 @@
+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "type_vocab_size": 2,
+  "vocab_size": 30524,
+  "encoder_width": 768,
+  "add_cross_attention": true
+}
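This is a standard BERT-style configuration plus an extra `encoder_width` field, which appears to be consumed by the bundled bert.py. As a hedged illustration of typical usage (an assumption, not code from this commit), such a file can be loaded with Hugging Face's `BertConfig`:

```python
# Illustrative: load the BERT-style config shipped with Tag2Text; extra keys are kept as attributes.
from transformers import BertConfig

cfg = BertConfig.from_json_file("tag2text/configs/med_config.json")
print(cfg.hidden_size, cfg.num_hidden_layers, cfg.vocab_size)  # 768 12 30524
```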
tag2text/configs/q2l_config.json
ADDED
@@ -0,0 +1,22 @@
+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 4,
+  "num_hidden_layers": 2,
+  "pad_token_id": 0,
+  "type_vocab_size": 2,
+  "vocab_size": 30522,
+  "encoder_width": 768,
+  "add_cross_attention": true,
+  "add_tag_cross_attention": false
+}
tag2text/configs/swin/config_swinB_384.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "ckpt": "pretrain_model/swin_base_patch4_window7_224_22k.pth",
+  "vision_width": 1024,
+  "image_res": 384,
+  "window_size": 12,
+  "embed_dim": 128,
+  "depths": [ 2, 2, 18, 2 ],
+  "num_heads": [ 4, 8, 16, 32 ]
+}
tag2text/data/tag_list.txt
ADDED
@@ -0,0 +1,3429 @@
(3,429 tag entries; the file contents are not rendered in this view)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tennis
|
| 2 |
+
bear cub
|
| 3 |
+
observatory
|
| 4 |
+
bicycle
|
| 5 |
+
hillside
|
| 6 |
+
judge
|
| 7 |
+
watercolor illustration
|
| 8 |
+
granite
|
| 9 |
+
lobster
|
| 10 |
+
livery
|
| 11 |
+
stone
|
| 12 |
+
ceramic
|
| 13 |
+
ranch
|
| 14 |
+
cloth
|
| 15 |
+
smile
|
| 16 |
+
building
|
| 17 |
+
tattoo
|
| 18 |
+
cricketer
|
| 19 |
+
cheek
|
| 20 |
+
pear
|
| 21 |
+
source
|
| 22 |
+
winter
|
| 23 |
+
surface
|
| 24 |
+
spray
|
| 25 |
+
ceremony
|
| 26 |
+
magic
|
| 27 |
+
curve
|
| 28 |
+
container
|
| 29 |
+
fair
|
| 30 |
+
medicine
|
| 31 |
+
baby
|
| 32 |
+
tennis racquet
|
| 33 |
+
ornament
|
| 34 |
+
bamboo
|
| 35 |
+
duckling
|
| 36 |
+
song
|
| 37 |
+
safari
|
| 38 |
+
team presentation
|
| 39 |
+
daffodil
|
| 40 |
+
cross
|
| 41 |
+
toothpaste
|
| 42 |
+
shield
|
| 43 |
+
fashion model
|
| 44 |
+
capsule
|
| 45 |
+
map
|
| 46 |
+
creek
|
| 47 |
+
glass house
|
| 48 |
+
glass plate
|
| 49 |
+
siding
|
| 50 |
+
corner
|
| 51 |
+
water buffalo
|
| 52 |
+
bison
|
| 53 |
+
figure skater
|
| 54 |
+
diploma
|
| 55 |
+
tire
|
| 56 |
+
race
|
| 57 |
+
cable car
|
| 58 |
+
brain
|
| 59 |
+
gas stove
|
| 60 |
+
soap bubble
|
| 61 |
+
palette
|
| 62 |
+
snowboard
|
| 63 |
+
school child
|
| 64 |
+
trench coat
|
| 65 |
+
monk
|
| 66 |
+
fiber
|
| 67 |
+
kitchen window
|
| 68 |
+
sunglass
|
| 69 |
+
coffee
|
| 70 |
+
security
|
| 71 |
+
strawberry
|
| 72 |
+
penguin
|
| 73 |
+
tree root
|
| 74 |
+
loaf
|
| 75 |
+
engagement ring
|
| 76 |
+
lamb
|
| 77 |
+
vector cartoon illustration
|
| 78 |
+
sandwich
|
| 79 |
+
mountain village
|
| 80 |
+
shape
|
| 81 |
+
charm
|
| 82 |
+
fiction
|
| 83 |
+
knot
|
| 84 |
+
greenhouse
|
| 85 |
+
sushi
|
| 86 |
+
text
|
| 87 |
+
disaster
|
| 88 |
+
trophy
|
| 89 |
+
gang
|
| 90 |
+
strap
|
| 91 |
+
soccer game
|
| 92 |
+
cardinal
|
| 93 |
+
tee
|
| 94 |
+
turtle
|
| 95 |
+
water surface
|
| 96 |
+
grassland
|
| 97 |
+
dolphin
|
| 98 |
+
store
|
| 99 |
+
dirt
|
| 100 |
+
iceberg
|
| 101 |
+
pergola
|
| 102 |
+
farmer market
|
| 103 |
+
publicity portrait
|
| 104 |
+
tote bag
|
| 105 |
+
teenage girl
|
| 106 |
+
view mirror
|
| 107 |
+
session
|
| 108 |
+
commuter
|
| 109 |
+
dressing room
|
| 110 |
+
tricycle
|
| 111 |
+
christmas ball
|
| 112 |
+
headlight
|
| 113 |
+
police
|
| 114 |
+
armchair
|
| 115 |
+
chart
|
| 116 |
+
yacht
|
| 117 |
+
saw
|
| 118 |
+
printer
|
| 119 |
+
rock band
|
| 120 |
+
gingerbread house
|
| 121 |
+
tag
|
| 122 |
+
table lamp
|
| 123 |
+
hockey game
|
| 124 |
+
slope
|
| 125 |
+
font
|
| 126 |
+
wicker basket
|
| 127 |
+
jewelry
|
| 128 |
+
quarter
|
| 129 |
+
software
|
| 130 |
+
weapon
|
| 131 |
+
pin
|
| 132 |
+
worship
|
| 133 |
+
painter
|
| 134 |
+
goal
|
| 135 |
+
morning light
|
| 136 |
+
bike
|
| 137 |
+
baseball bat
|
| 138 |
+
elevator
|
| 139 |
+
cuisine
|
| 140 |
+
sausage
|
| 141 |
+
stunt
|
| 142 |
+
wrestler
|
| 143 |
+
statue
|
| 144 |
+
landing
|
| 145 |
+
pillar
|
| 146 |
+
willow tree
|
| 147 |
+
sea wave
|
| 148 |
+
chicken
|
| 149 |
+
peanut
|
| 150 |
+
muscle
|
| 151 |
+
bob
|
| 152 |
+
tv genre
|
| 153 |
+
bathroom window
|
| 154 |
+
radish
|
| 155 |
+
textile
|
| 156 |
+
pelican
|
| 157 |
+
marketplace
|
| 158 |
+
crest
|
| 159 |
+
elevation map
|
| 160 |
+
gift
|
| 161 |
+
parish
|
| 162 |
+
traffic light
|
| 163 |
+
campfire
|
| 164 |
+
fog
|
| 165 |
+
award winner
|
| 166 |
+
beach ball
|
| 167 |
+
mat
|
| 168 |
+
white house
|
| 169 |
+
plaster
|
| 170 |
+
moped
|
| 171 |
+
football team
|
| 172 |
+
solution
|
| 173 |
+
bicyclist
|
| 174 |
+
bit
|
| 175 |
+
playground
|
| 176 |
+
darkness
|
| 177 |
+
cake
|
| 178 |
+
maple leave
|
| 179 |
+
mold
|
| 180 |
+
cracker
|
| 181 |
+
blueberry
|
| 182 |
+
rubble
|
| 183 |
+
container ship
|
| 184 |
+
pedestrian bridge
|
| 185 |
+
snail
|
| 186 |
+
parrot
|
| 187 |
+
form
|
| 188 |
+
circuit
|
| 189 |
+
highlight
|
| 190 |
+
pickup truck
|
| 191 |
+
koala
|
| 192 |
+
rain
|
| 193 |
+
system
|
| 194 |
+
weather
|
| 195 |
+
raincoat
|
| 196 |
+
soccer team
|
| 197 |
+
windshield
|
| 198 |
+
thunderstorm
|
| 199 |
+
mike
|
| 200 |
+
bird house
|
| 201 |
+
bridge
|
| 202 |
+
grandfather
|
| 203 |
+
restroom
|
| 204 |
+
animation
|
| 205 |
+
wilderness
|
| 206 |
+
clown
|
| 207 |
+
banana
|
| 208 |
+
brown
|
| 209 |
+
braid
|
| 210 |
+
dining room
|
| 211 |
+
kindergarten
|
| 212 |
+
launch event
|
| 213 |
+
purple
|
| 214 |
+
school
|
| 215 |
+
stairwell
|
| 216 |
+
brooch
|
| 217 |
+
movie poster image
|
| 218 |
+
mountain river
|
| 219 |
+
shelf
|
| 220 |
+
wicket
|
| 221 |
+
headboard
|
| 222 |
+
buddha
|
| 223 |
+
flower field
|
| 224 |
+
dugout
|
| 225 |
+
cd
|
| 226 |
+
bald eagle
|
| 227 |
+
lagoon
|
| 228 |
+
seaweed
|
| 229 |
+
agriculture
|
| 230 |
+
emergency service
|
| 231 |
+
maple tree
|
| 232 |
+
parachute
|
| 233 |
+
continent
|
| 234 |
+
amusement park
|
| 235 |
+
remote
|
| 236 |
+
bun
|
| 237 |
+
tackle
|
| 238 |
+
hospital
|
| 239 |
+
garage door
|
| 240 |
+
birthday party
|
| 241 |
+
friendship
|
| 242 |
+
go
|
| 243 |
+
mausoleum
|
| 244 |
+
jeep
|
| 245 |
+
raccoon
|
| 246 |
+
step
|
| 247 |
+
ice hockey team
|
| 248 |
+
cigarette
|
| 249 |
+
lace dress
|
| 250 |
+
forest floor
|
| 251 |
+
mall
|
| 252 |
+
captain
|
| 253 |
+
milk
|
| 254 |
+
golf course
|
| 255 |
+
meal
|
| 256 |
+
picnic table
|
| 257 |
+
sail
|
| 258 |
+
volleyball
|
| 259 |
+
canal
|
| 260 |
+
terrace
|
| 261 |
+
computer desk
|
| 262 |
+
caravan
|
| 263 |
+
hotel
|
| 264 |
+
cheerleader
|
| 265 |
+
nurse
|
| 266 |
+
museum
|
| 267 |
+
marsh
|
| 268 |
+
fox
|
| 269 |
+
plateau
|
| 270 |
+
night
|
| 271 |
+
twin
|
| 272 |
+
letter logo
|
| 273 |
+
autumn tree
|
| 274 |
+
powder
|
| 275 |
+
convention
|
| 276 |
+
creature
|
| 277 |
+
lighthouse
|
| 278 |
+
shop window
|
| 279 |
+
jacket
|
| 280 |
+
stork
|
| 281 |
+
taxi
|
| 282 |
+
trade
|
| 283 |
+
blackboard
|
| 284 |
+
olive
|
| 285 |
+
road sign
|
| 286 |
+
resort
|
| 287 |
+
snowflake
|
| 288 |
+
cemetery
|
| 289 |
+
travel
|
| 290 |
+
evening dress
|
| 291 |
+
picnic
|
| 292 |
+
drink
|
| 293 |
+
winter morning
|
| 294 |
+
football player
|
| 295 |
+
snack
|
| 296 |
+
boxing glove
|
| 297 |
+
dinner party
|
| 298 |
+
airline
|
| 299 |
+
swing
|
| 300 |
+
port
|
| 301 |
+
wheelbarrow
|
| 302 |
+
bathroom sink
|
| 303 |
+
sweater
|
| 304 |
+
ambulance
|
| 305 |
+
gear
|
| 306 |
+
oil
|
| 307 |
+
wii controller
|
| 308 |
+
array
|
| 309 |
+
home office
|
| 310 |
+
car show
|
| 311 |
+
mixture
|
| 312 |
+
profession
|
| 313 |
+
tree frog
|
| 314 |
+
square
|
| 315 |
+
facility
|
| 316 |
+
coral reef
|
| 317 |
+
sea wall
|
| 318 |
+
pizza
|
| 319 |
+
exhibit
|
| 320 |
+
demolition
|
| 321 |
+
trout
|
| 322 |
+
ring
|
| 323 |
+
coffee shop
|
| 324 |
+
bracelet
|
| 325 |
+
bean
|
| 326 |
+
lip
|
| 327 |
+
fencing
|
| 328 |
+
landscape
|
| 329 |
+
sitting
|
| 330 |
+
package
|
| 331 |
+
metal
|
| 332 |
+
bust
|
| 333 |
+
king
|
| 334 |
+
hair
|
| 335 |
+
window seat
|
| 336 |
+
wildlife
|
| 337 |
+
trunk
|
| 338 |
+
greenery
|
| 339 |
+
stencil
|
| 340 |
+
fire hydrant
|
| 341 |
+
bridesmaid
|
| 342 |
+
plaza
|
| 343 |
+
alps
|
| 344 |
+
tower bridge
|
| 345 |
+
crop top
|
| 346 |
+
crossing
|
| 347 |
+
cinema
|
| 348 |
+
pedestrian crossing
|
| 349 |
+
family
|
| 350 |
+
shopping cart
|
| 351 |
+
stomach
|
| 352 |
+
church building
|
| 353 |
+
screen door
|
| 354 |
+
skater
|
| 355 |
+
soccer field
|
| 356 |
+
kettle
|
| 357 |
+
mussel
|
| 358 |
+
raindrop
|
| 359 |
+
candy cane
|
| 360 |
+
water lily
|
| 361 |
+
flower girl
|
| 362 |
+
desert
|
| 363 |
+
enclosure
|
| 364 |
+
christmas light
|
| 365 |
+
kitchen
|
| 366 |
+
caterpillar
|
| 367 |
+
plaid
|
| 368 |
+
bath
|
| 369 |
+
bush
|
| 370 |
+
mud
|
| 371 |
+
ballet
|
| 372 |
+
knee
|
| 373 |
+
adult
|
| 374 |
+
raft
|
| 375 |
+
sea view
|
| 376 |
+
cactus
|
| 377 |
+
office chair
|
| 378 |
+
overall
|
| 379 |
+
rim
|
| 380 |
+
scaffolding
|
| 381 |
+
pig
|
| 382 |
+
cover
|
| 383 |
+
poster page
|
| 384 |
+
sprinkle
|
| 385 |
+
chandelier
|
| 386 |
+
algae
|
| 387 |
+
traffic
|
| 388 |
+
surfboard
|
| 389 |
+
book
|
| 390 |
+
filming
|
| 391 |
+
flash
|
| 392 |
+
mansion
|
| 393 |
+
camouflage
|
| 394 |
+
trouser
|
| 395 |
+
ticket
|
| 396 |
+
weed
|
| 397 |
+
cab
|
| 398 |
+
trench
|
| 399 |
+
elephant
|
| 400 |
+
huddle
|
| 401 |
+
sphere
|
| 402 |
+
christmas decoration
|
| 403 |
+
city
|
| 404 |
+
launch
|
| 405 |
+
doll
|
| 406 |
+
christmas ornament
|
| 407 |
+
fabric
|
| 408 |
+
bikini
|
| 409 |
+
biplane
|
| 410 |
+
breakfast
|
| 411 |
+
neighbourhood
|
| 412 |
+
race track
|
| 413 |
+
foliage
|
| 414 |
+
avocado
|
| 415 |
+
school bus
|
| 416 |
+
footwear
|
| 417 |
+
highway
|
| 418 |
+
ocean view
|
| 419 |
+
art vector illustration
|
| 420 |
+
wall clock
|
| 421 |
+
curtain
|
| 422 |
+
teenager
|
| 423 |
+
kitchen area
|
| 424 |
+
robot
|
| 425 |
+
tusk
|
| 426 |
+
lounge chair
|
| 427 |
+
beam
|
| 428 |
+
paddle
|
| 429 |
+
camel
|
| 430 |
+
lid
|
| 431 |
+
world map
|
| 432 |
+
city view
|
| 433 |
+
newlywed
|
| 434 |
+
cargo ship
|
| 435 |
+
yellow
|
| 436 |
+
exhibition
|
| 437 |
+
bend
|
| 438 |
+
novel
|
| 439 |
+
wool
|
| 440 |
+
ontario
|
| 441 |
+
bread
|
| 442 |
+
campus
|
| 443 |
+
coastline
|
| 444 |
+
cutting board
|
| 445 |
+
booth
|
| 446 |
+
table top
|
| 447 |
+
carpet
|
| 448 |
+
beach chair
|
| 449 |
+
workout
|
| 450 |
+
street food
|
| 451 |
+
fun
|
| 452 |
+
costumer film designer
|
| 453 |
+
gadget
|
| 454 |
+
artist
|
| 455 |
+
fishing village
|
| 456 |
+
builder
|
| 457 |
+
violinist
|
| 458 |
+
iphone
|
| 459 |
+
spider web
|
| 460 |
+
traffic sign
|
| 461 |
+
ruin
|
| 462 |
+
rescue
|
| 463 |
+
clipboard
|
| 464 |
+
seal
|
| 465 |
+
film director
|
| 466 |
+
paw
|
| 467 |
+
nursery
|
| 468 |
+
intersection
|
| 469 |
+
tomato sauce
|
| 470 |
+
taste
|
| 471 |
+
paddy field
|
| 472 |
+
christmas tree
|
| 473 |
+
wave
|
| 474 |
+
stool
|
| 475 |
+
watering can
|
| 476 |
+
rug
|
| 477 |
+
daytime
|
| 478 |
+
subway station
|
| 479 |
+
craft
|
| 480 |
+
pine forest
|
| 481 |
+
black
|
| 482 |
+
planet
|
| 483 |
+
motif
|
| 484 |
+
christmas market
|
| 485 |
+
glass window
|
| 486 |
+
college
|
| 487 |
+
wheat
|
| 488 |
+
damage
|
| 489 |
+
rectangle
|
| 490 |
+
picture frame
|
| 491 |
+
chess
|
| 492 |
+
guest room
|
| 493 |
+
street corner
|
| 494 |
+
religion
|
| 495 |
+
seed
|
| 496 |
+
puzzle
|
| 497 |
+
freeway
|
| 498 |
+
beauty
|
| 499 |
+
ocean
|
| 500 |
+
watch
|
| 501 |
+
mother
|
| 502 |
+
garage
|
| 503 |
+
quote
|
| 504 |
+
dj
|
| 505 |
+
supporter
|
| 506 |
+
hip hop artist
|
| 507 |
+
muffin
|
| 508 |
+
eiffel tower
|
| 509 |
+
cash
|
| 510 |
+
firefighter
|
| 511 |
+
cauliflower
|
| 512 |
+
bunker
|
| 513 |
+
sled
|
| 514 |
+
manicure
|
| 515 |
+
shark
|
| 516 |
+
stall
|
| 517 |
+
jungle
|
| 518 |
+
family home
|
| 519 |
+
tour bus
|
| 520 |
+
chimney
|
| 521 |
+
touchdown
|
| 522 |
+
roundabout
|
| 523 |
+
coyote
|
| 524 |
+
street scene
|
| 525 |
+
tank
|
| 526 |
+
wedding dress
|
| 527 |
+
mantle
|
| 528 |
+
bedroom window
|
| 529 |
+
coconut
|
| 530 |
+
chapel
|
| 531 |
+
goat
|
| 532 |
+
living space
|
| 533 |
+
rock wall
|
| 534 |
+
polka dot
|
| 535 |
+
railway
|
| 536 |
+
mandala
|
| 537 |
+
mango
|
| 538 |
+
lesson
|
| 539 |
+
mountain landscape
|
| 540 |
+
team photo
|
| 541 |
+
bookshelf
|
| 542 |
+
meter
|
| 543 |
+
bulldog
|
| 544 |
+
evening sun
|
| 545 |
+
stick
|
| 546 |
+
card
|
| 547 |
+
pink
|
| 548 |
+
fish pond
|
| 549 |
+
paint
|
| 550 |
+
pill
|
| 551 |
+
cart
|
| 552 |
+
pea
|
| 553 |
+
van
|
| 554 |
+
album
|
| 555 |
+
football college game
|
| 556 |
+
mountain pass
|
| 557 |
+
doughnut
|
| 558 |
+
ski slope
|
| 559 |
+
match
|
| 560 |
+
official
|
| 561 |
+
shadow
|
| 562 |
+
organ
|
| 563 |
+
celebration
|
| 564 |
+
coin
|
| 565 |
+
log cabin
|
| 566 |
+
firework display
|
| 567 |
+
present
|
| 568 |
+
twig
|
| 569 |
+
chef
|
| 570 |
+
confetti
|
| 571 |
+
footpath
|
| 572 |
+
tour
|
| 573 |
+
ponytail
|
| 574 |
+
artwork
|
| 575 |
+
race car
|
| 576 |
+
club
|
| 577 |
+
season
|
| 578 |
+
hose
|
| 579 |
+
pencil
|
| 580 |
+
aircraft
|
| 581 |
+
rock formation
|
| 582 |
+
wardrobe
|
| 583 |
+
participant
|
| 584 |
+
politician
|
| 585 |
+
engineer
|
| 586 |
+
peace
|
| 587 |
+
filter
|
| 588 |
+
sailing boat
|
| 589 |
+
water bottle
|
| 590 |
+
service dog
|
| 591 |
+
poodle
|
| 592 |
+
loki
|
| 593 |
+
statesman
|
| 594 |
+
sleeping bag
|
| 595 |
+
outskirt
|
| 596 |
+
clock
|
| 597 |
+
factory
|
| 598 |
+
oak tree
|
| 599 |
+
physician
|
| 600 |
+
color
|
| 601 |
+
room
|
| 602 |
+
stairway
|
| 603 |
+
company
|
| 604 |
+
lady
|
| 605 |
+
graph
|
| 606 |
+
faucet
|
| 607 |
+
tablecloth
|
| 608 |
+
subway train
|
| 609 |
+
chocolate chip cookie
|
| 610 |
+
headquarters
|
| 611 |
+
screw
|
| 612 |
+
goggle
|
| 613 |
+
halloween
|
| 614 |
+
city street
|
| 615 |
+
swirl
|
| 616 |
+
cord
|
| 617 |
+
forward
|
| 618 |
+
bone
|
| 619 |
+
bedding
|
| 620 |
+
archway
|
| 621 |
+
wig
|
| 622 |
+
lobby
|
| 623 |
+
mask
|
| 624 |
+
attic
|
| 625 |
+
kitchen table
|
| 626 |
+
skylight
|
| 627 |
+
fire
|
| 628 |
+
exit
|
| 629 |
+
oil painting
|
| 630 |
+
passenger
|
| 631 |
+
meditation
|
| 632 |
+
salmon
|
| 633 |
+
fedora
|
| 634 |
+
rubber stamp
|
| 635 |
+
orange juice
|
| 636 |
+
arch
|
| 637 |
+
scientist
|
| 638 |
+
stroll
|
| 639 |
+
manhattan
|
| 640 |
+
float
|
| 641 |
+
baseball uniform
|
| 642 |
+
circle
|
| 643 |
+
church
|
| 644 |
+
decker bus
|
| 645 |
+
competitor
|
| 646 |
+
zoo
|
| 647 |
+
basketball team
|
| 648 |
+
tourist
|
| 649 |
+
daughter
|
| 650 |
+
silverware
|
| 651 |
+
ceiling fan
|
| 652 |
+
birth
|
| 653 |
+
vase
|
| 654 |
+
jack
|
| 655 |
+
mushroom
|
| 656 |
+
spiral
|
| 657 |
+
cage
|
| 658 |
+
limb
|
| 659 |
+
salad
|
| 660 |
+
ad
|
| 661 |
+
control
|
| 662 |
+
earth
|
| 663 |
+
party
|
| 664 |
+
bolt
|
| 665 |
+
tractor
|
| 666 |
+
barley
|
| 667 |
+
wedding photo
|
| 668 |
+
hawk
|
| 669 |
+
warehouse
|
| 670 |
+
vegetable garden
|
| 671 |
+
chocolate cake
|
| 672 |
+
cabbage
|
| 673 |
+
floor window
|
| 674 |
+
baby shower
|
| 675 |
+
magnifying glass
|
| 676 |
+
table
|
| 677 |
+
stethoscope
|
| 678 |
+
reading
|
| 679 |
+
mission
|
| 680 |
+
croissant
|
| 681 |
+
gift box
|
| 682 |
+
rocket
|
| 683 |
+
forest road
|
| 684 |
+
cooking
|
| 685 |
+
suite
|
| 686 |
+
hill country
|
| 687 |
+
motorcycle
|
| 688 |
+
baseball player
|
| 689 |
+
angle
|
| 690 |
+
drug
|
| 691 |
+
sport association
|
| 692 |
+
championship
|
| 693 |
+
family portrait
|
| 694 |
+
florist
|
| 695 |
+
softball
|
| 696 |
+
egret
|
| 697 |
+
office
|
| 698 |
+
plywood
|
| 699 |
+
jockey
|
| 700 |
+
mosque
|
| 701 |
+
brunch
|
| 702 |
+
beanie
|
| 703 |
+
office building
|
| 704 |
+
pattern
|
| 705 |
+
calendar
|
| 706 |
+
indoor
|
| 707 |
+
pepper
|
| 708 |
+
ledge
|
| 709 |
+
trail
|
| 710 |
+
fuel
|
| 711 |
+
laptop computer
|
| 712 |
+
tennis shoe
|
| 713 |
+
deck chair
|
| 714 |
+
guitarist
|
| 715 |
+
barn
|
| 716 |
+
surgery
|
| 717 |
+
cartoon illustration
|
| 718 |
+
nebula
|
| 719 |
+
railroad
|
| 720 |
+
mountain goat
|
| 721 |
+
goose
|
| 722 |
+
car door
|
| 723 |
+
cheer
|
| 724 |
+
liquid
|
| 725 |
+
hardwood floor
|
| 726 |
+
pathway
|
| 727 |
+
acorn
|
| 728 |
+
gull
|
| 729 |
+
airliner
|
| 730 |
+
couch
|
| 731 |
+
lake house
|
| 732 |
+
spaghetti
|
| 733 |
+
promenade
|
| 734 |
+
collection
|
| 735 |
+
garden
|
| 736 |
+
bank
|
| 737 |
+
robin
|
| 738 |
+
tennis ball
|
| 739 |
+
peony
|
| 740 |
+
gymnast
|
| 741 |
+
lavender
|
| 742 |
+
deck
|
| 743 |
+
test
|
| 744 |
+
riverside
|
| 745 |
+
rapper
|
| 746 |
+
domino
|
| 747 |
+
bride
|
| 748 |
+
mouse
|
| 749 |
+
basil
|
| 750 |
+
wedding couple
|
| 751 |
+
ocean wave
|
| 752 |
+
arm
|
| 753 |
+
kitchen floor
|
| 754 |
+
grove
|
| 755 |
+
family member
|
| 756 |
+
backyard
|
| 757 |
+
raspberry
|
| 758 |
+
forest fire
|
| 759 |
+
officer
|
| 760 |
+
hibiscus
|
| 761 |
+
canyon
|
| 762 |
+
composer
|
| 763 |
+
signature
|
| 764 |
+
olive oil
|
| 765 |
+
hibiscus flower
|
| 766 |
+
rose
|
| 767 |
+
vector icon
|
| 768 |
+
sunrise
|
| 769 |
+
horseback
|
| 770 |
+
motor scooter
|
| 771 |
+
office worker
|
| 772 |
+
tradition
|
| 773 |
+
ingredient
|
| 774 |
+
washing machine
|
| 775 |
+
lighting
|
| 776 |
+
bagel
|
| 777 |
+
sailboat
|
| 778 |
+
policeman
|
| 779 |
+
mare
|
| 780 |
+
graphic
|
| 781 |
+
halloween pumpkin
|
| 782 |
+
stock
|
| 783 |
+
pilot
|
| 784 |
+
education
|
| 785 |
+
team
|
| 786 |
+
body
|
| 787 |
+
horse
|
| 788 |
+
kimono
|
| 789 |
+
bazaar
|
| 790 |
+
bag
|
| 791 |
+
recording studio
|
| 792 |
+
parsley
|
| 793 |
+
entrance
|
| 794 |
+
denim
|
| 795 |
+
vet
|
| 796 |
+
horse farm
|
| 797 |
+
charcoal
|
| 798 |
+
architecture
|
| 799 |
+
glass vase
|
| 800 |
+
puppy
|
| 801 |
+
estuary
|
| 802 |
+
television show host
|
| 803 |
+
city bus
|
| 804 |
+
shoulder
|
| 805 |
+
beast
|
| 806 |
+
balance
|
| 807 |
+
golfer
|
| 808 |
+
roadside
|
| 809 |
+
denim jacket
|
| 810 |
+
stone wall
|
| 811 |
+
counter top
|
| 812 |
+
app icon
|
| 813 |
+
toast
|
| 814 |
+
head coach
|
| 815 |
+
ham
|
| 816 |
+
warrior
|
| 817 |
+
gem
|
| 818 |
+
refrigerator
|
| 819 |
+
snowman
|
| 820 |
+
construction worker
|
| 821 |
+
coal
|
| 822 |
+
website
|
| 823 |
+
morning fog
|
| 824 |
+
mustard
|
| 825 |
+
human
|
| 826 |
+
owl
|
| 827 |
+
puppy dog
|
| 828 |
+
piggy bank
|
| 829 |
+
vegetation
|
| 830 |
+
pirate
|
| 831 |
+
action film
|
| 832 |
+
marshmallow
|
| 833 |
+
thanksgiving
|
| 834 |
+
business
|
| 835 |
+
disease
|
| 836 |
+
signage
|
| 837 |
+
greeting
|
| 838 |
+
skate park
|
| 839 |
+
tile
|
| 840 |
+
mouth
|
| 841 |
+
spinach
|
| 842 |
+
vacation
|
| 843 |
+
leader
|
| 844 |
+
shrine
|
| 845 |
+
walker
|
| 846 |
+
science fiction film
|
| 847 |
+
bill
|
| 848 |
+
rabbit
|
| 849 |
+
motor boat
|
| 850 |
+
bar
|
| 851 |
+
radio
|
| 852 |
+
barge
|
| 853 |
+
tail
|
| 854 |
+
chainsaw
|
| 855 |
+
gallery
|
| 856 |
+
rainbow
|
| 857 |
+
pasta
|
| 858 |
+
padlock
|
| 859 |
+
web
|
| 860 |
+
pastry
|
| 861 |
+
ink
|
| 862 |
+
reef
|
| 863 |
+
school uniform
|
| 864 |
+
shawl
|
| 865 |
+
treasure
|
| 866 |
+
peach
|
| 867 |
+
dinner table
|
| 868 |
+
injury
|
| 869 |
+
harbor
|
| 870 |
+
witch
|
| 871 |
+
car dealership
|
| 872 |
+
litter
|
| 873 |
+
gesture
|
| 874 |
+
documentary
|
| 875 |
+
marriage
|
| 876 |
+
sea shell
|
| 877 |
+
priest
|
| 878 |
+
dome
|
| 879 |
+
kit
|
| 880 |
+
icon
|
| 881 |
+
seaside
|
| 882 |
+
bucket
|
| 883 |
+
entertainment
|
| 884 |
+
stable
|
| 885 |
+
hat
|
| 886 |
+
puddle
|
| 887 |
+
sock
|
| 888 |
+
shopper
|
| 889 |
+
technology
|
| 890 |
+
harbour
|
| 891 |
+
orbit
|
| 892 |
+
antler
|
| 893 |
+
tube
|
| 894 |
+
flag waving
|
| 895 |
+
cook
|
| 896 |
+
tight
|
| 897 |
+
commander
|
| 898 |
+
farmland
|
| 899 |
+
switch
|
| 900 |
+
hiker
|
| 901 |
+
wedding ceremony
|
| 902 |
+
award ceremony
|
| 903 |
+
champion
|
| 904 |
+
chopstick
|
| 905 |
+
farmhouse
|
| 906 |
+
performer
|
| 907 |
+
spike
|
| 908 |
+
accident
|
| 909 |
+
cruise ship
|
| 910 |
+
passenger train
|
| 911 |
+
attraction
|
| 912 |
+
entertainer
|
| 913 |
+
rear view
|
| 914 |
+
sidewalk
|
| 915 |
+
parade
|
| 916 |
+
racing
|
| 917 |
+
plane
|
| 918 |
+
ritual
|
| 919 |
+
peacock
|
| 920 |
+
pocket
|
| 921 |
+
plum
|
| 922 |
+
drop
|
| 923 |
+
carrot
|
| 924 |
+
floor
|
| 925 |
+
sunset
|
| 926 |
+
troop
|
| 927 |
+
architect
|
| 928 |
+
coffee table
|
| 929 |
+
dust
|
| 930 |
+
outline
|
| 931 |
+
leather
|
| 932 |
+
charity event
|
| 933 |
+
heat
|
| 934 |
+
whale
|
| 935 |
+
laundry
|
| 936 |
+
coconut tree
|
| 937 |
+
crosswalk
|
| 938 |
+
pony
|
| 939 |
+
ant
|
| 940 |
+
pipe
|
| 941 |
+
string
|
| 942 |
+
coat
|
| 943 |
+
angel
|
| 944 |
+
beef
|
| 945 |
+
church tower
|
| 946 |
+
dish
|
| 947 |
+
pitch
|
| 948 |
+
cupboard
|
| 949 |
+
thermometer
|
| 950 |
+
dirt field
|
| 951 |
+
fireworks
|
| 952 |
+
minute
|
| 953 |
+
cane
|
| 954 |
+
pajama
|
| 955 |
+
flower garden
|
| 956 |
+
autumn
|
| 957 |
+
trash can
|
| 958 |
+
dachshund
|
| 959 |
+
banana tree
|
| 960 |
+
tray
|
| 961 |
+
moose
|
| 962 |
+
roadway
|
| 963 |
+
carnival
|
| 964 |
+
antenna
|
| 965 |
+
pole
|
| 966 |
+
castle wall
|
| 967 |
+
ram
|
| 968 |
+
cattle
|
| 969 |
+
hay
|
| 970 |
+
cookie
|
| 971 |
+
swimmer
|
| 972 |
+
baseball team
|
| 973 |
+
strait
|
| 974 |
+
hedge
|
| 975 |
+
jet
|
| 976 |
+
fire pit
|
| 977 |
+
octopus
|
| 978 |
+
calf
|
| 979 |
+
cube
|
| 980 |
+
opera
|
| 981 |
+
cardboard box
|
| 982 |
+
tiara
|
| 983 |
+
kitchen sink
|
| 984 |
+
prairie
|
| 985 |
+
bowl
|
| 986 |
+
galaxy
|
| 987 |
+
straw hat
|
| 988 |
+
linen
|
| 989 |
+
ski resort
|
| 990 |
+
stitch
|
| 991 |
+
street lamp
|
| 992 |
+
motorist
|
| 993 |
+
icicle
|
| 994 |
+
stain
|
| 995 |
+
flora
|
| 996 |
+
drain
|
| 997 |
+
kitchen cabinet
|
| 998 |
+
decor
|
| 999 |
+
bouquet
|
| 1000 |
+
pound
|
| 1001 |
+
interior design
|
| 1002 |
+
nail polish
|
| 1003 |
+
figurine
|
| 1004 |
+
tomb
|
| 1005 |
+
disc
|
| 1006 |
+
twist
|
| 1007 |
+
blouse
|
| 1008 |
+
ribbon
|
| 1009 |
+
figure
|
| 1010 |
+
burger
|
| 1011 |
+
cork
|
| 1012 |
+
soccer goalkeeper
|
| 1013 |
+
train bridge
|
| 1014 |
+
drinking water
|
| 1015 |
+
dew
|
| 1016 |
+
baker
|
| 1017 |
+
storm cloud
|
| 1018 |
+
tarmac
|
| 1019 |
+
tv drama
|
| 1020 |
+
sponge
|
| 1021 |
+
magnet
|
| 1022 |
+
sailor
|
| 1023 |
+
entry
|
| 1024 |
+
swan
|
| 1025 |
+
exercise
|
| 1026 |
+
sloth
|
| 1027 |
+
jewel
|
| 1028 |
+
scuba diver
|
| 1029 |
+
bite
|
| 1030 |
+
cat tree
|
| 1031 |
+
tent
|
| 1032 |
+
can
|
| 1033 |
+
tennis match
|
| 1034 |
+
ecosystem
|
| 1035 |
+
picket fence
|
| 1036 |
+
palm
|
| 1037 |
+
train car
|
| 1038 |
+
frying pan
|
| 1039 |
+
rally
|
| 1040 |
+
tablet pc
|
| 1041 |
+
reindeer
|
| 1042 |
+
image
|
| 1043 |
+
wolf
|
| 1044 |
+
chin
|
| 1045 |
+
conservatory
|
| 1046 |
+
flood water
|
| 1047 |
+
cityscape
|
| 1048 |
+
beach sand
|
| 1049 |
+
car park
|
| 1050 |
+
pavement
|
| 1051 |
+
farm field
|
| 1052 |
+
swimming
|
| 1053 |
+
winter storm
|
| 1054 |
+
stem
|
| 1055 |
+
pillow
|
| 1056 |
+
inning
|
| 1057 |
+
gorilla
|
| 1058 |
+
desk
|
| 1059 |
+
avenue
|
| 1060 |
+
fern
|
| 1061 |
+
money
|
| 1062 |
+
pearl
|
| 1063 |
+
train station
|
| 1064 |
+
skillet
|
| 1065 |
+
nap
|
| 1066 |
+
barber
|
| 1067 |
+
library
|
| 1068 |
+
freezer
|
| 1069 |
+
label
|
| 1070 |
+
rainforest
|
| 1071 |
+
parking sign
|
| 1072 |
+
mirror
|
| 1073 |
+
wing
|
| 1074 |
+
noodle
|
| 1075 |
+
press room
|
| 1076 |
+
sculpture
|
| 1077 |
+
tablet
|
| 1078 |
+
viewer
|
| 1079 |
+
prayer
|
| 1080 |
+
mini
|
| 1081 |
+
mechanic
|
| 1082 |
+
laugh
|
| 1083 |
+
rice field
|
| 1084 |
+
hand
|
| 1085 |
+
mustache
|
| 1086 |
+
mountain road
|
| 1087 |
+
catwalk
|
| 1088 |
+
conference
|
| 1089 |
+
cape
|
| 1090 |
+
installation
|
| 1091 |
+
musician
|
| 1092 |
+
stream
|
| 1093 |
+
machine
|
| 1094 |
+
speech
|
| 1095 |
+
crocodile
|
| 1096 |
+
soccer match
|
| 1097 |
+
town square
|
| 1098 |
+
passport
|
| 1099 |
+
post box
|
| 1100 |
+
point
|
| 1101 |
+
stone building
|
| 1102 |
+
motorway
|
| 1103 |
+
mix
|
| 1104 |
+
dentist
|
| 1105 |
+
businessperson
|
| 1106 |
+
happiness
|
| 1107 |
+
boat
|
| 1108 |
+
vineyard
|
| 1109 |
+
treadmill
|
| 1110 |
+
glass wall
|
| 1111 |
+
water droplet
|
| 1112 |
+
coffee mug
|
| 1113 |
+
graduate
|
| 1114 |
+
sunflower
|
| 1115 |
+
parliament
|
| 1116 |
+
shepherd
|
| 1117 |
+
movie
|
| 1118 |
+
wine
|
| 1119 |
+
orchard
|
| 1120 |
+
tulip
|
| 1121 |
+
motherboard
|
| 1122 |
+
cup
|
| 1123 |
+
broom
|
| 1124 |
+
spot
|
| 1125 |
+
drawing
|
| 1126 |
+
polo shirt
|
| 1127 |
+
graduation
|
| 1128 |
+
film producer
|
| 1129 |
+
moonlight
|
| 1130 |
+
glow
|
| 1131 |
+
film format
|
| 1132 |
+
t shirt
|
| 1133 |
+
rock face
|
| 1134 |
+
sword
|
| 1135 |
+
clinic
|
| 1136 |
+
festival day
|
| 1137 |
+
meadow
|
| 1138 |
+
staple
|
| 1139 |
+
pupil
|
| 1140 |
+
training ground
|
| 1141 |
+
rider
|
| 1142 |
+
flower
|
| 1143 |
+
foal
|
| 1144 |
+
wharf
|
| 1145 |
+
foot bridge
|
| 1146 |
+
shooting
|
| 1147 |
+
top
|
| 1148 |
+
mast
|
| 1149 |
+
police car
|
| 1150 |
+
robe
|
| 1151 |
+
wedding bouquet
|
| 1152 |
+
stop sign
|
| 1153 |
+
birthday cake
|
| 1154 |
+
glitter
|
| 1155 |
+
butter
|
| 1156 |
+
scooter
|
| 1157 |
+
tundra
|
| 1158 |
+
superhero
|
| 1159 |
+
pocket watch
|
| 1160 |
+
inscription
|
| 1161 |
+
youngster
|
| 1162 |
+
fruit tree
|
| 1163 |
+
movie poster
|
| 1164 |
+
engine
|
| 1165 |
+
foundation
|
| 1166 |
+
motorcyclist
|
| 1167 |
+
take
|
| 1168 |
+
woman
|
| 1169 |
+
antelope
|
| 1170 |
+
country artist
|
| 1171 |
+
road trip
|
| 1172 |
+
typewriter
|
| 1173 |
+
tuxedo
|
| 1174 |
+
brand
|
| 1175 |
+
pine
|
| 1176 |
+
bathroom
|
| 1177 |
+
paradise
|
| 1178 |
+
texture
|
| 1179 |
+
balloon
|
| 1180 |
+
dining table
|
| 1181 |
+
home
|
| 1182 |
+
computer screen
|
| 1183 |
+
actor
|
| 1184 |
+
clip
|
| 1185 |
+
tv tower
|
| 1186 |
+
panorama
|
| 1187 |
+
summit
|
| 1188 |
+
cat
|
| 1189 |
+
plot
|
| 1190 |
+
eagle
|
| 1191 |
+
dancer
|
| 1192 |
+
pup
|
| 1193 |
+
studio shot
|
| 1194 |
+
tear
|
| 1195 |
+
bird bath
|
| 1196 |
+
classroom
|
| 1197 |
+
bookstore
|
| 1198 |
+
city wall
|
| 1199 |
+
tv programme
|
| 1200 |
+
blade
|
| 1201 |
+
easel
|
| 1202 |
+
buttercream
|
| 1203 |
+
sweet
|
| 1204 |
+
designer
|
| 1205 |
+
diamond
|
| 1206 |
+
handshake
|
| 1207 |
+
herb
|
| 1208 |
+
corn field
|
| 1209 |
+
seafront
|
| 1210 |
+
concrete
|
| 1211 |
+
street artist
|
| 1212 |
+
gas
|
| 1213 |
+
stamp
|
| 1214 |
+
window display
|
| 1215 |
+
paper
|
| 1216 |
+
note
|
| 1217 |
+
pint
|
| 1218 |
+
quarry
|
| 1219 |
+
research
|
| 1220 |
+
fixture
|
| 1221 |
+
manager
|
| 1222 |
+
soil
|
| 1223 |
+
leopard
|
| 1224 |
+
board game
|
| 1225 |
+
ladder
|
| 1226 |
+
stop light
|
| 1227 |
+
island
|
| 1228 |
+
ramp
|
| 1229 |
+
football match
|
| 1230 |
+
icing
|
| 1231 |
+
drill
|
| 1232 |
+
currency
|
| 1233 |
+
summer evening
|
| 1234 |
+
topping
|
| 1235 |
+
pyramid
|
| 1236 |
+
pomegranate
|
| 1237 |
+
cell
|
| 1238 |
+
ivy
|
| 1239 |
+
squad
|
| 1240 |
+
scenery
|
| 1241 |
+
computer
|
| 1242 |
+
locomotive
|
| 1243 |
+
surf
|
| 1244 |
+
mascot
|
| 1245 |
+
dune
|
| 1246 |
+
path
|
| 1247 |
+
duck
|
| 1248 |
+
twilight
|
| 1249 |
+
wire
|
| 1250 |
+
bow tie
|
| 1251 |
+
strike
|
| 1252 |
+
cormorant
|
| 1253 |
+
car wash
|
| 1254 |
+
crane
|
| 1255 |
+
market
|
| 1256 |
+
philosopher
|
| 1257 |
+
alarm clock
|
| 1258 |
+
camera
|
| 1259 |
+
birch
|
| 1260 |
+
greeting card
|
| 1261 |
+
plain
|
| 1262 |
+
clay
|
| 1263 |
+
donut
|
| 1264 |
+
lock
|
| 1265 |
+
moth
|
| 1266 |
+
laboratory
|
| 1267 |
+
fan
|
| 1268 |
+
violin
|
| 1269 |
+
jazz fusion artist
|
| 1270 |
+
mountain biker
|
| 1271 |
+
terrain
|
| 1272 |
+
magazine
|
| 1273 |
+
pickup
|
| 1274 |
+
comedy film
|
| 1275 |
+
smartphone
|
| 1276 |
+
film
|
| 1277 |
+
bed
|
| 1278 |
+
microwave oven
|
| 1279 |
+
tournament
|
| 1280 |
+
lawn
|
| 1281 |
+
car window
|
| 1282 |
+
alligator
|
| 1283 |
+
screen
|
| 1284 |
+
jetty
|
| 1285 |
+
shopping bag
|
| 1286 |
+
landscape view
|
| 1287 |
+
cabinetry
|
| 1288 |
+
friendly match
|
| 1289 |
+
thing
|
| 1290 |
+
petal
|
| 1291 |
+
shopping center
|
| 1292 |
+
transport
|
| 1293 |
+
ballet dancer
|
| 1294 |
+
shoreline
|
| 1295 |
+
princess
|
| 1296 |
+
car seat
|
| 1297 |
+
parking meter
|
| 1298 |
+
green
|
| 1299 |
+
vodka
|
| 1300 |
+
band
|
| 1301 |
+
rock
|
| 1302 |
+
costume
|
| 1303 |
+
warning sign
|
| 1304 |
+
strip
|
| 1305 |
+
plaque
|
| 1306 |
+
wheelchair
|
| 1307 |
+
headband
|
| 1308 |
+
ginger
|
| 1309 |
+
dice
|
| 1310 |
+
media
|
| 1311 |
+
hairdresser
|
| 1312 |
+
press
|
| 1313 |
+
living room
|
| 1314 |
+
stove
|
| 1315 |
+
player
|
| 1316 |
+
cherry
|
| 1317 |
+
workshop
|
| 1318 |
+
carving
|
| 1319 |
+
embroidery
|
| 1320 |
+
doodle
|
| 1321 |
+
adventure
|
| 1322 |
+
rugby player
|
| 1323 |
+
monument
|
| 1324 |
+
brush
|
| 1325 |
+
marker
|
| 1326 |
+
loft
|
| 1327 |
+
postcard
|
| 1328 |
+
collage
|
| 1329 |
+
ball
|
| 1330 |
+
professor
|
| 1331 |
+
dresser
|
| 1332 |
+
gig
|
| 1333 |
+
festival
|
| 1334 |
+
blackbird
|
| 1335 |
+
makeup artist
|
| 1336 |
+
video camera
|
| 1337 |
+
sticker
|
| 1338 |
+
peak
|
| 1339 |
+
wildflower
|
| 1340 |
+
santa hat
|
| 1341 |
+
rodeo
|
| 1342 |
+
wedding photographer
|
| 1343 |
+
guy
|
| 1344 |
+
staff
|
| 1345 |
+
waterfall
|
| 1346 |
+
operation
|
| 1347 |
+
defender
|
| 1348 |
+
falcon
|
| 1349 |
+
haze
|
| 1350 |
+
individual
|
| 1351 |
+
gentleman
|
| 1352 |
+
greyhound
|
| 1353 |
+
rocking chair
|
| 1354 |
+
rice
|
| 1355 |
+
garbage
|
| 1356 |
+
platter
|
| 1357 |
+
chocolate
|
| 1358 |
+
splash
|
| 1359 |
+
business suit
|
| 1360 |
+
cheetah
|
| 1361 |
+
valley
|
| 1362 |
+
maze
|
| 1363 |
+
trampoline
|
| 1364 |
+
garland
|
| 1365 |
+
slalom
|
| 1366 |
+
unicorn
|
| 1367 |
+
tree stump
|
| 1368 |
+
painting
|
| 1369 |
+
romance
|
| 1370 |
+
fight
|
| 1371 |
+
alcohol
|
| 1372 |
+
ghost
|
| 1373 |
+
fondant
|
| 1374 |
+
spa
|
| 1375 |
+
shutter
|
| 1376 |
+
death
|
| 1377 |
+
demonstration
|
| 1378 |
+
cotton
|
| 1379 |
+
pier
|
| 1380 |
+
flea market
|
| 1381 |
+
history
|
| 1382 |
+
savannah
|
| 1383 |
+
fist
|
| 1384 |
+
aisle
|
| 1385 |
+
crew
|
| 1386 |
+
jug
|
| 1387 |
+
pose
|
| 1388 |
+
anchor
|
| 1389 |
+
teapot
|
| 1390 |
+
boat house
|
| 1391 |
+
business team
|
| 1392 |
+
tripod
|
| 1393 |
+
bee
|
| 1394 |
+
pebble
|
| 1395 |
+
mattress
|
| 1396 |
+
canvas
|
| 1397 |
+
hallway
|
| 1398 |
+
campaign
|
| 1399 |
+
pod
|
| 1400 |
+
lake district
|
| 1401 |
+
article
|
| 1402 |
+
white
|
| 1403 |
+
sofa
|
| 1404 |
+
honey
|
| 1405 |
+
marathon
|
| 1406 |
+
pancake
|
| 1407 |
+
tourist attraction
|
| 1408 |
+
wedding gown
|
| 1409 |
+
battle
|
| 1410 |
+
shelving
|
| 1411 |
+
sea
|
| 1412 |
+
sheet music
|
| 1413 |
+
pie
|
| 1414 |
+
yarn
|
| 1415 |
+
construction site
|
| 1416 |
+
flyer
|
| 1417 |
+
tie
|
| 1418 |
+
star
|
| 1419 |
+
lettuce
|
| 1420 |
+
martial artist
|
| 1421 |
+
dart
|
| 1422 |
+
straw
|
| 1423 |
+
reflection
|
| 1424 |
+
conference room
|
| 1425 |
+
temperature
|
| 1426 |
+
rugby
|
| 1427 |
+
mosquito
|
| 1428 |
+
physicist
|
| 1429 |
+
rock climber
|
| 1430 |
+
crash
|
| 1431 |
+
backdrop
|
| 1432 |
+
toilet seat
|
| 1433 |
+
sand castle
|
| 1434 |
+
water park
|
| 1435 |
+
toy car
|
| 1436 |
+
waste
|
| 1437 |
+
luxury
|
| 1438 |
+
hangar
|
| 1439 |
+
rv
|
| 1440 |
+
tree trunk
|
| 1441 |
+
board
|
| 1442 |
+
gold
|
| 1443 |
+
project picture
|
| 1444 |
+
cap
|
| 1445 |
+
cottage
|
| 1446 |
+
relief
|
| 1447 |
+
attire
|
| 1448 |
+
microscope
|
| 1449 |
+
battery
|
| 1450 |
+
roll
|
| 1451 |
+
line
|
| 1452 |
+
parking garage
|
| 1453 |
+
crystal
|
| 1454 |
+
broadcasting
|
| 1455 |
+
brick wall
|
| 1456 |
+
lab
|
| 1457 |
+
flooring
|
| 1458 |
+
meeting
|
| 1459 |
+
3d cg rendering
|
| 1460 |
+
desktop computer
|
| 1461 |
+
cowboy
|
| 1462 |
+
sailing ship
|
| 1463 |
+
junction
|
| 1464 |
+
hairstyle
|
| 1465 |
+
homework
|
| 1466 |
+
profile
|
| 1467 |
+
model
|
| 1468 |
+
flower pot
|
| 1469 |
+
street light
|
| 1470 |
+
salt lake
|
| 1471 |
+
maple
|
| 1472 |
+
space
|
| 1473 |
+
blizzard
|
| 1474 |
+
throw
|
| 1475 |
+
zebras
|
| 1476 |
+
brochure
|
| 1477 |
+
constellation
|
| 1478 |
+
beak
|
| 1479 |
+
kilt
|
| 1480 |
+
pond
|
| 1481 |
+
blue sky
|
| 1482 |
+
sneaker
|
| 1483 |
+
sand dune
|
| 1484 |
+
morning sun
|
| 1485 |
+
almond
|
| 1486 |
+
grill
|
| 1487 |
+
curl
|
| 1488 |
+
basketball girl game
|
| 1489 |
+
chameleon
|
| 1490 |
+
toilet bowl
|
| 1491 |
+
prince
|
| 1492 |
+
keyboard
|
| 1493 |
+
queen
|
| 1494 |
+
computer monitor
|
| 1495 |
+
writing
|
| 1496 |
+
crown
|
| 1497 |
+
basilica
|
| 1498 |
+
kiss
|
| 1499 |
+
house
|
| 1500 |
+
parking
|
| 1501 |
+
football competition
|
| 1502 |
+
shell
|
| 1503 |
+
sport equipment
|
| 1504 |
+
comedy
|
| 1505 |
+
baboon
|
| 1506 |
+
vendor
|
| 1507 |
+
rise building
|
| 1508 |
+
wrap
|
| 1509 |
+
food truck
|
| 1510 |
+
cat bed
|
| 1511 |
+
rickshaw
|
| 1512 |
+
flare
|
| 1513 |
+
teal
|
| 1514 |
+
nectar
|
| 1515 |
+
eclipse
|
| 1516 |
+
vehicle
|
| 1517 |
+
steam locomotive
|
| 1518 |
+
gorge
|
| 1519 |
+
cow
|
| 1520 |
+
christmas card
|
| 1521 |
+
demonstrator
|
| 1522 |
+
memorial
|
| 1523 |
+
towel
|
| 1524 |
+
jewellery
|
| 1525 |
+
train
|
| 1526 |
+
frisbee
|
| 1527 |
+
baseball game
|
| 1528 |
+
fur
|
| 1529 |
+
afternoon sun
|
| 1530 |
+
community
|
| 1531 |
+
sparkler
|
| 1532 |
+
bandage
|
| 1533 |
+
firework
|
| 1534 |
+
dollar
|
| 1535 |
+
pasture
|
| 1536 |
+
video
|
| 1537 |
+
bus
|
| 1538 |
+
tree house
|
| 1539 |
+
seashore
|
| 1540 |
+
field
|
| 1541 |
+
hamburger
|
| 1542 |
+
souvenir
|
| 1543 |
+
hedgehog
|
| 1544 |
+
worm
|
| 1545 |
+
pine cone
|
| 1546 |
+
osprey
|
| 1547 |
+
dinosaur
|
| 1548 |
+
vegetable
|
| 1549 |
+
junk
|
| 1550 |
+
poster
|
| 1551 |
+
army
|
| 1552 |
+
winger
|
| 1553 |
+
bundle
|
| 1554 |
+
stage
|
| 1555 |
+
growth
|
| 1556 |
+
wedding party
|
| 1557 |
+
service
|
| 1558 |
+
blanket
|
| 1559 |
+
ruler
|
| 1560 |
+
eye
|
| 1561 |
+
credit card
|
| 1562 |
+
castle
|
| 1563 |
+
diner
|
| 1564 |
+
hut
|
| 1565 |
+
elk
|
| 1566 |
+
hard rock artist
|
| 1567 |
+
nun
|
| 1568 |
+
dog breed
|
| 1569 |
+
nest
|
| 1570 |
+
drama film
|
| 1571 |
+
number icon
|
| 1572 |
+
water tank
|
| 1573 |
+
giraffe
|
| 1574 |
+
altar
|
| 1575 |
+
pavilion
|
| 1576 |
+
tv personality
|
| 1577 |
+
suv
|
| 1578 |
+
street vendor
|
| 1579 |
+
street sign
|
| 1580 |
+
ditch
|
| 1581 |
+
debris
|
| 1582 |
+
foam
|
| 1583 |
+
takeoff
|
| 1584 |
+
spice
|
| 1585 |
+
mountain lake
|
| 1586 |
+
tea
|
| 1587 |
+
orchestra
|
| 1588 |
+
spacecraft
|
| 1589 |
+
counter
|
| 1590 |
+
abbey
|
| 1591 |
+
mountain
|
| 1592 |
+
hydrangea
|
| 1593 |
+
racer
|
| 1594 |
+
orange tree
|
| 1595 |
+
tide
|
| 1596 |
+
cowboy hat
|
| 1597 |
+
rapid
|
| 1598 |
+
town
|
| 1599 |
+
wild
|
| 1600 |
+
herd
|
| 1601 |
+
vein
|
| 1602 |
+
driveway
|
| 1603 |
+
jar
|
| 1604 |
+
bark
|
| 1605 |
+
illustration
|
| 1606 |
+
horror film
|
| 1607 |
+
corn
|
| 1608 |
+
stroller
|
| 1609 |
+
industry
|
| 1610 |
+
mountain stream
|
| 1611 |
+
gym
|
| 1612 |
+
neckline
|
| 1613 |
+
pan
|
| 1614 |
+
client
|
| 1615 |
+
spectator
|
| 1616 |
+
eggplant
|
| 1617 |
+
camper
|
| 1618 |
+
fawn
|
| 1619 |
+
hoodie
|
| 1620 |
+
meat
|
| 1621 |
+
lemonade
|
| 1622 |
+
food market
|
| 1623 |
+
slum
|
| 1624 |
+
comic book character
|
| 1625 |
+
flower market
|
| 1626 |
+
love
|
| 1627 |
+
palace
|
| 1628 |
+
gun
|
| 1629 |
+
heel
|
| 1630 |
+
shopping street
|
| 1631 |
+
shooting basketball guard
|
| 1632 |
+
family photo
|
| 1633 |
+
rooftop
|
| 1634 |
+
laundry basket
|
| 1635 |
+
airport runway
|
| 1636 |
+
horn
|
| 1637 |
+
face mask
|
| 1638 |
+
flight
|
| 1639 |
+
appetizer
|
| 1640 |
+
violet
|
| 1641 |
+
country lane
|
| 1642 |
+
cement
|
| 1643 |
+
instrument
|
| 1644 |
+
tv actor
|
| 1645 |
+
spark
|
| 1646 |
+
celebrity
|
| 1647 |
+
award
|
| 1648 |
+
country house
|
| 1649 |
+
standing
|
| 1650 |
+
auction
|
| 1651 |
+
date
|
| 1652 |
+
engagement
|
| 1653 |
+
puck
|
| 1654 |
+
advertisement
|
| 1655 |
+
chair
|
| 1656 |
+
zebra
|
| 1657 |
+
driftwood
|
| 1658 |
+
bumblebee
|
| 1659 |
+
maple leaf
|
| 1660 |
+
bonnet
|
| 1661 |
+
orange
|
| 1662 |
+
water tower
|
| 1663 |
+
door
|
| 1664 |
+
singer
|
| 1665 |
+
floor plan
|
| 1666 |
+
discussion
|
| 1667 |
+
theatre
|
| 1668 |
+
pilgrim
|
| 1669 |
+
mug
|
| 1670 |
+
branch
|
| 1671 |
+
window sill
|
| 1672 |
+
baseball pitcher
|
| 1673 |
+
bakery
|
| 1674 |
+
lollipop
|
| 1675 |
+
basketball player
|
| 1676 |
+
toilet paper
|
| 1677 |
+
chalkboard
|
| 1678 |
+
cabin
|
| 1679 |
+
sign
|
| 1680 |
+
night sky
|
| 1681 |
+
cannon
|
| 1682 |
+
fishing net
|
| 1683 |
+
submarine
|
| 1684 |
+
suit
|
| 1685 |
+
fur coat
|
| 1686 |
+
wine bottle
|
| 1687 |
+
folder
|
| 1688 |
+
street art
|
| 1689 |
+
suspension bridge
|
| 1690 |
+
evening sky
|
| 1691 |
+
billboard
|
| 1692 |
+
postage stamp
|
| 1693 |
+
newspaper
|
| 1694 |
+
transportation
|
| 1695 |
+
surgeon
|
| 1696 |
+
light
|
| 1697 |
+
park
|
| 1698 |
+
horizon
|
| 1699 |
+
road
|
| 1700 |
+
sand bar
|
| 1701 |
+
trumpet
|
| 1702 |
+
lounge
|
| 1703 |
+
cloud forest
|
| 1704 |
+
birthday celebration
|
| 1705 |
+
balcony
|
| 1706 |
+
anime
|
| 1707 |
+
beehive
|
| 1708 |
+
umbrella
|
| 1709 |
+
goldfish
|
| 1710 |
+
baseball cap
|
| 1711 |
+
waterhole
|
| 1712 |
+
ceiling
|
| 1713 |
+
carousel
|
| 1714 |
+
backpack
|
| 1715 |
+
plant pot
|
| 1716 |
+
atmosphere
|
| 1717 |
+
sunflower field
|
| 1718 |
+
spire
|
| 1719 |
+
vision
|
| 1720 |
+
woodpecker
|
| 1721 |
+
chip
|
| 1722 |
+
pool table
|
| 1723 |
+
lotus flower
|
| 1724 |
+
cone
|
| 1725 |
+
humpback whale
|
| 1726 |
+
reservoir
|
| 1727 |
+
hunt
|
| 1728 |
+
piano
|
| 1729 |
+
plate
|
| 1730 |
+
dining area
|
| 1731 |
+
luggage
|
| 1732 |
+
skier
|
| 1733 |
+
dance floor
|
| 1734 |
+
crow
|
| 1735 |
+
stair
|
| 1736 |
+
overpass
|
| 1737 |
+
opera house
|
| 1738 |
+
bear
|
| 1739 |
+
jazz artist
|
| 1740 |
+
water
|
| 1741 |
+
vessel
|
| 1742 |
+
cast
|
| 1743 |
+
yard
|
| 1744 |
+
cathedral
|
| 1745 |
+
basketball hoop
|
| 1746 |
+
graveyard
|
| 1747 |
+
sound
|
| 1748 |
+
berry
|
| 1749 |
+
onlooker
|
| 1750 |
+
fauna
|
| 1751 |
+
birch tree
|
| 1752 |
+
retail
|
| 1753 |
+
hill
|
| 1754 |
+
skeleton
|
| 1755 |
+
journalist
|
| 1756 |
+
frost
|
| 1757 |
+
basket
|
tag_list.txt, continued. Entries 1758–3012 complete the object and scene vocabulary, running from nail, dusk, trash, dawn, clover, hen and volcano through heart shape and kayak. Entries 3013–3252 are action and relation tags, from stare, sit with, direct, read and photograph through cook, receive, celebrate and look. Entries 3253–3429 are attribute tags, from classic, bridal, indoor, industrial and teenage through square, ornamental, sandy and thin. The full 3,429-line vocabulary is added as tag2text/data/tag_list.txt.
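Each line of tag_list.txt is a single tag, and the indices in the delete_tag_index list used by inference.py below appear to index this file zero-based (index 2961 lands on "back", 3359 on "one"). A minimal sketch of building an index-to-tag lookup, assuming the file is read from its repository path:

# Sketch: load the tag vocabulary and resolve the indices excluded from captioning.
# Zero-based indexing is an assumption inferred from the delete_tag_index comment in inference.py.
with open("tag2text/data/tag_list.txt", encoding="utf-8") as f:
    tag_list = [line.rstrip("\n") for line in f]

delete_tag_index = [127, 2961, 3351, 3265, 3338, 3355, 3359]
for idx in delete_tag_index:
    print(idx, "->", tag_list[idx])  # tags the model skips when captioning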
tag2text/inference.py
ADDED
@@ -0,0 +1,102 @@
"""
* Tag2Text
* Written by Xinyu Huang
"""
import argparse
import random

import numpy as np
import torch
import torchvision.transforms as transforms
from models.tag2text import tag2text_caption
from PIL import Image

parser = argparse.ArgumentParser(
    description="Tag2Text inference for tagging and captioning"
)
parser.add_argument(
    "--image",
    metavar="DIR",
    help="path to dataset",
    default="images/1641173_2291260800.jpg",
)
parser.add_argument(
    "--pretrained",
    metavar="DIR",
    help="path to pretrained model",
    default="pretrained/tag2text_swin_14m.pth",
)
parser.add_argument(
    "--image-size",
    default=384,
    type=int,
    metavar="N",
    help="input image size (default: 384)",
)
parser.add_argument(
    "--thre", default=0.68, type=float, metavar="N", help="threshold value"
)
parser.add_argument(
    "--specified-tags", default="None", help="User input specified tags"
)


def inference(image, model, input_tag="None"):
    with torch.no_grad():
        caption, tag_predict = model.generate(
            image, tag_input=None, max_length=50, return_tag_predict=True
        )

    if input_tag == "" or input_tag == "none" or input_tag == "None":
        return tag_predict[0], None, caption[0]

    # If user input specified tags:
    else:
        input_tag_list = []
        input_tag_list.append(input_tag.replace(",", " | "))

        with torch.no_grad():
            caption, input_tag = model.generate(
                image, tag_input=input_tag_list, max_length=50, return_tag_predict=True
            )

        return tag_predict[0], input_tag[0], caption[0]


if __name__ == "__main__":
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )
    transform = transforms.Compose(
        [
            transforms.Resize((args.image_size, args.image_size)),
            transforms.ToTensor(),
            normalize,
        ]
    )

    # delete some tags that may disturb captioning
    # 127: "quarter"; 2961: "back", 3351: "two"; 3265: "three"; 3338: "four"; 3355: "five"; 3359: "one"
    delete_tag_index = [127, 2961, 3351, 3265, 3338, 3355, 3359]

    # load model
    model = tag2text_caption(
        pretrained=args.pretrained,
        image_size=args.image_size,
        vit="swin_b",
        delete_tag_index=delete_tag_index,
    )
    model.threshold = args.thre  # threshold for tagging
    model.eval()

    model = model.to(device)
    raw_image = Image.open(args.image).resize((args.image_size, args.image_size))
    image = transform(raw_image).unsqueeze(0).to(device)

    res = inference(image, model, args.specified_tags)
    print("Model Identified Tags: ", res[0])
    print("User Specified Tags: ", res[1])
    print("Image Caption: ", res[2])
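The same helper can be driven programmatically once the checkpoint is loaded. The sketch below mirrors the __main__ block above, assuming inference() is imported from this module; the image path is a placeholder and the other values match the script defaults.

# Sketch: reuse inference() outside the CLI. Paths are placeholders.
import torch
import torchvision.transforms as transforms
from PIL import Image
from models.tag2text import tag2text_caption

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transform = transforms.Compose(
    [
        transforms.Resize((384, 384)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)
model = tag2text_caption(
    pretrained="pretrained/tag2text_swin_14m.pth",
    image_size=384,
    vit="swin_b",
    delete_tag_index=[127, 2961, 3351, 3265, 3338, 3355, 3359],
)
model.threshold = 0.68  # tagging threshold, as in the script above
model.eval()
model = model.to(device)

image = transform(Image.open("path/to/image.jpg").convert("RGB")).unsqueeze(0).to(device)
tags, user_tags, caption = inference(image, model, "dog,grass")
print(tags, caption)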
tag2text/models/bert.py
ADDED
@@ -0,0 +1,1157 @@
| 1 |
+
"""
|
| 2 |
+
* Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
* All rights reserved.
|
| 4 |
+
* SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
* By Junnan Li
|
| 7 |
+
* Based on huggingface code base
|
| 8 |
+
* https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert
|
| 9 |
+
"""
|
| 10 |
+
import math
|
| 11 |
+
import os
|
| 12 |
+
import warnings
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
from typing import Optional
|
| 15 |
+
from typing import Tuple
|
| 16 |
+
|
| 17 |
+
import torch.nn.functional as F
|
| 18 |
+
import torch.utils.checkpoint
|
| 19 |
+
from torch import device
|
| 20 |
+
from torch import dtype
|
| 21 |
+
from torch import nn
|
| 22 |
+
from torch import Tensor
|
| 23 |
+
from torch.nn import CrossEntropyLoss
|
| 24 |
+
from transformers.activations import ACT2FN
|
| 25 |
+
from transformers.file_utils import (
|
| 26 |
+
ModelOutput,
|
| 27 |
+
)
|
| 28 |
+
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
|
| 29 |
+
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
|
| 30 |
+
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
|
| 31 |
+
from transformers.modeling_outputs import MaskedLMOutput
|
| 32 |
+
from transformers.modeling_outputs import MultipleChoiceModelOutput
|
| 33 |
+
from transformers.modeling_outputs import NextSentencePredictorOutput
|
| 34 |
+
from transformers.modeling_outputs import QuestionAnsweringModelOutput
|
| 35 |
+
from transformers.modeling_outputs import SequenceClassifierOutput
|
| 36 |
+
from transformers.modeling_outputs import TokenClassifierOutput
|
| 37 |
+
from transformers.modeling_utils import apply_chunking_to_forward
|
| 38 |
+
from transformers.modeling_utils import find_pruneable_heads_and_indices
|
| 39 |
+
from transformers.modeling_utils import PreTrainedModel
|
| 40 |
+
from transformers.modeling_utils import prune_linear_layer
|
| 41 |
+
from transformers.models.bert.configuration_bert import BertConfig
|
| 42 |
+
from transformers.utils import logging
|
| 43 |
+
|
| 44 |
+
logger = logging.get_logger(__name__)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class BertEmbeddings_nopos(nn.Module):
|
| 48 |
+
"""Construct the embeddings from word and position embeddings."""
|
| 49 |
+
|
| 50 |
+
def __init__(self, config):
|
| 51 |
+
super().__init__()
|
| 52 |
+
self.word_embeddings = nn.Embedding(
|
| 53 |
+
config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
|
| 54 |
+
)
|
| 55 |
+
# self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
|
| 56 |
+
|
| 57 |
+
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
|
| 58 |
+
# any TensorFlow checkpoint file
|
| 59 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 60 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 61 |
+
|
| 62 |
+
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
| 63 |
+
# self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
|
| 64 |
+
# self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
|
| 65 |
+
|
| 66 |
+
self.config = config
|
| 67 |
+
|
| 68 |
+
def forward(
|
| 69 |
+
self,
|
| 70 |
+
input_ids=None,
|
| 71 |
+
position_ids=None,
|
| 72 |
+
inputs_embeds=None,
|
| 73 |
+
past_key_values_length=0,
|
| 74 |
+
):
|
| 75 |
+
if input_ids is not None:
|
| 76 |
+
input_shape = input_ids.size()
|
| 77 |
+
else:
|
| 78 |
+
input_shape = inputs_embeds.size()[:-1]
|
| 79 |
+
|
| 80 |
+
seq_length = input_shape[1]
|
| 81 |
+
|
| 82 |
+
# if position_ids is None:
|
| 83 |
+
# position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
|
| 84 |
+
|
| 85 |
+
if inputs_embeds is None:
|
| 86 |
+
inputs_embeds = self.word_embeddings(input_ids)
|
| 87 |
+
|
| 88 |
+
embeddings = inputs_embeds
|
| 89 |
+
|
| 90 |
+
# if self.position_embedding_type == "absolute":
|
| 91 |
+
# position_embeddings = self.position_embeddings(position_ids)
|
| 92 |
+
# # print('add position_embeddings!!!!')
|
| 93 |
+
# embeddings += position_embeddings
|
| 94 |
+
embeddings = self.LayerNorm(embeddings)
|
| 95 |
+
embeddings = self.dropout(embeddings)
|
| 96 |
+
return embeddings
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class BertEmbeddings(nn.Module):
|
| 100 |
+
"""Construct the embeddings from word and position embeddings."""
|
| 101 |
+
|
| 102 |
+
def __init__(self, config):
|
| 103 |
+
super().__init__()
|
| 104 |
+
self.word_embeddings = nn.Embedding(
|
| 105 |
+
config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
|
| 106 |
+
)
|
| 107 |
+
self.position_embeddings = nn.Embedding(
|
| 108 |
+
config.max_position_embeddings, config.hidden_size
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
|
| 112 |
+
# any TensorFlow checkpoint file
|
| 113 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 114 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 115 |
+
|
| 116 |
+
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
| 117 |
+
self.register_buffer(
|
| 118 |
+
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))
|
| 119 |
+
)
|
| 120 |
+
self.position_embedding_type = getattr(
|
| 121 |
+
config, "position_embedding_type", "absolute"
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
self.config = config
|
| 125 |
+
|
| 126 |
+
def forward(
|
| 127 |
+
self,
|
| 128 |
+
input_ids=None,
|
| 129 |
+
position_ids=None,
|
| 130 |
+
inputs_embeds=None,
|
| 131 |
+
past_key_values_length=0,
|
| 132 |
+
):
|
| 133 |
+
if input_ids is not None:
|
| 134 |
+
input_shape = input_ids.size()
|
| 135 |
+
else:
|
| 136 |
+
input_shape = inputs_embeds.size()[:-1]
|
| 137 |
+
|
| 138 |
+
seq_length = input_shape[1]
|
| 139 |
+
|
| 140 |
+
if position_ids is None:
|
| 141 |
+
position_ids = self.position_ids[
|
| 142 |
+
:, past_key_values_length : seq_length + past_key_values_length
|
| 143 |
+
]
|
| 144 |
+
|
| 145 |
+
if inputs_embeds is None:
|
| 146 |
+
inputs_embeds = self.word_embeddings(input_ids)
|
| 147 |
+
|
| 148 |
+
embeddings = inputs_embeds
|
| 149 |
+
|
| 150 |
+
if self.position_embedding_type == "absolute":
|
| 151 |
+
position_embeddings = self.position_embeddings(position_ids)
|
| 152 |
+
# print('add position_embeddings!!!!')
|
| 153 |
+
embeddings += position_embeddings
|
| 154 |
+
embeddings = self.LayerNorm(embeddings)
|
| 155 |
+
embeddings = self.dropout(embeddings)
|
| 156 |
+
return embeddings
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class BertSelfAttention(nn.Module):
|
| 160 |
+
def __init__(self, config, is_cross_attention):
|
| 161 |
+
super().__init__()
|
| 162 |
+
self.config = config
|
| 163 |
+
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
|
| 164 |
+
config, "embedding_size"
|
| 165 |
+
):
|
| 166 |
+
raise ValueError(
|
| 167 |
+
"The hidden size (%d) is not a multiple of the number of attention "
|
| 168 |
+
"heads (%d)" % (config.hidden_size, config.num_attention_heads)
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
self.num_attention_heads = config.num_attention_heads
|
| 172 |
+
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
|
| 173 |
+
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
| 174 |
+
|
| 175 |
+
self.query = nn.Linear(config.hidden_size, self.all_head_size)
|
| 176 |
+
if is_cross_attention:
|
| 177 |
+
self.key = nn.Linear(config.encoder_width, self.all_head_size)
|
| 178 |
+
self.value = nn.Linear(config.encoder_width, self.all_head_size)
|
| 179 |
+
else:
|
| 180 |
+
self.key = nn.Linear(config.hidden_size, self.all_head_size)
|
| 181 |
+
self.value = nn.Linear(config.hidden_size, self.all_head_size)
|
| 182 |
+
|
| 183 |
+
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
| 184 |
+
self.position_embedding_type = getattr(
|
| 185 |
+
config, "position_embedding_type", "absolute"
|
| 186 |
+
)
|
| 187 |
+
if (
|
| 188 |
+
self.position_embedding_type == "relative_key"
|
| 189 |
+
or self.position_embedding_type == "relative_key_query"
|
| 190 |
+
):
|
| 191 |
+
self.max_position_embeddings = config.max_position_embeddings
|
| 192 |
+
self.distance_embedding = nn.Embedding(
|
| 193 |
+
2 * config.max_position_embeddings - 1, self.attention_head_size
|
| 194 |
+
)
|
| 195 |
+
self.save_attention = False
|
| 196 |
+
|
| 197 |
+
def save_attn_gradients(self, attn_gradients):
|
| 198 |
+
self.attn_gradients = attn_gradients
|
| 199 |
+
|
| 200 |
+
def get_attn_gradients(self):
|
| 201 |
+
return self.attn_gradients
|
| 202 |
+
|
| 203 |
+
def save_attention_map(self, attention_map):
|
| 204 |
+
self.attention_map = attention_map
|
| 205 |
+
|
| 206 |
+
def get_attention_map(self):
|
| 207 |
+
return self.attention_map
|
| 208 |
+
|
| 209 |
+
def transpose_for_scores(self, x):
|
| 210 |
+
new_x_shape = x.size()[:-1] + (
|
| 211 |
+
self.num_attention_heads,
|
| 212 |
+
self.attention_head_size,
|
| 213 |
+
)
|
| 214 |
+
x = x.view(*new_x_shape)
|
| 215 |
+
return x.permute(0, 2, 1, 3)
|
| 216 |
+
|
| 217 |
+
def forward(
|
| 218 |
+
self,
|
| 219 |
+
hidden_states,
|
| 220 |
+
attention_mask=None,
|
| 221 |
+
head_mask=None,
|
| 222 |
+
encoder_hidden_states=None,
|
| 223 |
+
encoder_attention_mask=None,
|
| 224 |
+
past_key_value=None,
|
| 225 |
+
output_attentions=False,
|
| 226 |
+
):
|
| 227 |
+
mixed_query_layer = self.query(hidden_states)
|
| 228 |
+
|
| 229 |
+
# If this is instantiated as a cross-attention module, the keys
|
| 230 |
+
# and values come from an encoder; the attention mask needs to be
|
| 231 |
+
# such that the encoder's padding tokens are not attended to.
|
| 232 |
+
is_cross_attention = encoder_hidden_states is not None
|
| 233 |
+
|
| 234 |
+
if is_cross_attention:
|
| 235 |
+
# print(self.key.weight.shape)
|
| 236 |
+
key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
|
| 237 |
+
value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
|
| 238 |
+
attention_mask = encoder_attention_mask
|
| 239 |
+
elif past_key_value is not None:
|
| 240 |
+
key_layer = self.transpose_for_scores(self.key(hidden_states))
|
| 241 |
+
value_layer = self.transpose_for_scores(self.value(hidden_states))
|
| 242 |
+
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
|
| 243 |
+
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
|
| 244 |
+
else:
|
| 245 |
+
key_layer = self.transpose_for_scores(self.key(hidden_states))
|
| 246 |
+
value_layer = self.transpose_for_scores(self.value(hidden_states))
|
| 247 |
+
|
| 248 |
+
query_layer = self.transpose_for_scores(mixed_query_layer)
|
| 249 |
+
|
| 250 |
+
past_key_value = (key_layer, value_layer)
|
| 251 |
+
|
| 252 |
+
# compatible with higher versions of transformers
|
| 253 |
+
if key_layer.shape[0] > query_layer.shape[0]:
|
| 254 |
+
key_layer = key_layer[: query_layer.shape[0], :, :, :]
|
| 255 |
+
attention_mask = attention_mask[: query_layer.shape[0], :, :]
|
| 256 |
+
value_layer = value_layer[: query_layer.shape[0], :, :, :]
|
| 257 |
+
|
| 258 |
+
# Take the dot product between "query" and "key" to get the raw attention scores.
|
| 259 |
+
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
| 260 |
+
|
| 261 |
+
if (
|
| 262 |
+
self.position_embedding_type == "relative_key"
|
| 263 |
+
or self.position_embedding_type == "relative_key_query"
|
| 264 |
+
):
|
| 265 |
+
seq_length = hidden_states.size()[1]
|
| 266 |
+
position_ids_l = torch.arange(
|
| 267 |
+
seq_length, dtype=torch.long, device=hidden_states.device
|
| 268 |
+
).view(-1, 1)
|
| 269 |
+
position_ids_r = torch.arange(
|
| 270 |
+
seq_length, dtype=torch.long, device=hidden_states.device
|
| 271 |
+
).view(1, -1)
|
| 272 |
+
distance = position_ids_l - position_ids_r
|
| 273 |
+
positional_embedding = self.distance_embedding(
|
| 274 |
+
distance + self.max_position_embeddings - 1
|
| 275 |
+
)
|
| 276 |
+
positional_embedding = positional_embedding.to(
|
| 277 |
+
dtype=query_layer.dtype
|
| 278 |
+
) # fp16 compatibility
|
| 279 |
+
|
| 280 |
+
if self.position_embedding_type == "relative_key":
|
| 281 |
+
relative_position_scores = torch.einsum(
|
| 282 |
+
"bhld,lrd->bhlr", query_layer, positional_embedding
|
| 283 |
+
)
|
| 284 |
+
attention_scores = attention_scores + relative_position_scores
|
| 285 |
+
elif self.position_embedding_type == "relative_key_query":
|
| 286 |
+
relative_position_scores_query = torch.einsum(
|
| 287 |
+
"bhld,lrd->bhlr", query_layer, positional_embedding
|
| 288 |
+
)
|
| 289 |
+
relative_position_scores_key = torch.einsum(
|
| 290 |
+
"bhrd,lrd->bhlr", key_layer, positional_embedding
|
| 291 |
+
)
|
| 292 |
+
attention_scores = (
|
| 293 |
+
attention_scores
|
| 294 |
+
+ relative_position_scores_query
|
| 295 |
+
+ relative_position_scores_key
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
| 299 |
+
if attention_mask is not None:
|
| 300 |
+
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
|
| 301 |
+
attention_scores = attention_scores + attention_mask
|
| 302 |
+
|
| 303 |
+
# Normalize the attention scores to probabilities.
|
| 304 |
+
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
| 305 |
+
|
| 306 |
+
if is_cross_attention and self.save_attention:
|
| 307 |
+
self.save_attention_map(attention_probs)
|
| 308 |
+
attention_probs.register_hook(self.save_attn_gradients)
|
| 309 |
+
|
| 310 |
+
# This is actually dropping out entire tokens to attend to, which might
|
| 311 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
| 312 |
+
attention_probs_dropped = self.dropout(attention_probs)
|
| 313 |
+
|
| 314 |
+
# Mask heads if we want to
|
| 315 |
+
if head_mask is not None:
|
| 316 |
+
attention_probs_dropped = attention_probs_dropped * head_mask
|
| 317 |
+
|
| 318 |
+
context_layer = torch.matmul(attention_probs_dropped, value_layer)
|
| 319 |
+
|
| 320 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
| 321 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
| 322 |
+
context_layer = context_layer.view(*new_context_layer_shape)
|
| 323 |
+
|
| 324 |
+
outputs = (
|
| 325 |
+
(context_layer, attention_probs) if output_attentions else (context_layer,)
|
| 326 |
+
)
|
| 327 |
+
|
| 328 |
+
outputs = outputs + (past_key_value,)
|
| 329 |
+
return outputs
|
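For the `relative_key` / `relative_key_query` branch above, the einsum simply contracts the head dimension of the queries (or keys) with a (seq_len, seq_len, head_dim) table of distance embeddings. A shape-only sketch, standalone and purely illustrative (the sizes are made up):

import torch

B, num_heads, seq_len, head_dim = 2, 12, 5, 64
query_layer = torch.randn(B, num_heads, seq_len, head_dim)
positional_embedding = torch.randn(seq_len, seq_len, head_dim)   # one embedding per (query pos, key pos) distance
relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
print(relative_position_scores.shape)   # torch.Size([2, 12, 5, 5]) -- added onto the raw attention scores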
| 330 |
+
|
| 331 |
+
|
| 332 |
+
class BertSelfOutput(nn.Module):
|
| 333 |
+
def __init__(self, config):
|
| 334 |
+
super().__init__()
|
| 335 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
| 336 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 337 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 338 |
+
|
| 339 |
+
def forward(self, hidden_states, input_tensor):
|
| 340 |
+
hidden_states = self.dense(hidden_states)
|
| 341 |
+
hidden_states = self.dropout(hidden_states)
|
| 342 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
| 343 |
+
return hidden_states
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
class BertAttention(nn.Module):
|
| 347 |
+
def __init__(self, config, is_cross_attention=False):
|
| 348 |
+
super().__init__()
|
| 349 |
+
self.self = BertSelfAttention(config, is_cross_attention)
|
| 350 |
+
self.output = BertSelfOutput(config)
|
| 351 |
+
self.pruned_heads = set()
|
| 352 |
+
|
| 353 |
+
def prune_heads(self, heads):
|
| 354 |
+
if len(heads) == 0:
|
| 355 |
+
return
|
| 356 |
+
heads, index = find_pruneable_heads_and_indices(
|
| 357 |
+
heads,
|
| 358 |
+
self.self.num_attention_heads,
|
| 359 |
+
self.self.attention_head_size,
|
| 360 |
+
self.pruned_heads,
|
| 361 |
+
)
|
| 362 |
+
|
| 363 |
+
# Prune linear layers
|
| 364 |
+
self.self.query = prune_linear_layer(self.self.query, index)
|
| 365 |
+
self.self.key = prune_linear_layer(self.self.key, index)
|
| 366 |
+
self.self.value = prune_linear_layer(self.self.value, index)
|
| 367 |
+
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
|
| 368 |
+
|
| 369 |
+
# Update hyper params and store pruned heads
|
| 370 |
+
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
|
| 371 |
+
self.self.all_head_size = (
|
| 372 |
+
self.self.attention_head_size * self.self.num_attention_heads
|
| 373 |
+
)
|
| 374 |
+
self.pruned_heads = self.pruned_heads.union(heads)
|
| 375 |
+
|
| 376 |
+
def forward(
|
| 377 |
+
self,
|
| 378 |
+
hidden_states,
|
| 379 |
+
attention_mask=None,
|
| 380 |
+
head_mask=None,
|
| 381 |
+
encoder_hidden_states=None,
|
| 382 |
+
encoder_attention_mask=None,
|
| 383 |
+
past_key_value=None,
|
| 384 |
+
output_attentions=False,
|
| 385 |
+
):
|
| 386 |
+
self_outputs = self.self(
|
| 387 |
+
hidden_states,
|
| 388 |
+
attention_mask,
|
| 389 |
+
head_mask,
|
| 390 |
+
encoder_hidden_states,
|
| 391 |
+
encoder_attention_mask,
|
| 392 |
+
past_key_value,
|
| 393 |
+
output_attentions,
|
| 394 |
+
)
|
| 395 |
+
attention_output = self.output(self_outputs[0], hidden_states)
|
| 396 |
+
outputs = (attention_output,) + self_outputs[
|
| 397 |
+
1:
|
| 398 |
+
] # add attentions if we output them
|
| 399 |
+
return outputs
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
class BertIntermediate(nn.Module):
|
| 403 |
+
def __init__(self, config):
|
| 404 |
+
super().__init__()
|
| 405 |
+
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
| 406 |
+
if isinstance(config.hidden_act, str):
|
| 407 |
+
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
| 408 |
+
else:
|
| 409 |
+
self.intermediate_act_fn = config.hidden_act
|
| 410 |
+
|
| 411 |
+
def forward(self, hidden_states):
|
| 412 |
+
hidden_states = self.dense(hidden_states)
|
| 413 |
+
hidden_states = self.intermediate_act_fn(hidden_states)
|
| 414 |
+
return hidden_states
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
class BertOutput(nn.Module):
|
| 418 |
+
def __init__(self, config):
|
| 419 |
+
super().__init__()
|
| 420 |
+
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
| 421 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 422 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 423 |
+
|
| 424 |
+
def forward(self, hidden_states, input_tensor):
|
| 425 |
+
hidden_states = self.dense(hidden_states)
|
| 426 |
+
hidden_states = self.dropout(hidden_states)
|
| 427 |
+
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
| 428 |
+
return hidden_states
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
class BertLayer(nn.Module):
|
| 432 |
+
def __init__(self, config, layer_num):
|
| 433 |
+
super().__init__()
|
| 434 |
+
self.config = config
|
| 435 |
+
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
| 436 |
+
self.seq_len_dim = 1
|
| 437 |
+
self.attention = BertAttention(config)
|
| 438 |
+
self.layer_num = layer_num
|
| 439 |
+
if self.config.add_cross_attention:
|
| 440 |
+
self.crossattention = BertAttention(
|
| 441 |
+
config, is_cross_attention=self.config.add_cross_attention
|
| 442 |
+
)
|
| 443 |
+
self.intermediate = BertIntermediate(config)
|
| 444 |
+
self.output = BertOutput(config)
|
| 445 |
+
|
| 446 |
+
def forward(
|
| 447 |
+
self,
|
| 448 |
+
hidden_states,
|
| 449 |
+
attention_mask=None,
|
| 450 |
+
head_mask=None,
|
| 451 |
+
encoder_hidden_states=None,
|
| 452 |
+
encoder_attention_mask=None,
|
| 453 |
+
past_key_value=None,
|
| 454 |
+
output_attentions=False,
|
| 455 |
+
mode=None,
|
| 456 |
+
):
|
| 457 |
+
if mode == "tagging":
|
| 458 |
+
assert (
|
| 459 |
+
encoder_hidden_states is not None
|
| 460 |
+
), "encoder_hidden_states must be given for cross-attention layers"
|
| 461 |
+
|
| 462 |
+
cross_attention_outputs = self.crossattention(
|
| 463 |
+
hidden_states,
|
| 464 |
+
attention_mask,
|
| 465 |
+
head_mask,
|
| 466 |
+
encoder_hidden_states,
|
| 467 |
+
encoder_attention_mask,
|
| 468 |
+
output_attentions=output_attentions,
|
| 469 |
+
)
|
| 470 |
+
attention_output = cross_attention_outputs[0]
|
| 471 |
+
outputs = cross_attention_outputs[
|
| 472 |
+
1:-1
|
| 473 |
+
] # add cross attentions if we output attention weights
|
| 474 |
+
|
| 475 |
+
present_key_value = cross_attention_outputs[-1]
|
| 476 |
+
|
| 477 |
+
else:
|
| 478 |
+
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
|
| 479 |
+
self_attn_past_key_value = (
|
| 480 |
+
past_key_value[:2] if past_key_value is not None else None
|
| 481 |
+
)
|
| 482 |
+
self_attention_outputs = self.attention(
|
| 483 |
+
hidden_states,
|
| 484 |
+
attention_mask,
|
| 485 |
+
head_mask,
|
| 486 |
+
output_attentions=output_attentions,
|
| 487 |
+
past_key_value=self_attn_past_key_value,
|
| 488 |
+
)
|
| 489 |
+
attention_output = self_attention_outputs[0]
|
| 490 |
+
|
| 491 |
+
outputs = self_attention_outputs[1:-1]
|
| 492 |
+
present_key_value = self_attention_outputs[-1]
|
| 493 |
+
|
| 494 |
+
if mode == "multimodal":
|
| 495 |
+
assert (
|
| 496 |
+
encoder_hidden_states is not None
|
| 497 |
+
), "encoder_hidden_states must be given for cross-attention layers"
|
| 498 |
+
|
| 499 |
+
cross_attention_outputs = self.crossattention(
|
| 500 |
+
attention_output,
|
| 501 |
+
attention_mask,
|
| 502 |
+
head_mask,
|
| 503 |
+
encoder_hidden_states,
|
| 504 |
+
encoder_attention_mask,
|
| 505 |
+
output_attentions=output_attentions,
|
| 506 |
+
)
|
| 507 |
+
attention_output = cross_attention_outputs[0]
|
| 508 |
+
outputs = (
|
| 509 |
+
outputs + cross_attention_outputs[1:-1]
|
| 510 |
+
) # add cross attentions if we output attention weights
|
| 511 |
+
layer_output = apply_chunking_to_forward(
|
| 512 |
+
self.feed_forward_chunk,
|
| 513 |
+
self.chunk_size_feed_forward,
|
| 514 |
+
self.seq_len_dim,
|
| 515 |
+
attention_output,
|
| 516 |
+
)
|
| 517 |
+
outputs = (layer_output,) + outputs
|
| 518 |
+
|
| 519 |
+
outputs = outputs + (present_key_value,)
|
| 520 |
+
|
| 521 |
+
return outputs
|
| 522 |
+
|
| 523 |
+
def feed_forward_chunk(self, attention_output):
|
| 524 |
+
intermediate_output = self.intermediate(attention_output)
|
| 525 |
+
layer_output = self.output(intermediate_output, attention_output)
|
| 526 |
+
return layer_output
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
class BertEncoder(nn.Module):
|
| 530 |
+
def __init__(self, config):
|
| 531 |
+
super().__init__()
|
| 532 |
+
self.config = config
|
| 533 |
+
self.layer = nn.ModuleList(
|
| 534 |
+
[BertLayer(config, i) for i in range(config.num_hidden_layers)]
|
| 535 |
+
)
|
| 536 |
+
self.gradient_checkpointing = False
|
| 537 |
+
|
| 538 |
+
def forward(
|
| 539 |
+
self,
|
| 540 |
+
hidden_states,
|
| 541 |
+
attention_mask=None,
|
| 542 |
+
head_mask=None,
|
| 543 |
+
encoder_hidden_states=None,
|
| 544 |
+
encoder_attention_mask=None,
|
| 545 |
+
past_key_values=None,
|
| 546 |
+
use_cache=None,
|
| 547 |
+
output_attentions=False,
|
| 548 |
+
output_hidden_states=False,
|
| 549 |
+
return_dict=True,
|
| 550 |
+
mode="multimodal",
|
| 551 |
+
):
|
| 552 |
+
all_hidden_states = () if output_hidden_states else None
|
| 553 |
+
all_self_attentions = () if output_attentions else None
|
| 554 |
+
all_cross_attentions = (
|
| 555 |
+
() if output_attentions and self.config.add_cross_attention else None
|
| 556 |
+
)
|
| 557 |
+
|
| 558 |
+
next_decoder_cache = () if use_cache else None
|
| 559 |
+
|
| 560 |
+
for i in range(self.config.num_hidden_layers):
|
| 561 |
+
layer_module = self.layer[i]
|
| 562 |
+
if output_hidden_states:
|
| 563 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 564 |
+
|
| 565 |
+
layer_head_mask = head_mask[i] if head_mask is not None else None
|
| 566 |
+
past_key_value = past_key_values[i] if past_key_values is not None else None
|
| 567 |
+
|
| 568 |
+
if self.gradient_checkpointing and self.training:
|
| 569 |
+
if use_cache:
|
| 570 |
+
logger.warn(
|
| 571 |
+
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
| 572 |
+
)
|
| 573 |
+
use_cache = False
|
| 574 |
+
|
| 575 |
+
def create_custom_forward(module):
|
| 576 |
+
def custom_forward(*inputs):
|
| 577 |
+
return module(*inputs, past_key_value, output_attentions)
|
| 578 |
+
|
| 579 |
+
return custom_forward
|
| 580 |
+
|
| 581 |
+
layer_outputs = torch.utils.checkpoint.checkpoint(
|
| 582 |
+
create_custom_forward(layer_module),
|
| 583 |
+
hidden_states,
|
| 584 |
+
attention_mask,
|
| 585 |
+
layer_head_mask,
|
| 586 |
+
encoder_hidden_states,
|
| 587 |
+
encoder_attention_mask,
|
| 588 |
+
mode=mode,
|
| 589 |
+
)
|
| 590 |
+
else:
|
| 591 |
+
layer_outputs = layer_module(
|
| 592 |
+
hidden_states,
|
| 593 |
+
attention_mask,
|
| 594 |
+
layer_head_mask,
|
| 595 |
+
encoder_hidden_states,
|
| 596 |
+
encoder_attention_mask,
|
| 597 |
+
past_key_value,
|
| 598 |
+
output_attentions,
|
| 599 |
+
mode=mode,
|
| 600 |
+
)
|
| 601 |
+
|
| 602 |
+
hidden_states = layer_outputs[0]
|
| 603 |
+
if use_cache:
|
| 604 |
+
next_decoder_cache += (layer_outputs[-1],)
|
| 605 |
+
if output_attentions:
|
| 606 |
+
all_self_attentions = all_self_attentions + (layer_outputs[1],)
|
| 607 |
+
|
| 608 |
+
if output_hidden_states:
|
| 609 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 610 |
+
|
| 611 |
+
if not return_dict:
|
| 612 |
+
return tuple(
|
| 613 |
+
v
|
| 614 |
+
for v in [
|
| 615 |
+
hidden_states,
|
| 616 |
+
next_decoder_cache,
|
| 617 |
+
all_hidden_states,
|
| 618 |
+
all_self_attentions,
|
| 619 |
+
all_cross_attentions,
|
| 620 |
+
]
|
| 621 |
+
if v is not None
|
| 622 |
+
)
|
| 623 |
+
return BaseModelOutputWithPastAndCrossAttentions(
|
| 624 |
+
last_hidden_state=hidden_states,
|
| 625 |
+
past_key_values=next_decoder_cache,
|
| 626 |
+
hidden_states=all_hidden_states,
|
| 627 |
+
attentions=all_self_attentions,
|
| 628 |
+
cross_attentions=all_cross_attentions,
|
| 629 |
+
)
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
class BertPooler(nn.Module):
|
| 633 |
+
def __init__(self, config):
|
| 634 |
+
super().__init__()
|
| 635 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
| 636 |
+
self.activation = nn.Tanh()
|
| 637 |
+
|
| 638 |
+
def forward(self, hidden_states):
|
| 639 |
+
# We "pool" the model by simply taking the hidden state corresponding
|
| 640 |
+
# to the first token.
|
| 641 |
+
first_token_tensor = hidden_states[:, 0]
|
| 642 |
+
pooled_output = self.dense(first_token_tensor)
|
| 643 |
+
pooled_output = self.activation(pooled_output)
|
| 644 |
+
return pooled_output
|
| 645 |
+
|
| 646 |
+
|
| 647 |
+
class BertPredictionHeadTransform(nn.Module):
|
| 648 |
+
def __init__(self, config):
|
| 649 |
+
super().__init__()
|
| 650 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
| 651 |
+
if isinstance(config.hidden_act, str):
|
| 652 |
+
self.transform_act_fn = ACT2FN[config.hidden_act]
|
| 653 |
+
else:
|
| 654 |
+
self.transform_act_fn = config.hidden_act
|
| 655 |
+
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
|
| 656 |
+
|
| 657 |
+
def forward(self, hidden_states):
|
| 658 |
+
hidden_states = self.dense(hidden_states)
|
| 659 |
+
hidden_states = self.transform_act_fn(hidden_states)
|
| 660 |
+
hidden_states = self.LayerNorm(hidden_states)
|
| 661 |
+
return hidden_states
|
| 662 |
+
|
| 663 |
+
|
| 664 |
+
class BertLMPredictionHead(nn.Module):
|
| 665 |
+
def __init__(self, config):
|
| 666 |
+
super().__init__()
|
| 667 |
+
self.transform = BertPredictionHeadTransform(config)
|
| 668 |
+
|
| 669 |
+
# The output weights are the same as the input embeddings, but there is
|
| 670 |
+
# an output-only bias for each token.
|
| 671 |
+
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
| 672 |
+
|
| 673 |
+
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
| 674 |
+
|
| 675 |
+
# Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
|
| 676 |
+
self.decoder.bias = self.bias
|
| 677 |
+
|
| 678 |
+
def forward(self, hidden_states):
|
| 679 |
+
hidden_states = self.transform(hidden_states)
|
| 680 |
+
hidden_states = self.decoder(hidden_states)
|
| 681 |
+
return hidden_states
|
| 682 |
+
|
| 683 |
+
|
| 684 |
+
class BertOnlyMLMHead(nn.Module):
|
| 685 |
+
def __init__(self, config):
|
| 686 |
+
super().__init__()
|
| 687 |
+
self.predictions = BertLMPredictionHead(config)
|
| 688 |
+
|
| 689 |
+
def forward(self, sequence_output):
|
| 690 |
+
prediction_scores = self.predictions(sequence_output)
|
| 691 |
+
return prediction_scores
|
| 692 |
+
|
| 693 |
+
|
| 694 |
+
class BertPreTrainedModel(PreTrainedModel):
|
| 695 |
+
"""An abstract class to handle weights initialization and a simple interface for downloading and loading
|
| 696 |
+
pretrained models."""
|
| 697 |
+
|
| 698 |
+
config_class = BertConfig
|
| 699 |
+
base_model_prefix = "bert"
|
| 700 |
+
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
| 701 |
+
|
| 702 |
+
def _init_weights(self, module):
|
| 703 |
+
"""Initialize the weights."""
|
| 704 |
+
if isinstance(module, (nn.Linear, nn.Embedding)):
|
| 705 |
+
# Slightly different from the TF version which uses truncated_normal for initialization
|
| 706 |
+
# cf https://github.com/pytorch/pytorch/pull/5617
|
| 707 |
+
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
| 708 |
+
elif isinstance(module, nn.LayerNorm):
|
| 709 |
+
module.bias.data.zero_()
|
| 710 |
+
module.weight.data.fill_(1.0)
|
| 711 |
+
if isinstance(module, nn.Linear) and module.bias is not None:
|
| 712 |
+
module.bias.data.zero_()
|
| 713 |
+
|
| 714 |
+
|
| 715 |
+
class BertModel(BertPreTrainedModel):
|
| 716 |
+
"""The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
|
| 717 |
+
cross-attention is added between the self-attention layers, following the architecture described in `Attention
|
| 718 |
+
is all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob
|
| 719 |
+
Uszkoreit, Llion Jones, Aidan N.
|
| 720 |
+
|
| 721 |
+
Gomez, Lukasz Kaiser and Illia Polosukhin. To be used as a decoder, the model needs to be initialized with the :obj:`is_decoder` argument and :obj:`add_cross_attention` set to :obj:`True`; an
|
| 722 |
+
:obj:`encoder_hidden_states` is then expected as an input to the forward pass.
|
| 723 |
+
"""
|
| 724 |
+
|
| 725 |
+
def __init__(self, config, add_pooling_layer=True):
|
| 726 |
+
super().__init__(config)
|
| 727 |
+
self.config = config
|
| 728 |
+
|
| 729 |
+
self.embeddings = BertEmbeddings(config)
|
| 730 |
+
|
| 731 |
+
self.encoder = BertEncoder(config)
|
| 732 |
+
|
| 733 |
+
self.pooler = BertPooler(config) if add_pooling_layer else None
|
| 734 |
+
|
| 735 |
+
self.init_weights()
|
| 736 |
+
|
| 737 |
+
def get_input_embeddings(self):
|
| 738 |
+
return self.embeddings.word_embeddings
|
| 739 |
+
|
| 740 |
+
def set_input_embeddings(self, value):
|
| 741 |
+
self.embeddings.word_embeddings = value
|
| 742 |
+
|
| 743 |
+
def _prune_heads(self, heads_to_prune):
|
| 744 |
+
"""Prunes heads of the model.
|
| 745 |
+
|
| 746 |
+
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
|
| 747 |
+
class PreTrainedModel
|
| 748 |
+
"""
|
| 749 |
+
for layer, heads in heads_to_prune.items():
|
| 750 |
+
self.encoder.layer[layer].attention.prune_heads(heads)
|
| 751 |
+
|
| 752 |
+
def get_extended_attention_mask(
|
| 753 |
+
self,
|
| 754 |
+
attention_mask: Tensor,
|
| 755 |
+
input_shape: Tuple[int],
|
| 756 |
+
device: device,
|
| 757 |
+
is_decoder: bool,
|
| 758 |
+
) -> Tensor:
|
| 759 |
+
"""Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
|
| 760 |
+
|
| 761 |
+
Arguments:
|
| 762 |
+
attention_mask (:obj:`torch.Tensor`):
|
| 763 |
+
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
|
| 764 |
+
input_shape (:obj:`Tuple[int]`):
|
| 765 |
+
The shape of the input to the model.
|
| 766 |
+
device: (:obj:`torch.device`):
|
| 767 |
+
The device of the input to the model.
|
| 768 |
+
|
| 769 |
+
Returns:
|
| 770 |
+
:obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`.
|
| 771 |
+
"""
|
| 772 |
+
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
| 773 |
+
# ourselves in which case we just need to make it broadcastable to all heads.
|
| 774 |
+
if attention_mask.dim() == 3:
|
| 775 |
+
extended_attention_mask = attention_mask[:, None, :, :]
|
| 776 |
+
elif attention_mask.dim() == 2:
|
| 777 |
+
# Provided a padding mask of dimensions [batch_size, seq_length]
|
| 778 |
+
# - if the model is a decoder, apply a causal mask in addition to the padding mask
|
| 779 |
+
# - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
| 780 |
+
if is_decoder:
|
| 781 |
+
batch_size, seq_length = input_shape
|
| 782 |
+
|
| 783 |
+
seq_ids = torch.arange(seq_length, device=device)
|
| 784 |
+
causal_mask = (
|
| 785 |
+
seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
|
| 786 |
+
<= seq_ids[None, :, None]
|
| 787 |
+
)
|
| 788 |
+
# in case past_key_values are used we need to add a prefix ones mask to the causal mask
|
| 789 |
+
# causal and attention masks must have same type with pytorch version < 1.3
|
| 790 |
+
causal_mask = causal_mask.to(attention_mask.dtype)
|
| 791 |
+
|
| 792 |
+
if causal_mask.shape[1] < attention_mask.shape[1]:
|
| 793 |
+
prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
|
| 794 |
+
causal_mask = torch.cat(
|
| 795 |
+
[
|
| 796 |
+
torch.ones(
|
| 797 |
+
(batch_size, seq_length, prefix_seq_len),
|
| 798 |
+
device=device,
|
| 799 |
+
dtype=causal_mask.dtype,
|
| 800 |
+
),
|
| 801 |
+
causal_mask,
|
| 802 |
+
],
|
| 803 |
+
axis=-1,
|
| 804 |
+
)
|
| 805 |
+
|
| 806 |
+
extended_attention_mask = (
|
| 807 |
+
causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
|
| 808 |
+
)
|
| 809 |
+
else:
|
| 810 |
+
extended_attention_mask = attention_mask[:, None, None, :]
|
| 811 |
+
else:
|
| 812 |
+
raise ValueError(
|
| 813 |
+
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
|
| 814 |
+
input_shape, attention_mask.shape
|
| 815 |
+
)
|
| 816 |
+
)
|
| 817 |
+
|
| 818 |
+
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
| 819 |
+
# masked positions, this operation will create a tensor which is 0.0 for
|
| 820 |
+
# positions we want to attend and -10000.0 for masked positions.
|
| 821 |
+
# Since we are adding it to the raw scores before the softmax, this is
|
| 822 |
+
# effectively the same as removing these entirely.
|
| 823 |
+
extended_attention_mask = extended_attention_mask.to(
|
| 824 |
+
dtype=self.dtype
|
| 825 |
+
) # fp16 compatibility
|
| 826 |
+
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
|
| 827 |
+
return extended_attention_mask
|
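A minimal sketch of what this final conversion does, assuming a plain 2-D padding mask and no causal masking (illustrative only, not part of the committed file):

import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])           # one sequence, last token is padding
extended = attention_mask[:, None, None, :].float()     # broadcastable to [batch, heads, query_len, key_len]
extended = (1.0 - extended) * -10000.0                   # 0.0 where attended, -10000.0 where masked
# adding `extended` to the raw attention scores drives masked positions to ~0 probability after the softmax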
| 828 |
+
|
| 829 |
+
def forward(
|
| 830 |
+
self,
|
| 831 |
+
input_ids=None,
|
| 832 |
+
attention_mask=None,
|
| 833 |
+
position_ids=None,
|
| 834 |
+
head_mask=None,
|
| 835 |
+
inputs_embeds=None,
|
| 836 |
+
encoder_embeds=None,
|
| 837 |
+
encoder_hidden_states=None,
|
| 838 |
+
encoder_attention_mask=None,
|
| 839 |
+
past_key_values=None,
|
| 840 |
+
use_cache=None,
|
| 841 |
+
output_attentions=None,
|
| 842 |
+
output_hidden_states=None,
|
| 843 |
+
return_dict=None,
|
| 844 |
+
is_decoder=False,
|
| 845 |
+
mode="multimodal",
|
| 846 |
+
):
|
| 847 |
+
r"""
|
| 848 |
+
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
|
| 849 |
+
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
|
| 850 |
+
the model is configured as a decoder.
|
| 851 |
+
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
| 852 |
+
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
|
| 853 |
+
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
|
| 854 |
+
- 1 for tokens that are **not masked**,
|
| 855 |
+
- 0 for tokens that are **masked**.
|
| 856 |
+
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
|
| 857 |
+
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
|
| 858 |
+
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
|
| 859 |
+
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
|
| 860 |
+
instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
|
| 861 |
+
use_cache (:obj:`bool`, `optional`):
|
| 862 |
+
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
|
| 863 |
+
decoding (see :obj:`past_key_values`).
|
| 864 |
+
"""
|
| 865 |
+
output_attentions = (
|
| 866 |
+
output_attentions
|
| 867 |
+
if output_attentions is not None
|
| 868 |
+
else self.config.output_attentions
|
| 869 |
+
)
|
| 870 |
+
output_hidden_states = (
|
| 871 |
+
output_hidden_states
|
| 872 |
+
if output_hidden_states is not None
|
| 873 |
+
else self.config.output_hidden_states
|
| 874 |
+
)
|
| 875 |
+
return_dict = (
|
| 876 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
| 877 |
+
)
|
| 878 |
+
|
| 879 |
+
if is_decoder:
|
| 880 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
| 881 |
+
else:
|
| 882 |
+
use_cache = False
|
| 883 |
+
|
| 884 |
+
if input_ids is not None and inputs_embeds is not None:
|
| 885 |
+
raise ValueError(
|
| 886 |
+
"You cannot specify both input_ids and inputs_embeds at the same time"
|
| 887 |
+
)
|
| 888 |
+
elif input_ids is not None:
|
| 889 |
+
input_shape = input_ids.size()
|
| 890 |
+
batch_size, seq_length = input_shape
|
| 891 |
+
device = input_ids.device
|
| 892 |
+
elif inputs_embeds is not None:
|
| 893 |
+
input_shape = inputs_embeds.size()[:-1]
|
| 894 |
+
batch_size, seq_length = input_shape
|
| 895 |
+
device = inputs_embeds.device
|
| 896 |
+
elif encoder_embeds is not None:
|
| 897 |
+
input_shape = encoder_embeds.size()[:-1]
|
| 898 |
+
batch_size, seq_length = input_shape
|
| 899 |
+
device = encoder_embeds.device
|
| 900 |
+
else:
|
| 901 |
+
raise ValueError(
|
| 902 |
+
"You have to specify either input_ids or inputs_embeds or encoder_embeds"
|
| 903 |
+
)
|
| 904 |
+
|
| 905 |
+
# past_key_values_length
|
| 906 |
+
past_key_values_length = (
|
| 907 |
+
past_key_values[0][0].shape[2] if past_key_values is not None else 0
|
| 908 |
+
)
|
| 909 |
+
|
| 910 |
+
if attention_mask is None:
|
| 911 |
+
attention_mask = torch.ones(
|
| 912 |
+
((batch_size, seq_length + past_key_values_length)), device=device
|
| 913 |
+
)
|
| 914 |
+
|
| 915 |
+
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
|
| 916 |
+
# ourselves in which case we just need to make it broadcastable to all heads.
|
| 917 |
+
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
|
| 918 |
+
attention_mask, input_shape, device, is_decoder
|
| 919 |
+
)
|
| 920 |
+
|
| 921 |
+
# If a 2D or 3D attention mask is provided for the cross-attention
|
| 922 |
+
# we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
|
| 923 |
+
if encoder_hidden_states is not None:
|
| 924 |
+
if type(encoder_hidden_states) == list:
|
| 925 |
+
encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[
|
| 926 |
+
0
|
| 927 |
+
].size()
|
| 928 |
+
else:
|
| 929 |
+
(
|
| 930 |
+
encoder_batch_size,
|
| 931 |
+
encoder_sequence_length,
|
| 932 |
+
_,
|
| 933 |
+
) = encoder_hidden_states.size()
|
| 934 |
+
encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
|
| 935 |
+
|
| 936 |
+
if type(encoder_attention_mask) == list:
|
| 937 |
+
encoder_extended_attention_mask = [
|
| 938 |
+
self.invert_attention_mask(mask) for mask in encoder_attention_mask
|
| 939 |
+
]
|
| 940 |
+
elif encoder_attention_mask is None:
|
| 941 |
+
encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
|
| 942 |
+
encoder_extended_attention_mask = self.invert_attention_mask(
|
| 943 |
+
encoder_attention_mask
|
| 944 |
+
)
|
| 945 |
+
else:
|
| 946 |
+
encoder_extended_attention_mask = self.invert_attention_mask(
|
| 947 |
+
encoder_attention_mask
|
| 948 |
+
)
|
| 949 |
+
else:
|
| 950 |
+
encoder_extended_attention_mask = None
|
| 951 |
+
|
| 952 |
+
# Prepare head mask if needed
|
| 953 |
+
# 1.0 in head_mask indicate we keep the head
|
| 954 |
+
# attention_probs has shape bsz x n_heads x N x N
|
| 955 |
+
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
|
| 956 |
+
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
|
| 957 |
+
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
|
| 958 |
+
|
| 959 |
+
if encoder_embeds is None:
|
| 960 |
+
embedding_output = self.embeddings(
|
| 961 |
+
input_ids=input_ids,
|
| 962 |
+
position_ids=position_ids,
|
| 963 |
+
inputs_embeds=inputs_embeds,
|
| 964 |
+
past_key_values_length=past_key_values_length,
|
| 965 |
+
)
|
| 966 |
+
else:
|
| 967 |
+
embedding_output = encoder_embeds
|
| 968 |
+
|
| 969 |
+
encoder_outputs = self.encoder(
|
| 970 |
+
embedding_output,
|
| 971 |
+
attention_mask=extended_attention_mask,
|
| 972 |
+
head_mask=head_mask,
|
| 973 |
+
encoder_hidden_states=encoder_hidden_states,
|
| 974 |
+
encoder_attention_mask=encoder_extended_attention_mask,
|
| 975 |
+
past_key_values=past_key_values,
|
| 976 |
+
use_cache=use_cache,
|
| 977 |
+
output_attentions=output_attentions,
|
| 978 |
+
output_hidden_states=output_hidden_states,
|
| 979 |
+
return_dict=return_dict,
|
| 980 |
+
mode=mode,
|
| 981 |
+
)
|
| 982 |
+
sequence_output = encoder_outputs[0]
|
| 983 |
+
pooled_output = (
|
| 984 |
+
self.pooler(sequence_output) if self.pooler is not None else None
|
| 985 |
+
)
|
| 986 |
+
|
| 987 |
+
if not return_dict:
|
| 988 |
+
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
| 989 |
+
|
| 990 |
+
return BaseModelOutputWithPoolingAndCrossAttentions(
|
| 991 |
+
last_hidden_state=sequence_output,
|
| 992 |
+
pooler_output=pooled_output,
|
| 993 |
+
past_key_values=encoder_outputs.past_key_values,
|
| 994 |
+
hidden_states=encoder_outputs.hidden_states,
|
| 995 |
+
attentions=encoder_outputs.attentions,
|
| 996 |
+
cross_attentions=encoder_outputs.cross_attentions,
|
| 997 |
+
)
|
| 998 |
+
|
| 999 |
+
|
| 1000 |
+
class BertLMHeadModel(BertPreTrainedModel):
|
| 1001 |
+
_keys_to_ignore_on_load_unexpected = [r"pooler"]
|
| 1002 |
+
_keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
|
| 1003 |
+
|
| 1004 |
+
def __init__(self, config):
|
| 1005 |
+
super().__init__(config)
|
| 1006 |
+
|
| 1007 |
+
self.bert = BertModel(config, add_pooling_layer=False)
|
| 1008 |
+
self.cls = BertOnlyMLMHead(config)
|
| 1009 |
+
|
| 1010 |
+
self.init_weights()
|
| 1011 |
+
|
| 1012 |
+
def get_output_embeddings(self):
|
| 1013 |
+
return self.cls.predictions.decoder
|
| 1014 |
+
|
| 1015 |
+
def set_output_embeddings(self, new_embeddings):
|
| 1016 |
+
self.cls.predictions.decoder = new_embeddings
|
| 1017 |
+
|
| 1018 |
+
def forward(
|
| 1019 |
+
self,
|
| 1020 |
+
input_ids=None,
|
| 1021 |
+
attention_mask=None,
|
| 1022 |
+
position_ids=None,
|
| 1023 |
+
head_mask=None,
|
| 1024 |
+
inputs_embeds=None,
|
| 1025 |
+
encoder_hidden_states=None,
|
| 1026 |
+
encoder_attention_mask=None,
|
| 1027 |
+
labels=None,
|
| 1028 |
+
past_key_values=None,
|
| 1029 |
+
use_cache=None,
|
| 1030 |
+
output_attentions=None,
|
| 1031 |
+
output_hidden_states=None,
|
| 1032 |
+
return_dict=None,
|
| 1033 |
+
return_logits=False,
|
| 1034 |
+
is_decoder=True,
|
| 1035 |
+
reduction="mean",
|
| 1036 |
+
mode="multimodal",
|
| 1037 |
+
):
|
| 1038 |
+
r"""
|
| 1039 |
+
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
|
| 1040 |
+
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
|
| 1041 |
+
the model is configured as a decoder.
|
| 1042 |
+
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
| 1043 |
+
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
|
| 1044 |
+
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
|
| 1045 |
+
- 1 for tokens that are **not masked**,
|
| 1046 |
+
- 0 for tokens that are **masked**.
|
| 1047 |
+
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
| 1048 |
+
Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
|
| 1049 |
+
``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
|
| 1050 |
+
ignored (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
|
| 1051 |
+
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
|
| 1052 |
+
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
|
| 1053 |
+
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
|
| 1054 |
+
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
|
| 1055 |
+
instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
|
| 1056 |
+
use_cache (:obj:`bool`, `optional`):
|
| 1057 |
+
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
|
| 1058 |
+
decoding (see :obj:`past_key_values`).
|
| 1059 |
+
Returns:
|
| 1060 |
+
Example::
|
| 1061 |
+
>>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
|
| 1062 |
+
>>> import torch
|
| 1063 |
+
>>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
|
| 1064 |
+
>>> config = BertConfig.from_pretrained("bert-base-cased")
|
| 1065 |
+
>>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
|
| 1066 |
+
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
|
| 1067 |
+
>>> outputs = model(**inputs)
|
| 1068 |
+
>>> prediction_logits = outputs.logits
|
| 1069 |
+
"""
|
| 1070 |
+
return_dict = (
|
| 1071 |
+
return_dict if return_dict is not None else self.config.use_return_dict
|
| 1072 |
+
)
|
| 1073 |
+
if labels is not None:
|
| 1074 |
+
use_cache = False
|
| 1075 |
+
|
| 1076 |
+
outputs = self.bert(
|
| 1077 |
+
input_ids,
|
| 1078 |
+
attention_mask=attention_mask,
|
| 1079 |
+
position_ids=position_ids,
|
| 1080 |
+
head_mask=head_mask,
|
| 1081 |
+
inputs_embeds=inputs_embeds,
|
| 1082 |
+
encoder_hidden_states=encoder_hidden_states,
|
| 1083 |
+
encoder_attention_mask=encoder_attention_mask,
|
| 1084 |
+
past_key_values=past_key_values,
|
| 1085 |
+
use_cache=use_cache,
|
| 1086 |
+
output_attentions=output_attentions,
|
| 1087 |
+
output_hidden_states=output_hidden_states,
|
| 1088 |
+
return_dict=return_dict,
|
| 1089 |
+
is_decoder=is_decoder,
|
| 1090 |
+
mode=mode,
|
| 1091 |
+
)
|
| 1092 |
+
|
| 1093 |
+
sequence_output = outputs[0]
|
| 1094 |
+
prediction_scores = self.cls(sequence_output)
|
| 1095 |
+
# sequence_output.shape torch.Size([85, 30, 768])
|
| 1096 |
+
# prediction_scores.shape torch.Size([85, 30, 30524])
|
| 1097 |
+
# labels.shape torch.Size([85, 30])
|
| 1098 |
+
|
| 1099 |
+
if return_logits:
|
| 1100 |
+
return prediction_scores[:, :-1, :].contiguous()
|
| 1101 |
+
|
| 1102 |
+
lm_loss = None
|
| 1103 |
+
if labels is not None:
|
| 1104 |
+
# we are doing next-token prediction; shift prediction scores and input ids by one
|
| 1105 |
+
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
|
| 1106 |
+
labels = labels[:, 1:].contiguous()
|
| 1107 |
+
loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
|
| 1108 |
+
lm_loss = loss_fct(
|
| 1109 |
+
shifted_prediction_scores.view(-1, self.config.vocab_size),
|
| 1110 |
+
labels.view(-1),
|
| 1111 |
+
)
|
| 1112 |
+
if reduction == "none":
|
| 1113 |
+
lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
|
| 1114 |
+
|
| 1115 |
+
if not return_dict:
|
| 1116 |
+
output = (prediction_scores,) + outputs[2:]
|
| 1117 |
+
return ((lm_loss,) + output) if lm_loss is not None else output
|
| 1118 |
+
|
| 1119 |
+
return CausalLMOutputWithCrossAttentions(
|
| 1120 |
+
loss=lm_loss,
|
| 1121 |
+
logits=prediction_scores,
|
| 1122 |
+
past_key_values=outputs.past_key_values,
|
| 1123 |
+
hidden_states=outputs.hidden_states,
|
| 1124 |
+
attentions=outputs.attentions,
|
| 1125 |
+
cross_attentions=outputs.cross_attentions,
|
| 1126 |
+
)
|
| 1127 |
+
|
| 1128 |
+
def prepare_inputs_for_generation(
|
| 1129 |
+
self, input_ids, past=None, attention_mask=None, **model_kwargs
|
| 1130 |
+
):
|
| 1131 |
+
input_shape = input_ids.shape
|
| 1132 |
+
# if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
|
| 1133 |
+
if attention_mask is None:
|
| 1134 |
+
attention_mask = input_ids.new_ones(input_shape)
|
| 1135 |
+
|
| 1136 |
+
# cut decoder_input_ids if past is used
|
| 1137 |
+
if past is not None:
|
| 1138 |
+
input_ids = input_ids[:, -1:]
|
| 1139 |
+
|
| 1140 |
+
return {
|
| 1141 |
+
"input_ids": input_ids,
|
| 1142 |
+
"attention_mask": attention_mask,
|
| 1143 |
+
"past_key_values": past,
|
| 1144 |
+
"encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
|
| 1145 |
+
"encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
|
| 1146 |
+
"is_decoder": True,
|
| 1147 |
+
}
|
| 1148 |
+
|
| 1149 |
+
def _reorder_cache(self, past, beam_idx):
|
| 1150 |
+
reordered_past = ()
|
| 1151 |
+
for layer_past in past:
|
| 1152 |
+
reordered_past += (
|
| 1153 |
+
tuple(
|
| 1154 |
+
past_state.index_select(0, beam_idx) for past_state in layer_past
|
| 1155 |
+
),
|
| 1156 |
+
)
|
| 1157 |
+
return reordered_past
|
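A rough usage sketch for the decoder defined above, with randomly initialised weights and placeholder visual features. The token ids, the 197-token visual sequence, and the `encoder_width` value are illustrative assumptions (this file's cross-attention layers are assumed, BLIP-style, to read the visual feature width from `config.encoder_width`); they are not the settings shipped in this commit's config files:

import torch
# BertConfig comes from `transformers`; BertLMHeadModel is the class defined above.

config = BertConfig(add_cross_attention=True)    # other hyper-parameters left at their defaults
config.encoder_width = 768                       # width of the visual features fed to cross-attention (assumed)
decoder = BertLMHeadModel(config)

input_ids = torch.tensor([[101, 2023, 2003, 1037, 3231, 102]])   # toy token ids
image_embeds = torch.randn(1, 197, config.encoder_width)          # stand-in for an image encoder's output
image_atts = torch.ones(image_embeds.shape[:2], dtype=torch.long)

out = decoder(
    input_ids,
    encoder_hidden_states=image_embeds,
    encoder_attention_mask=image_atts,
    labels=input_ids,
    mode="multimodal",
)
print(out.loss, out.logits.shape)   # scalar LM loss, logits of shape [1, 6, vocab_size]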
tag2text/models/swin_transformer.py
ADDED
|
@@ -0,0 +1,831 @@
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# Swin Transformer
|
| 3 |
+
# Copyright (c) 2021 Microsoft
|
| 4 |
+
# Licensed under The MIT License [see LICENSE for details]
|
| 5 |
+
# Written by Ze Liu
|
| 6 |
+
# --------------------------------------------------------
|
| 7 |
+
import numpy as np
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.utils.checkpoint as checkpoint
|
| 11 |
+
from scipy import interpolate
|
| 12 |
+
from timm.models.layers import DropPath
|
| 13 |
+
from timm.models.layers import to_2tuple
|
| 14 |
+
from timm.models.layers import trunc_normal_
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class Mlp(nn.Module):
|
| 18 |
+
def __init__(
|
| 19 |
+
self,
|
| 20 |
+
in_features,
|
| 21 |
+
hidden_features=None,
|
| 22 |
+
out_features=None,
|
| 23 |
+
act_layer=nn.GELU,
|
| 24 |
+
drop=0.0,
|
| 25 |
+
):
|
| 26 |
+
super().__init__()
|
| 27 |
+
out_features = out_features or in_features
|
| 28 |
+
hidden_features = hidden_features or in_features
|
| 29 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
| 30 |
+
self.act = act_layer()
|
| 31 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
| 32 |
+
self.drop = nn.Dropout(drop)
|
| 33 |
+
|
| 34 |
+
def forward(self, x):
|
| 35 |
+
x = self.fc1(x)
|
| 36 |
+
x = self.act(x)
|
| 37 |
+
x = self.drop(x)
|
| 38 |
+
x = self.fc2(x)
|
| 39 |
+
x = self.drop(x)
|
| 40 |
+
return x
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def window_partition(x, window_size):
|
| 44 |
+
"""
|
| 45 |
+
Args:
|
| 46 |
+
x: (B, H, W, C)
|
| 47 |
+
window_size (int): window size
|
| 48 |
+
|
| 49 |
+
Returns:
|
| 50 |
+
windows: (num_windows*B, window_size, window_size, C)
|
| 51 |
+
"""
|
| 52 |
+
B, H, W, C = x.shape
|
| 53 |
+
x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
|
| 54 |
+
windows = (
|
| 55 |
+
x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
|
| 56 |
+
)
|
| 57 |
+
return windows
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def window_reverse(windows, window_size, H, W):
|
| 61 |
+
"""
|
| 62 |
+
Args:
|
| 63 |
+
windows: (num_windows*B, window_size, window_size, C)
|
| 64 |
+
window_size (int): Window size
|
| 65 |
+
H (int): Height of image
|
| 66 |
+
W (int): Width of image
|
| 67 |
+
|
| 68 |
+
Returns:
|
| 69 |
+
x: (B, H, W, C)
|
| 70 |
+
"""
|
| 71 |
+
B = int(windows.shape[0] / (H * W / window_size / window_size))
|
| 72 |
+
x = windows.view(
|
| 73 |
+
B, H // window_size, W // window_size, window_size, window_size, -1
|
| 74 |
+
)
|
| 75 |
+
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
|
| 76 |
+
return x
|
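window_partition and window_reverse above are exact inverses of each other; a quick round-trip sketch (shapes chosen so H and W are multiples of the window size):

import torch

x = torch.randn(2, 56, 56, 96)               # (B, H, W, C)
windows = window_partition(x, 7)              # -> (2 * 8 * 8, 7, 7, 96)
restored = window_reverse(windows, 7, 56, 56)
assert torch.equal(x, restored)               # partitioning then reversing recovers the input exactly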
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class WindowAttention(nn.Module):
|
| 80 |
+
r"""Window based multi-head self attention (W-MSA) module with relative position bias.
|
| 81 |
+
It supports both shifted and non-shifted windows.
|
| 82 |
+
|
| 83 |
+
Args:
|
| 84 |
+
dim (int): Number of input channels.
|
| 85 |
+
window_size (tuple[int]): The height and width of the window.
|
| 86 |
+
num_heads (int): Number of attention heads.
|
| 87 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
| 88 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
|
| 89 |
+
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
|
| 90 |
+
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
|
| 91 |
+
"""
|
| 92 |
+
|
| 93 |
+
def __init__(
|
| 94 |
+
self,
|
| 95 |
+
dim,
|
| 96 |
+
window_size,
|
| 97 |
+
num_heads,
|
| 98 |
+
qkv_bias=True,
|
| 99 |
+
qk_scale=None,
|
| 100 |
+
attn_drop=0.0,
|
| 101 |
+
proj_drop=0.0,
|
| 102 |
+
):
|
| 103 |
+
super().__init__()
|
| 104 |
+
self.dim = dim
|
| 105 |
+
self.window_size = window_size # Wh, Ww
|
| 106 |
+
self.num_heads = num_heads
|
| 107 |
+
head_dim = dim // num_heads
|
| 108 |
+
self.scale = qk_scale or head_dim**-0.5
|
| 109 |
+
|
| 110 |
+
# define a parameter table of relative position bias
|
| 111 |
+
self.relative_position_bias_table = nn.Parameter(
|
| 112 |
+
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
|
| 113 |
+
) # 2*Wh-1 * 2*Ww-1, nH
|
| 114 |
+
|
| 115 |
+
# get pair-wise relative position index for each token inside the window
|
| 116 |
+
coords_h = torch.arange(self.window_size[0])
|
| 117 |
+
coords_w = torch.arange(self.window_size[1])
|
| 118 |
+
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
|
| 119 |
+
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
|
| 120 |
+
relative_coords = (
|
| 121 |
+
coords_flatten[:, :, None] - coords_flatten[:, None, :]
|
| 122 |
+
) # 2, Wh*Ww, Wh*Ww
|
| 123 |
+
relative_coords = relative_coords.permute(
|
| 124 |
+
1, 2, 0
|
| 125 |
+
).contiguous() # Wh*Ww, Wh*Ww, 2
|
| 126 |
+
relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
|
| 127 |
+
relative_coords[:, :, 1] += self.window_size[1] - 1
|
| 128 |
+
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
|
| 129 |
+
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
|
| 130 |
+
self.register_buffer("relative_position_index", relative_position_index)
|
| 131 |
+
|
| 132 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 133 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
| 134 |
+
self.proj = nn.Linear(dim, dim)
|
| 135 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
| 136 |
+
|
| 137 |
+
trunc_normal_(self.relative_position_bias_table, std=0.02)
|
| 138 |
+
self.softmax = nn.Softmax(dim=-1)
|
| 139 |
+
|
| 140 |
+
def forward(self, x, mask=None):
|
| 141 |
+
"""
|
| 142 |
+
Args:
|
| 143 |
+
x: input features with shape of (num_windows*B, N, C)
|
| 144 |
+
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
|
| 145 |
+
"""
|
| 146 |
+
B_, N, C = x.shape
|
| 147 |
+
qkv = (
|
| 148 |
+
self.qkv(x)
|
| 149 |
+
.reshape(B_, N, 3, self.num_heads, C // self.num_heads)
|
| 150 |
+
.permute(2, 0, 3, 1, 4)
|
| 151 |
+
)
|
| 152 |
+
q, k, v = (
|
| 153 |
+
qkv[0],
|
| 154 |
+
qkv[1],
|
| 155 |
+
qkv[2],
|
| 156 |
+
) # make torchscript happy (cannot use tensor as tuple)
|
| 157 |
+
|
| 158 |
+
q = q * self.scale
|
| 159 |
+
attn = q @ k.transpose(-2, -1)
|
| 160 |
+
|
| 161 |
+
relative_position_bias = self.relative_position_bias_table[
|
| 162 |
+
self.relative_position_index.view(-1)
|
| 163 |
+
].view(
|
| 164 |
+
self.window_size[0] * self.window_size[1],
|
| 165 |
+
self.window_size[0] * self.window_size[1],
|
| 166 |
+
-1,
|
| 167 |
+
) # Wh*Ww,Wh*Ww,nH
|
| 168 |
+
relative_position_bias = relative_position_bias.permute(
|
| 169 |
+
2, 0, 1
|
| 170 |
+
).contiguous() # nH, Wh*Ww, Wh*Ww
|
| 171 |
+
attn = attn + relative_position_bias.unsqueeze(0)
|
| 172 |
+
|
| 173 |
+
if mask is not None:
|
| 174 |
+
nW = mask.shape[0]
|
| 175 |
+
attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
|
| 176 |
+
1
|
| 177 |
+
).unsqueeze(0)
|
| 178 |
+
attn = attn.view(-1, self.num_heads, N, N)
|
| 179 |
+
attn = self.softmax(attn)
|
| 180 |
+
else:
|
| 181 |
+
attn = self.softmax(attn)
|
| 182 |
+
|
| 183 |
+
attn = self.attn_drop(attn)
|
| 184 |
+
|
| 185 |
+
x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
|
| 186 |
+
x = self.proj(x)
|
| 187 |
+
x = self.proj_drop(x)
|
| 188 |
+
return x
|
| 189 |
+
|
| 190 |
+
def extra_repr(self) -> str:
|
| 191 |
+
return f"dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}"
|
| 192 |
+
|
| 193 |
+
def flops(self, N):
|
| 194 |
+
# calculate flops for 1 window with token length of N
|
| 195 |
+
flops = 0
|
| 196 |
+
# qkv = self.qkv(x)
|
| 197 |
+
flops += N * self.dim * 3 * self.dim
|
| 198 |
+
# attn = (q @ k.transpose(-2, -1))
|
| 199 |
+
flops += self.num_heads * N * (self.dim // self.num_heads) * N
|
| 200 |
+
# x = (attn @ v)
|
| 201 |
+
flops += self.num_heads * N * N * (self.dim // self.num_heads)
|
| 202 |
+
# x = self.proj(x)
|
| 203 |
+
flops += N * self.dim * self.dim
|
| 204 |
+
return flops
|
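The relative_position_index buffer built in __init__ above maps every (query, key) pair inside a window to one of (2*Wh-1)*(2*Ww-1) bias-table entries. The same arithmetic for a tiny 2x2 window, as a standalone sketch:

import torch

Wh, Ww = 2, 2
coords = torch.stack(torch.meshgrid([torch.arange(Wh), torch.arange(Ww)]))  # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1)                                    # 2, Wh*Ww
rel = coords_flatten[:, :, None] - coords_flatten[:, None, :]                # 2, Wh*Ww, Wh*Ww
rel = rel.permute(1, 2, 0).contiguous()
rel[:, :, 0] += Wh - 1                      # shift row offsets to start from 0
rel[:, :, 1] += Ww - 1
rel[:, :, 0] *= 2 * Ww - 1
relative_position_index = rel.sum(-1)       # 4x4 matrix of indices into a (2*Wh-1)*(2*Ww-1) = 9 entry table
print(relative_position_index)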
| 205 |
+
|
| 206 |
+
|
| 207 |
+
class SwinTransformerBlock(nn.Module):
|
| 208 |
+
r"""Swin Transformer Block.
|
| 209 |
+
|
| 210 |
+
Args:
|
| 211 |
+
dim (int): Number of input channels.
|
| 212 |
+
input_resolution (tuple[int]): Input resolution.
|
| 213 |
+
num_heads (int): Number of attention heads.
|
| 214 |
+
window_size (int): Window size.
|
| 215 |
+
shift_size (int): Shift size for SW-MSA.
|
| 216 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
| 217 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
| 218 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
| 219 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
| 220 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
| 221 |
+
drop_path (float, optional): Stochastic depth rate. Default: 0.0
|
| 222 |
+
act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
|
| 223 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
| 224 |
+
"""
|
| 225 |
+
|
| 226 |
+
def __init__(
|
| 227 |
+
self,
|
| 228 |
+
dim,
|
| 229 |
+
input_resolution,
|
| 230 |
+
num_heads,
|
| 231 |
+
window_size=7,
|
| 232 |
+
shift_size=0,
|
| 233 |
+
mlp_ratio=4.0,
|
| 234 |
+
qkv_bias=True,
|
| 235 |
+
qk_scale=None,
|
| 236 |
+
drop=0.0,
|
| 237 |
+
attn_drop=0.0,
|
| 238 |
+
drop_path=0.0,
|
| 239 |
+
act_layer=nn.GELU,
|
| 240 |
+
norm_layer=nn.LayerNorm,
|
| 241 |
+
):
|
| 242 |
+
super().__init__()
|
| 243 |
+
self.dim = dim
|
| 244 |
+
self.input_resolution = input_resolution
|
| 245 |
+
self.num_heads = num_heads
|
| 246 |
+
self.window_size = window_size
|
| 247 |
+
self.shift_size = shift_size
|
| 248 |
+
self.mlp_ratio = mlp_ratio
|
| 249 |
+
if min(self.input_resolution) <= self.window_size:
|
| 250 |
+
# if window size is larger than input resolution, we don't partition windows
|
| 251 |
+
self.shift_size = 0
|
| 252 |
+
self.window_size = min(self.input_resolution)
|
| 253 |
+
assert (
|
| 254 |
+
0 <= self.shift_size < self.window_size
|
| 255 |
+
), "shift_size must in 0-window_size"
|
| 256 |
+
|
| 257 |
+
self.norm1 = norm_layer(dim)
|
| 258 |
+
self.attn = WindowAttention(
|
| 259 |
+
dim,
|
| 260 |
+
window_size=to_2tuple(self.window_size),
|
| 261 |
+
num_heads=num_heads,
|
| 262 |
+
qkv_bias=qkv_bias,
|
| 263 |
+
qk_scale=qk_scale,
|
| 264 |
+
attn_drop=attn_drop,
|
| 265 |
+
proj_drop=drop,
|
| 266 |
+
)
|
| 267 |
+
|
| 268 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
| 269 |
+
self.norm2 = norm_layer(dim)
|
| 270 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
| 271 |
+
self.mlp = Mlp(
|
| 272 |
+
in_features=dim,
|
| 273 |
+
hidden_features=mlp_hidden_dim,
|
| 274 |
+
act_layer=act_layer,
|
| 275 |
+
drop=drop,
|
| 276 |
+
)
|
| 277 |
+
|
| 278 |
+
if self.shift_size > 0:
|
| 279 |
+
# calculate attention mask for SW-MSA
|
| 280 |
+
H, W = self.input_resolution
|
| 281 |
+
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
|
| 282 |
+
h_slices = (
|
| 283 |
+
slice(0, -self.window_size),
|
| 284 |
+
slice(-self.window_size, -self.shift_size),
|
| 285 |
+
slice(-self.shift_size, None),
|
| 286 |
+
)
|
| 287 |
+
w_slices = (
|
| 288 |
+
slice(0, -self.window_size),
|
| 289 |
+
slice(-self.window_size, -self.shift_size),
|
| 290 |
+
slice(-self.shift_size, None),
|
| 291 |
+
)
|
| 292 |
+
cnt = 0
|
| 293 |
+
for h in h_slices:
|
| 294 |
+
for w in w_slices:
|
| 295 |
+
img_mask[:, h, w, :] = cnt
|
| 296 |
+
cnt += 1
|
| 297 |
+
|
| 298 |
+
mask_windows = window_partition(
|
| 299 |
+
img_mask, self.window_size
|
| 300 |
+
) # nW, window_size, window_size, 1
|
| 301 |
+
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
|
| 302 |
+
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
|
| 303 |
+
attn_mask = attn_mask.masked_fill(
|
| 304 |
+
attn_mask != 0, float(-100.0)
|
| 305 |
+
).masked_fill(attn_mask == 0, float(0.0))
|
| 306 |
+
else:
|
| 307 |
+
attn_mask = None
|
| 308 |
+
|
| 309 |
+
self.register_buffer("attn_mask", attn_mask)
|
| 310 |
+
|
| 311 |
+
def forward(self, x):
|
| 312 |
+
H, W = self.input_resolution
|
| 313 |
+
B, L, C = x.shape
|
| 314 |
+
assert L == H * W, "input feature has wrong size"
|
| 315 |
+
|
| 316 |
+
shortcut = x
|
| 317 |
+
x = self.norm1(x)
|
| 318 |
+
x = x.view(B, H, W, C)
|
| 319 |
+
|
| 320 |
+
# cyclic shift
|
| 321 |
+
if self.shift_size > 0:
|
| 322 |
+
shifted_x = torch.roll(
|
| 323 |
+
x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)
|
| 324 |
+
)
|
| 325 |
+
else:
|
| 326 |
+
shifted_x = x
|
| 327 |
+
|
| 328 |
+
# partition windows
|
| 329 |
+
x_windows = window_partition(
|
| 330 |
+
shifted_x, self.window_size
|
| 331 |
+
) # nW*B, window_size, window_size, C
|
| 332 |
+
x_windows = x_windows.view(
|
| 333 |
+
-1, self.window_size * self.window_size, C
|
| 334 |
+
) # nW*B, window_size*window_size, C
|
| 335 |
+
|
| 336 |
+
# W-MSA/SW-MSA
|
| 337 |
+
attn_windows = self.attn(
|
| 338 |
+
x_windows, mask=self.attn_mask
|
| 339 |
+
) # nW*B, window_size*window_size, C
|
| 340 |
+
|
| 341 |
+
# merge windows
|
| 342 |
+
attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
|
| 343 |
+
shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
|
| 344 |
+
|
| 345 |
+
# reverse cyclic shift
|
| 346 |
+
if self.shift_size > 0:
|
| 347 |
+
x = torch.roll(
|
| 348 |
+
shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)
|
| 349 |
+
)
|
| 350 |
+
else:
|
| 351 |
+
x = shifted_x
|
| 352 |
+
x = x.view(B, H * W, C)
|
| 353 |
+
|
| 354 |
+
# FFN
|
| 355 |
+
x = shortcut + self.drop_path(x)
|
| 356 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
| 357 |
+
|
| 358 |
+
return x
|
| 359 |
+
|
| 360 |
+
def extra_repr(self) -> str:
|
| 361 |
+
return (
|
| 362 |
+
f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, "
|
| 363 |
+
f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
|
| 364 |
+
)
|
| 365 |
+
|
| 366 |
+
def flops(self):
|
| 367 |
+
flops = 0
|
| 368 |
+
H, W = self.input_resolution
|
| 369 |
+
# norm1
|
| 370 |
+
flops += self.dim * H * W
|
| 371 |
+
# W-MSA/SW-MSA
|
| 372 |
+
nW = H * W / self.window_size / self.window_size
|
| 373 |
+
flops += nW * self.attn.flops(self.window_size * self.window_size)
|
| 374 |
+
# mlp
|
| 375 |
+
flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
|
| 376 |
+
# norm2
|
| 377 |
+
flops += self.dim * H * W
|
| 378 |
+
return flops
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
class PatchMerging(nn.Module):
|
| 382 |
+
r"""Patch Merging Layer.
|
| 383 |
+
|
| 384 |
+
Args:
|
| 385 |
+
input_resolution (tuple[int]): Resolution of input feature.
|
| 386 |
+
dim (int): Number of input channels.
|
| 387 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
| 388 |
+
"""
|
| 389 |
+
|
| 390 |
+
def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
|
| 391 |
+
super().__init__()
|
| 392 |
+
self.input_resolution = input_resolution
|
| 393 |
+
self.dim = dim
|
| 394 |
+
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
|
| 395 |
+
self.norm = norm_layer(4 * dim)
|
| 396 |
+
|
| 397 |
+
def forward(self, x):
|
| 398 |
+
"""
|
| 399 |
+
x: B, H*W, C
|
| 400 |
+
"""
|
| 401 |
+
H, W = self.input_resolution
|
| 402 |
+
B, L, C = x.shape
|
| 403 |
+
assert L == H * W, "input feature has wrong size"
|
| 404 |
+
assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
|
| 405 |
+
|
| 406 |
+
x = x.view(B, H, W, C)
|
| 407 |
+
|
| 408 |
+
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
|
| 409 |
+
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
|
| 410 |
+
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
|
| 411 |
+
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
|
| 412 |
+
x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
|
| 413 |
+
x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
|
| 414 |
+
|
| 415 |
+
x = self.norm(x)
|
| 416 |
+
x = self.reduction(x)
|
| 417 |
+
|
| 418 |
+
return x
|
| 419 |
+
|
| 420 |
+
def extra_repr(self) -> str:
|
| 421 |
+
return f"input_resolution={self.input_resolution}, dim={self.dim}"
|
| 422 |
+
|
| 423 |
+
def flops(self):
|
| 424 |
+
H, W = self.input_resolution
|
| 425 |
+
flops = H * W * self.dim
|
| 426 |
+
flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
|
| 427 |
+
return flops
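A hedged usage sketch (not in the diff) of the PatchMerging module defined above; the resolution and channel count are illustrative:

import torch

merge = PatchMerging(input_resolution=(56, 56), dim=96)
x = torch.randn(2, 56 * 56, 96)  # B, H*W, C
y = merge(x)
print(y.shape)  # torch.Size([2, 784, 192]) -> each spatial side halved, channels doubled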
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
class BasicLayer(nn.Module):
|
| 431 |
+
"""A basic Swin Transformer layer for one stage.
|
| 432 |
+
|
| 433 |
+
Args:
|
| 434 |
+
dim (int): Number of input channels.
|
| 435 |
+
input_resolution (tuple[int]): Input resolution.
|
| 436 |
+
depth (int): Number of blocks.
|
| 437 |
+
num_heads (int): Number of attention heads.
|
| 438 |
+
window_size (int): Local window size.
|
| 439 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
|
| 440 |
+
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
|
| 441 |
+
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
|
| 442 |
+
drop (float, optional): Dropout rate. Default: 0.0
|
| 443 |
+
attn_drop (float, optional): Attention dropout rate. Default: 0.0
|
| 444 |
+
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
|
| 445 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
|
| 446 |
+
downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
|
| 447 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
|
| 448 |
+
"""
|
| 449 |
+
|
| 450 |
+
def __init__(
|
| 451 |
+
self,
|
| 452 |
+
dim,
|
| 453 |
+
input_resolution,
|
| 454 |
+
depth,
|
| 455 |
+
num_heads,
|
| 456 |
+
window_size,
|
| 457 |
+
mlp_ratio=4.0,
|
| 458 |
+
qkv_bias=True,
|
| 459 |
+
qk_scale=None,
|
| 460 |
+
drop=0.0,
|
| 461 |
+
attn_drop=0.0,
|
| 462 |
+
drop_path=0.0,
|
| 463 |
+
norm_layer=nn.LayerNorm,
|
| 464 |
+
downsample=None,
|
| 465 |
+
use_checkpoint=False,
|
| 466 |
+
):
|
| 467 |
+
super().__init__()
|
| 468 |
+
self.dim = dim
|
| 469 |
+
self.input_resolution = input_resolution
|
| 470 |
+
self.depth = depth
|
| 471 |
+
self.use_checkpoint = use_checkpoint
|
| 472 |
+
|
| 473 |
+
# build blocks
|
| 474 |
+
self.blocks = nn.ModuleList(
|
| 475 |
+
[
|
| 476 |
+
SwinTransformerBlock(
|
| 477 |
+
dim=dim,
|
| 478 |
+
input_resolution=input_resolution,
|
| 479 |
+
num_heads=num_heads,
|
| 480 |
+
window_size=window_size,
|
| 481 |
+
shift_size=0 if (i % 2 == 0) else window_size // 2,
|
| 482 |
+
mlp_ratio=mlp_ratio,
|
| 483 |
+
qkv_bias=qkv_bias,
|
| 484 |
+
qk_scale=qk_scale,
|
| 485 |
+
drop=drop,
|
| 486 |
+
attn_drop=attn_drop,
|
| 487 |
+
drop_path=drop_path[i]
|
| 488 |
+
if isinstance(drop_path, list)
|
| 489 |
+
else drop_path,
|
| 490 |
+
norm_layer=norm_layer,
|
| 491 |
+
)
|
| 492 |
+
for i in range(depth)
|
| 493 |
+
]
|
| 494 |
+
)
|
| 495 |
+
|
| 496 |
+
# patch merging layer
|
| 497 |
+
if downsample is not None:
|
| 498 |
+
self.downsample = downsample(
|
| 499 |
+
input_resolution, dim=dim, norm_layer=norm_layer
|
| 500 |
+
)
|
| 501 |
+
else:
|
| 502 |
+
self.downsample = None
|
| 503 |
+
|
| 504 |
+
def forward(self, x):
|
| 505 |
+
for blk in self.blocks:
|
| 506 |
+
if self.use_checkpoint:
|
| 507 |
+
x = checkpoint.checkpoint(blk, x)
|
| 508 |
+
else:
|
| 509 |
+
x = blk(x)
|
| 510 |
+
if self.downsample is not None:
|
| 511 |
+
x = self.downsample(x)
|
| 512 |
+
return x
|
| 513 |
+
|
| 514 |
+
def extra_repr(self) -> str:
|
| 515 |
+
return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
|
| 516 |
+
|
| 517 |
+
def flops(self):
|
| 518 |
+
flops = 0
|
| 519 |
+
for blk in self.blocks:
|
| 520 |
+
flops += blk.flops()
|
| 521 |
+
if self.downsample is not None:
|
| 522 |
+
flops += self.downsample.flops()
|
| 523 |
+
return flops
|
| 524 |
+
|
| 525 |
+
|
| 526 |
+
class PatchEmbed(nn.Module):
|
| 527 |
+
r"""Image to Patch Embedding
|
| 528 |
+
|
| 529 |
+
Args:
|
| 530 |
+
img_size (int): Image size. Default: 224.
|
| 531 |
+
patch_size (int): Patch token size. Default: 4.
|
| 532 |
+
in_chans (int): Number of input image channels. Default: 3.
|
| 533 |
+
embed_dim (int): Number of linear projection output channels. Default: 96.
|
| 534 |
+
norm_layer (nn.Module, optional): Normalization layer. Default: None
|
| 535 |
+
"""
|
| 536 |
+
|
| 537 |
+
def __init__(
|
| 538 |
+
self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None
|
| 539 |
+
):
|
| 540 |
+
super().__init__()
|
| 541 |
+
img_size = to_2tuple(img_size)
|
| 542 |
+
patch_size = to_2tuple(patch_size)
|
| 543 |
+
patches_resolution = [
|
| 544 |
+
img_size[0] // patch_size[0],
|
| 545 |
+
img_size[1] // patch_size[1],
|
| 546 |
+
]
|
| 547 |
+
self.img_size = img_size
|
| 548 |
+
self.patch_size = patch_size
|
| 549 |
+
self.patches_resolution = patches_resolution
|
| 550 |
+
self.num_patches = patches_resolution[0] * patches_resolution[1]
|
| 551 |
+
|
| 552 |
+
self.in_chans = in_chans
|
| 553 |
+
self.embed_dim = embed_dim
|
| 554 |
+
|
| 555 |
+
self.proj = nn.Conv2d(
|
| 556 |
+
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
|
| 557 |
+
)
|
| 558 |
+
if norm_layer is not None:
|
| 559 |
+
self.norm = norm_layer(embed_dim)
|
| 560 |
+
else:
|
| 561 |
+
self.norm = None
|
| 562 |
+
|
| 563 |
+
def forward(self, x):
|
| 564 |
+
B, C, H, W = x.shape
|
| 565 |
+
# FIXME look at relaxing size constraints
|
| 566 |
+
assert (
|
| 567 |
+
H == self.img_size[0] and W == self.img_size[1]
|
| 568 |
+
), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
|
| 569 |
+
x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C
|
| 570 |
+
if self.norm is not None:
|
| 571 |
+
x = self.norm(x)
|
| 572 |
+
return x
|
| 573 |
+
|
| 574 |
+
def flops(self):
|
| 575 |
+
Ho, Wo = self.patches_resolution
|
| 576 |
+
flops = (
|
| 577 |
+
Ho
|
| 578 |
+
* Wo
|
| 579 |
+
* self.embed_dim
|
| 580 |
+
* self.in_chans
|
| 581 |
+
* (self.patch_size[0] * self.patch_size[1])
|
| 582 |
+
)
|
| 583 |
+
if self.norm is not None:
|
| 584 |
+
flops += Ho * Wo * self.embed_dim
|
| 585 |
+
return flops
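A hedged usage sketch (not in the diff) of the PatchEmbed module above; with img_size=224 and patch_size=4 (illustrative values) it yields 56*56 patch tokens:

import torch

embed = PatchEmbed(img_size=224, patch_size=4, in_chans=3, embed_dim=96)
tokens = embed(torch.randn(1, 3, 224, 224))
print(embed.patches_resolution, tokens.shape)  # [56, 56] torch.Size([1, 3136, 96])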
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
class SwinTransformer(nn.Module):
|
| 589 |
+
r"""Swin Transformer
|
| 590 |
+
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
|
| 591 |
+
https://arxiv.org/pdf/2103.14030
|
| 592 |
+
|
| 593 |
+
Args:
|
| 594 |
+
img_size (int | tuple(int)): Input image size. Default 224
|
| 595 |
+
patch_size (int | tuple(int)): Patch size. Default: 4
|
| 596 |
+
in_chans (int): Number of input image channels. Default: 3
|
| 597 |
+
num_classes (int): Number of classes for classification head. Default: 1000
|
| 598 |
+
embed_dim (int): Patch embedding dimension. Default: 96
|
| 599 |
+
depths (tuple(int)): Depth of each Swin Transformer layer.
|
| 600 |
+
num_heads (tuple(int)): Number of attention heads in different layers.
|
| 601 |
+
window_size (int): Window size. Default: 7
|
| 602 |
+
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
|
| 603 |
+
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
|
| 604 |
+
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
|
| 605 |
+
drop_rate (float): Dropout rate. Default: 0
|
| 606 |
+
attn_drop_rate (float): Attention dropout rate. Default: 0
|
| 607 |
+
drop_path_rate (float): Stochastic depth rate. Default: 0.1
|
| 608 |
+
norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
|
| 609 |
+
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
|
| 610 |
+
patch_norm (bool): If True, add normalization after patch embedding. Default: True
|
| 611 |
+
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
|
| 612 |
+
"""
|
| 613 |
+
|
| 614 |
+
def __init__(
|
| 615 |
+
self,
|
| 616 |
+
img_size=224,
|
| 617 |
+
patch_size=4,
|
| 618 |
+
in_chans=3,
|
| 619 |
+
num_classes=1000,
|
| 620 |
+
embed_dim=96,
|
| 621 |
+
depths=[2, 2, 6, 2],
|
| 622 |
+
num_heads=[3, 6, 12, 24],
|
| 623 |
+
window_size=7,
|
| 624 |
+
mlp_ratio=4.0,
|
| 625 |
+
qkv_bias=True,
|
| 626 |
+
qk_scale=None,
|
| 627 |
+
drop_rate=0.0,
|
| 628 |
+
attn_drop_rate=0.0,
|
| 629 |
+
drop_path_rate=0.1,
|
| 630 |
+
norm_layer=nn.LayerNorm,
|
| 631 |
+
ape=False,
|
| 632 |
+
patch_norm=True,
|
| 633 |
+
use_checkpoint=False,
|
| 634 |
+
**kwargs,
|
| 635 |
+
):
|
| 636 |
+
super().__init__()
|
| 637 |
+
|
| 638 |
+
self.num_classes = num_classes
|
| 639 |
+
self.num_layers = len(depths)
|
| 640 |
+
self.embed_dim = embed_dim
|
| 641 |
+
self.ape = ape
|
| 642 |
+
self.patch_norm = patch_norm
|
| 643 |
+
self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
|
| 644 |
+
self.mlp_ratio = mlp_ratio
|
| 645 |
+
|
| 646 |
+
# split image into non-overlapping patches
|
| 647 |
+
self.patch_embed = PatchEmbed(
|
| 648 |
+
img_size=img_size,
|
| 649 |
+
patch_size=patch_size,
|
| 650 |
+
in_chans=in_chans,
|
| 651 |
+
embed_dim=embed_dim,
|
| 652 |
+
norm_layer=norm_layer if self.patch_norm else None,
|
| 653 |
+
)
|
| 654 |
+
num_patches = self.patch_embed.num_patches
|
| 655 |
+
patches_resolution = self.patch_embed.patches_resolution
|
| 656 |
+
self.patches_resolution = patches_resolution
|
| 657 |
+
|
| 658 |
+
# absolute position embedding
|
| 659 |
+
if self.ape:
|
| 660 |
+
self.absolute_pos_embed = nn.Parameter(
|
| 661 |
+
torch.zeros(1, num_patches, embed_dim)
|
| 662 |
+
)
|
| 663 |
+
trunc_normal_(self.absolute_pos_embed, std=0.02)
|
| 664 |
+
|
| 665 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
| 666 |
+
|
| 667 |
+
# stochastic depth
|
| 668 |
+
dpr = [
|
| 669 |
+
x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
|
| 670 |
+
] # stochastic depth decay rule
|
| 671 |
+
|
| 672 |
+
# build layers
|
| 673 |
+
self.layers = nn.ModuleList()
|
| 674 |
+
for i_layer in range(self.num_layers):
|
| 675 |
+
layer = BasicLayer(
|
| 676 |
+
dim=int(embed_dim * 2**i_layer),
|
| 677 |
+
input_resolution=(
|
| 678 |
+
patches_resolution[0] // (2**i_layer),
|
| 679 |
+
patches_resolution[1] // (2**i_layer),
|
| 680 |
+
),
|
| 681 |
+
depth=depths[i_layer],
|
| 682 |
+
num_heads=num_heads[i_layer],
|
| 683 |
+
window_size=window_size,
|
| 684 |
+
mlp_ratio=self.mlp_ratio,
|
| 685 |
+
qkv_bias=qkv_bias,
|
| 686 |
+
qk_scale=qk_scale,
|
| 687 |
+
drop=drop_rate,
|
| 688 |
+
attn_drop=attn_drop_rate,
|
| 689 |
+
drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
|
| 690 |
+
norm_layer=norm_layer,
|
| 691 |
+
downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
|
| 692 |
+
use_checkpoint=use_checkpoint,
|
| 693 |
+
)
|
| 694 |
+
self.layers.append(layer)
|
| 695 |
+
|
| 696 |
+
self.norm = norm_layer(self.num_features)
|
| 697 |
+
self.avgpool = nn.AdaptiveAvgPool1d(1)
|
| 698 |
+
# self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
|
| 699 |
+
|
| 700 |
+
self.apply(self._init_weights)
|
| 701 |
+
|
| 702 |
+
def _init_weights(self, m):
|
| 703 |
+
if isinstance(m, nn.Linear):
|
| 704 |
+
trunc_normal_(m.weight, std=0.02)
|
| 705 |
+
if isinstance(m, nn.Linear) and m.bias is not None:
|
| 706 |
+
nn.init.constant_(m.bias, 0)
|
| 707 |
+
elif isinstance(m, nn.LayerNorm):
|
| 708 |
+
nn.init.constant_(m.bias, 0)
|
| 709 |
+
nn.init.constant_(m.weight, 1.0)
|
| 710 |
+
|
| 711 |
+
@torch.jit.ignore
|
| 712 |
+
def no_weight_decay(self):
|
| 713 |
+
return {"absolute_pos_embed"}
|
| 714 |
+
|
| 715 |
+
@torch.jit.ignore
|
| 716 |
+
def no_weight_decay_keywords(self):
|
| 717 |
+
return {"relative_position_bias_table"}
|
| 718 |
+
|
| 719 |
+
def forward(self, x, idx_to_group_img=None, image_atts=None, **kwargs):
|
| 720 |
+
x = self.patch_embed(x)
|
| 721 |
+
if self.ape:
|
| 722 |
+
x = x + self.absolute_pos_embed
|
| 723 |
+
x = self.pos_drop(x)
|
| 724 |
+
|
| 725 |
+
for layer in self.layers:
|
| 726 |
+
x = layer(x)
|
| 727 |
+
|
| 728 |
+
x = self.norm(x) # B L C
|
| 729 |
+
|
| 730 |
+
x_cls = self.avgpool(x.transpose(1, 2)) # B C 1
|
| 731 |
+
|
| 732 |
+
if idx_to_group_img is None:
|
| 733 |
+
return torch.cat([x_cls.transpose(1, 2), x], dim=1)
|
| 734 |
+
else:
|
| 735 |
+
x_bs = torch.gather(
|
| 736 |
+
x,
|
| 737 |
+
dim=0,
|
| 738 |
+
index=idx_to_group_img.view(-1, 1, 1).expand(
|
| 739 |
+
-1, x.shape[1], x.shape[2]
|
| 740 |
+
),
|
| 741 |
+
)
|
| 742 |
+
weights = image_atts[:, 1:].unsqueeze(2) # B L 1
|
| 743 |
+
x_bs_cls = torch.sum(
|
| 744 |
+
(weights * x_bs).transpose(1, 2), dim=-1, keepdim=True
|
| 745 |
+
) # B C 1
|
| 746 |
+
x_bs_cls = x_bs_cls / torch.sum(
|
| 747 |
+
weights.transpose(1, 2), dim=-1, keepdim=True
|
| 748 |
+
) # avgpool
|
| 749 |
+
|
| 750 |
+
return torch.cat([x_bs_cls.transpose(1, 2), x_bs], dim=1), torch.cat(
|
| 751 |
+
[x_cls.transpose(1, 2), x], dim=1
|
| 752 |
+
)
|
| 753 |
+
|
| 754 |
+
def flops(self):
|
| 755 |
+
flops = 0
|
| 756 |
+
flops += self.patch_embed.flops()
|
| 757 |
+
for i, layer in enumerate(self.layers):
|
| 758 |
+
flops += layer.flops()
|
| 759 |
+
flops += (
|
| 760 |
+
self.num_features
|
| 761 |
+
* self.patches_resolution[0]
|
| 762 |
+
* self.patches_resolution[1]
|
| 763 |
+
// (2**self.num_layers)
|
| 764 |
+
)
|
| 765 |
+
flops += self.num_features * self.num_classes
|
| 766 |
+
return flops
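A hedged usage sketch (not in the diff): with the default depths, the forward above returns an average-pooled token concatenated in front of the 7x7 patch tokens; the input size here is an illustrative assumption:

import torch

model = SwinTransformer(img_size=224, window_size=7)  # defaults: embed_dim=96, depths=[2, 2, 6, 2]
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 50, 768]) -> 1 pooled token + 49 patch tokens, C = 96 * 2**3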
|
| 767 |
+
|
| 768 |
+
|
| 769 |
+
def interpolate_relative_pos_embed(rel_pos_bias, dst_num_pos, param_name=""):
|
| 770 |
+
# from: https://github.com/microsoft/unilm/blob/8a0a1c1f4e7326938ea7580a00d56d7f17d65612/beit/run_class_finetuning.py#L348
|
| 771 |
+
|
| 772 |
+
# rel_pos_bias: relative_position_bias_table
|
| 773 |
+
src_num_pos, num_attn_heads = rel_pos_bias.size()
|
| 774 |
+
|
| 775 |
+
num_extra_tokens = 0
|
| 776 |
+
src_size = int((src_num_pos - num_extra_tokens) ** 0.5)
|
| 777 |
+
dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5)
|
| 778 |
+
if src_size != dst_size:
|
| 779 |
+
print(
|
| 780 |
+
"Position interpolate %s from %dx%d to %dx%d"
|
| 781 |
+
% (param_name, src_size, src_size, dst_size, dst_size)
|
| 782 |
+
)
|
| 783 |
+
|
| 784 |
+
# extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
|
| 785 |
+
# rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]
|
| 786 |
+
|
| 787 |
+
def geometric_progression(a, r, n):
|
| 788 |
+
return a * (1.0 - r**n) / (1.0 - r)
|
| 789 |
+
|
| 790 |
+
left, right = 1.01, 1.5
|
| 791 |
+
while right - left > 1e-6:
|
| 792 |
+
q = (left + right) / 2.0
|
| 793 |
+
gp = geometric_progression(1, q, src_size // 2)
|
| 794 |
+
if gp > dst_size // 2:
|
| 795 |
+
right = q
|
| 796 |
+
else:
|
| 797 |
+
left = q
|
| 798 |
+
|
| 799 |
+
# if q > 1.090307:
|
| 800 |
+
# q = 1.090307
|
| 801 |
+
|
| 802 |
+
dis = []
|
| 803 |
+
cur = 1
|
| 804 |
+
for i in range(src_size // 2):
|
| 805 |
+
dis.append(cur)
|
| 806 |
+
cur += q ** (i + 1)
|
| 807 |
+
|
| 808 |
+
r_ids = [-_ for _ in reversed(dis)]
|
| 809 |
+
|
| 810 |
+
x = r_ids + [0] + dis
|
| 811 |
+
y = r_ids + [0] + dis
|
| 812 |
+
|
| 813 |
+
t = dst_size // 2.0
|
| 814 |
+
dx = np.arange(-t, t + 0.1, 1.0)
|
| 815 |
+
dy = np.arange(-t, t + 0.1, 1.0)
|
| 816 |
+
|
| 817 |
+
# print("Original positions = %s" % str(x))
|
| 818 |
+
# print("Target positions = %s" % str(dx))
|
| 819 |
+
|
| 820 |
+
all_rel_pos_bias = []
|
| 821 |
+
|
| 822 |
+
for i in range(num_attn_heads):
|
| 823 |
+
z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy()
|
| 824 |
+
f = interpolate.interp2d(x, y, z, kind="cubic")
|
| 825 |
+
all_rel_pos_bias.append(
|
| 826 |
+
torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device)
|
| 827 |
+
)
|
| 828 |
+
|
| 829 |
+
rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)
|
| 830 |
+
|
| 831 |
+
return rel_pos_bias
|
tag2text/models/tag2text.py
ADDED
|
@@ -0,0 +1,274 @@
| 1 |
+
"""
|
| 2 |
+
* Tag2Text
|
| 3 |
+
* Written by Xinyu Huang
|
| 4 |
+
"""
|
| 5 |
+
import json
|
| 6 |
+
import warnings
|
| 7 |
+
|
| 8 |
+
import numpy as np
|
| 9 |
+
import torch
|
| 10 |
+
from models.bert import BertConfig
|
| 11 |
+
from models.bert import BertLMHeadModel
|
| 12 |
+
from models.bert import BertModel
|
| 13 |
+
from models.swin_transformer import SwinTransformer
|
| 14 |
+
from models.utils import *
|
| 15 |
+
from models.vit import VisionTransformer
|
| 16 |
+
from torch import nn
|
| 17 |
+
|
| 18 |
+
warnings.filterwarnings("ignore")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class Tag2Text_Caption(nn.Module):
|
| 22 |
+
def __init__(
|
| 23 |
+
self,
|
| 24 |
+
med_config=f"{CONFIG_PATH}/configs/med_config.json",
|
| 25 |
+
image_size=384,
|
| 26 |
+
vit="base",
|
| 27 |
+
vit_grad_ckpt=False,
|
| 28 |
+
vit_ckpt_layer=0,
|
| 29 |
+
prompt="a picture of ",
|
| 30 |
+
threshold=0.68,
|
| 31 |
+
delete_tag_index=[],
|
| 32 |
+
tag_list=f"{CONFIG_PATH}/data/tag_list.txt",
|
| 33 |
+
):
|
| 34 |
+
r"""Tag2Text inference module, both captioning and tagging are included.
|
| 35 |
+
Tag2Text is an efficient and controllable vision-language pre-training framework.
|
| 36 |
+
Described in the paper "Tag2Text: Guiding Vision-Language Model via Image Tagging" https://arxiv.org/abs/2303.05657
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
med_config (str): path for the mixture of encoder-decoder model's configuration file
|
| 40 |
+
image_size (int): input image size
|
| 41 |
+
vit (str): model size of vision transformer
|
| 42 |
+
threshold (float): tagging threshold
|
| 43 |
+
delete_tag_index (list): delete some tags that may disturb captioning
|
| 44 |
+
"""
|
| 45 |
+
super().__init__()
|
| 46 |
+
|
| 47 |
+
# create image encoder
|
| 48 |
+
if vit == "swin_b":
|
| 49 |
+
if image_size == 224:
|
| 50 |
+
vision_config_path = f"{CONFIG_PATH}/configs/swin/config_swinB_224.json"
|
| 51 |
+
elif image_size == 384:
|
| 52 |
+
vision_config_path = f"{CONFIG_PATH}/configs/swin/config_swinB_384.json"
|
| 53 |
+
vision_config = read_json(vision_config_path)
|
| 54 |
+
assert image_size == vision_config["image_res"]
|
| 55 |
+
# assert config['patch_size'] == 32
|
| 56 |
+
vision_width = vision_config["vision_width"]
|
| 57 |
+
|
| 58 |
+
self.visual_encoder = SwinTransformer(
|
| 59 |
+
img_size=vision_config["image_res"],
|
| 60 |
+
patch_size=4,
|
| 61 |
+
in_chans=3,
|
| 62 |
+
embed_dim=vision_config["embed_dim"],
|
| 63 |
+
depths=vision_config["depths"],
|
| 64 |
+
num_heads=vision_config["num_heads"],
|
| 65 |
+
window_size=vision_config["window_size"],
|
| 66 |
+
mlp_ratio=4.0,
|
| 67 |
+
qkv_bias=True,
|
| 68 |
+
drop_rate=0.0,
|
| 69 |
+
drop_path_rate=0.1,
|
| 70 |
+
ape=False,
|
| 71 |
+
patch_norm=True,
|
| 72 |
+
use_checkpoint=False,
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
else:
|
| 76 |
+
self.visual_encoder, vision_width = create_vit(
|
| 77 |
+
vit, image_size, vit_grad_ckpt, vit_ckpt_layer
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
# create tokenizer
|
| 81 |
+
self.tokenizer = init_tokenizer()
|
| 82 |
+
|
| 83 |
+
# Tag2Text employ encoder-decoder architecture for image-tag-text generation: image-tag interaction encoder and image-tag-text decoder
|
| 84 |
+
# create image-tag interaction encoder
|
| 85 |
+
encoder_config = BertConfig.from_json_file(med_config)
|
| 86 |
+
encoder_config.encoder_width = vision_width
|
| 87 |
+
self.tag_encoder = BertModel(config=encoder_config, add_pooling_layer=False)
|
| 88 |
+
|
| 89 |
+
# create image-tag-text decoder
|
| 90 |
+
decoder_config = BertConfig.from_json_file(med_config)
|
| 91 |
+
self.text_decoder = BertLMHeadModel(config=decoder_config)
|
| 92 |
+
|
| 93 |
+
self.delete_tag_index = delete_tag_index
|
| 94 |
+
self.prompt = prompt
|
| 95 |
+
self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1
|
| 96 |
+
|
| 97 |
+
# load tag list
|
| 98 |
+
self.tag_list = self.load_tag_list(tag_list)
|
| 99 |
+
|
| 100 |
+
# create image-tag recognition decoder
|
| 101 |
+
self.threshold = threshold
|
| 102 |
+
self.num_class = len(self.tag_list)
|
| 103 |
+
q2l_config = BertConfig.from_json_file(f"{CONFIG_PATH}/configs/q2l_config.json")
|
| 104 |
+
q2l_config.encoder_width = vision_width
|
| 105 |
+
self.tagging_head = BertModel(config=q2l_config, add_pooling_layer=False)
|
| 106 |
+
self.tagging_head.resize_token_embeddings(len(self.tokenizer))
|
| 107 |
+
self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size)
|
| 108 |
+
self.fc = GroupWiseLinear(self.num_class, q2l_config.hidden_size, bias=True)
|
| 109 |
+
self.del_selfattention()
|
| 110 |
+
|
| 111 |
+
# share the weights of the lowest 2 layers of the "image-tag interaction encoder" with the "image-tag recognition decoder"
|
| 112 |
+
tie_encoder_decoder_weights(self.tag_encoder, self.tagging_head, "", " ")
|
| 113 |
+
|
| 114 |
+
def load_tag_list(self, tag_list_file):
|
| 115 |
+
with open(tag_list_file) as f:
|
| 116 |
+
tag_list = f.read().splitlines()
|
| 117 |
+
tag_list = np.array(tag_list)
|
| 118 |
+
return tag_list
|
| 119 |
+
|
| 120 |
+
# delete the self-attention layers of the image-tag recognition decoder to reduce computation, following Query2Label
|
| 121 |
+
def del_selfattention(self):
|
| 122 |
+
del self.tagging_head.embeddings
|
| 123 |
+
for layer in self.tagging_head.encoder.layer:
|
| 124 |
+
del layer.attention
|
| 125 |
+
|
| 126 |
+
def generate(
|
| 127 |
+
self,
|
| 128 |
+
image,
|
| 129 |
+
sample=False,
|
| 130 |
+
num_beams=3,
|
| 131 |
+
max_length=30,
|
| 132 |
+
min_length=10,
|
| 133 |
+
top_p=0.9,
|
| 134 |
+
repetition_penalty=1.0,
|
| 135 |
+
tag_input=None,
|
| 136 |
+
return_tag_predict=False,
|
| 137 |
+
):
|
| 138 |
+
image_embeds = self.visual_encoder(image)
|
| 139 |
+
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
|
| 140 |
+
image.device
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
# if the user did not specify tags, recognize image tags with the image-tag recognition decoder
|
| 144 |
+
if tag_input is None:
|
| 145 |
+
image_cls_embeds = image_embeds[:, 0, :]
|
| 146 |
+
image_spatial_embeds = image_embeds[:, 1:, :]
|
| 147 |
+
|
| 148 |
+
bs = image_spatial_embeds.shape[0]
|
| 149 |
+
label_embed = self.label_embed.weight.unsqueeze(0).repeat(bs, 1, 1)
|
| 150 |
+
tagging_embed = self.tagging_head(
|
| 151 |
+
encoder_embeds=label_embed,
|
| 152 |
+
encoder_hidden_states=image_embeds,
|
| 153 |
+
encoder_attention_mask=image_atts,
|
| 154 |
+
return_dict=False,
|
| 155 |
+
mode="tagging",
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
logits = self.fc(tagging_embed[0])
|
| 159 |
+
|
| 160 |
+
targets = torch.where(
|
| 161 |
+
torch.sigmoid(logits) > self.threshold,
|
| 162 |
+
torch.tensor(1.0).to(image.device),
|
| 163 |
+
torch.zeros(self.num_class).to(image.device),
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
tag = targets.cpu().numpy()
|
| 167 |
+
|
| 168 |
+
# delete some tags that may disturb captioning
|
| 169 |
+
tag[:, self.delete_tag_index] = 0
|
| 170 |
+
|
| 171 |
+
tag_input = []
|
| 172 |
+
for b in range(bs):
|
| 173 |
+
index = np.argwhere(tag[b] == 1)
|
| 174 |
+
token = self.tag_list[index].squeeze(axis=1)
|
| 175 |
+
tag_input.append(" | ".join(token))
|
| 176 |
+
|
| 177 |
+
tag_output = tag_input
|
| 178 |
+
|
| 179 |
+
# beam search for text generation (default)
|
| 180 |
+
if not sample:
|
| 181 |
+
image_embeds = image_embeds.repeat_interleave(num_beams, dim=0)
|
| 182 |
+
tag_input_temp = []
|
| 183 |
+
for tag in tag_input:
|
| 184 |
+
for i in range(num_beams):
|
| 185 |
+
tag_input_temp.append(tag)
|
| 186 |
+
tag_input = tag_input_temp
|
| 187 |
+
|
| 188 |
+
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(
|
| 189 |
+
image.device
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
# tokenize the input tags
|
| 193 |
+
tag_input_tokenzier = self.tokenizer(
|
| 194 |
+
tag_input,
|
| 195 |
+
padding="max_length",
|
| 196 |
+
truncation=True,
|
| 197 |
+
max_length=40,
|
| 198 |
+
return_tensors="pt",
|
| 199 |
+
).to(image.device)
|
| 200 |
+
encoder_input_ids = tag_input_tokenzier.input_ids
|
| 201 |
+
encoder_input_ids[:, 0] = self.tokenizer.enc_token_id
|
| 202 |
+
|
| 203 |
+
# feed the input tags into the image-tag interaction encoder to interact with the image embeddings
|
| 204 |
+
output_tagembedding = self.tag_encoder(
|
| 205 |
+
encoder_input_ids,
|
| 206 |
+
attention_mask=tag_input_tokenzier.attention_mask,
|
| 207 |
+
encoder_hidden_states=image_embeds,
|
| 208 |
+
encoder_attention_mask=image_atts,
|
| 209 |
+
return_dict=True,
|
| 210 |
+
)
|
| 211 |
+
|
| 212 |
+
# prompt trick for better captioning, following BLIP
|
| 213 |
+
prompt = [self.prompt] * image.size(0)
|
| 214 |
+
input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids.to(
|
| 215 |
+
image.device
|
| 216 |
+
)
|
| 217 |
+
input_ids[:, 0] = self.tokenizer.bos_token_id
|
| 218 |
+
input_ids = input_ids[:, :-1]
|
| 219 |
+
|
| 220 |
+
if sample:
|
| 221 |
+
# nucleus sampling
|
| 222 |
+
model_kwargs = {
|
| 223 |
+
"encoder_hidden_states": output_tagembedding.last_hidden_state,
|
| 224 |
+
"encoder_attention_mask": None,
|
| 225 |
+
}
|
| 226 |
+
outputs = self.text_decoder.generate(
|
| 227 |
+
input_ids=input_ids,
|
| 228 |
+
max_length=max_length,
|
| 229 |
+
min_length=min_length,
|
| 230 |
+
do_sample=True,
|
| 231 |
+
top_p=top_p,
|
| 232 |
+
num_return_sequences=1,
|
| 233 |
+
eos_token_id=self.tokenizer.sep_token_id,
|
| 234 |
+
pad_token_id=self.tokenizer.pad_token_id,
|
| 235 |
+
repetition_penalty=1.1,
|
| 236 |
+
**model_kwargs,
|
| 237 |
+
)
|
| 238 |
+
else:
|
| 239 |
+
# beam search (default)
|
| 240 |
+
model_kwargs = {
|
| 241 |
+
"encoder_hidden_states": output_tagembedding.last_hidden_state,
|
| 242 |
+
"encoder_attention_mask": None,
|
| 243 |
+
}
|
| 244 |
+
outputs = self.text_decoder.generate(
|
| 245 |
+
input_ids=input_ids,
|
| 246 |
+
max_length=max_length,
|
| 247 |
+
min_length=min_length,
|
| 248 |
+
num_beams=num_beams,
|
| 249 |
+
eos_token_id=self.tokenizer.sep_token_id,
|
| 250 |
+
pad_token_id=self.tokenizer.pad_token_id,
|
| 251 |
+
repetition_penalty=repetition_penalty,
|
| 252 |
+
**model_kwargs,
|
| 253 |
+
)
|
| 254 |
+
|
| 255 |
+
captions = []
|
| 256 |
+
for output in outputs:
|
| 257 |
+
caption = self.tokenizer.decode(output, skip_special_tokens=True)
|
| 258 |
+
captions.append(caption[len(self.prompt) :])
|
| 259 |
+
if return_tag_predict:
|
| 260 |
+
return captions, tag_output
|
| 261 |
+
return captions
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# load pretrained model parameters
|
| 265 |
+
def tag2text_caption(pretrained="", **kwargs):
|
| 266 |
+
model = Tag2Text_Caption(**kwargs)
|
| 267 |
+
if pretrained:
|
| 268 |
+
if kwargs["vit"] == "swin_b":
|
| 269 |
+
model, msg = load_checkpoint_swinbase(model, pretrained, kwargs)
|
| 270 |
+
else:
|
| 271 |
+
model, msg = load_checkpoint(model, pretrained)
|
| 272 |
+
print("vit:", kwargs["vit"])
|
| 273 |
+
print("msg", msg)
|
| 274 |
+
return model
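A hedged usage sketch (not in the diff); the checkpoint path is a placeholder and the preprocessing statistics are an assumption, not something this commit defines:

import torch
from PIL import Image
from torchvision import transforms

transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    # illustrative ImageNet statistics; the repo's own inference code may use different values
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])
model = tag2text_caption(pretrained="path/to/tag2text_checkpoint.pth", image_size=384, vit="swin_b")
model.eval()
image = transform(Image.open("examples/horses.jpg").convert("RGB")).unsqueeze(0)
with torch.no_grad():
    captions, tags = model.generate(image, return_tag_predict=True)
print(tags[0], "->", captions[0])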
|
tag2text/models/utils.py
ADDED
|
@@ -0,0 +1,241 @@
| 1 |
+
import json
|
| 2 |
+
import math
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import List
|
| 6 |
+
from urllib.parse import urlparse
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
from models.swin_transformer import interpolate_relative_pos_embed
|
| 10 |
+
from models.vit import interpolate_pos_embed
|
| 11 |
+
from timm.models.hub import download_cached_file
|
| 12 |
+
from torch import nn
|
| 13 |
+
from transformers import BertTokenizer
|
| 14 |
+
|
| 15 |
+
CONFIG_PATH = Path(__file__).resolve().parents[1]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def read_json(rpath):
|
| 19 |
+
with open(rpath) as f:
|
| 20 |
+
return json.load(f)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def tie_encoder_decoder_weights(
|
| 24 |
+
encoder: nn.Module, decoder: nn.Module, base_model_prefix: str, skip_key: str
|
| 25 |
+
):
|
| 26 |
+
uninitialized_encoder_weights: List[str] = []
|
| 27 |
+
if decoder.__class__ != encoder.__class__:
|
| 28 |
+
print(  # note: no logger is defined in this module, so print the warning instead
|
| 29 |
+
f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized."
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
def tie_encoder_to_decoder_recursively(
|
| 33 |
+
decoder_pointer: nn.Module,
|
| 34 |
+
encoder_pointer: nn.Module,
|
| 35 |
+
module_name: str,
|
| 36 |
+
uninitialized_encoder_weights: List[str],
|
| 37 |
+
skip_key: str,
|
| 38 |
+
depth=0,
|
| 39 |
+
):
|
| 40 |
+
assert isinstance(decoder_pointer, nn.Module) and isinstance(
|
| 41 |
+
encoder_pointer, nn.Module
|
| 42 |
+
), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
|
| 43 |
+
if hasattr(decoder_pointer, "weight") and skip_key not in module_name:
|
| 44 |
+
assert hasattr(encoder_pointer, "weight")
|
| 45 |
+
encoder_pointer.weight = decoder_pointer.weight
|
| 46 |
+
if hasattr(decoder_pointer, "bias"):
|
| 47 |
+
assert hasattr(encoder_pointer, "bias")
|
| 48 |
+
encoder_pointer.bias = decoder_pointer.bias
|
| 49 |
+
print(module_name + " is tied")
|
| 50 |
+
return
|
| 51 |
+
|
| 52 |
+
encoder_modules = encoder_pointer._modules
|
| 53 |
+
decoder_modules = decoder_pointer._modules
|
| 54 |
+
if len(decoder_modules) > 0:
|
| 55 |
+
assert (
|
| 56 |
+
len(encoder_modules) > 0
|
| 57 |
+
), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}"
|
| 58 |
+
|
| 59 |
+
all_encoder_weights = {
|
| 60 |
+
module_name + "/" + sub_name for sub_name in encoder_modules.keys()
|
| 61 |
+
}
|
| 62 |
+
encoder_layer_pos = 0
|
| 63 |
+
for name, module in decoder_modules.items():
|
| 64 |
+
if name.isdigit():
|
| 65 |
+
encoder_name = str(int(name) + encoder_layer_pos)
|
| 66 |
+
decoder_name = name
|
| 67 |
+
if not isinstance(
|
| 68 |
+
decoder_modules[decoder_name],
|
| 69 |
+
type(encoder_modules[encoder_name]),
|
| 70 |
+
) and len(encoder_modules) != len(decoder_modules):
|
| 71 |
+
# this can happen if the name corresponds to the position in a ModuleList of layers
|
| 72 |
+
# in this case the decoder has added a cross-attention that the encoder does not have
|
| 73 |
+
# thus skip this step and subtract one layer pos from encoder
|
| 74 |
+
encoder_layer_pos -= 1
|
| 75 |
+
continue
|
| 76 |
+
elif name not in encoder_modules:
|
| 77 |
+
continue
|
| 78 |
+
elif depth > 500:
|
| 79 |
+
raise ValueError(
|
| 80 |
+
"Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model."
|
| 81 |
+
)
|
| 82 |
+
else:
|
| 83 |
+
decoder_name = encoder_name = name
|
| 84 |
+
tie_encoder_to_decoder_recursively(
|
| 85 |
+
decoder_modules[decoder_name],
|
| 86 |
+
encoder_modules[encoder_name],
|
| 87 |
+
module_name + "/" + name,
|
| 88 |
+
uninitialized_encoder_weights,
|
| 89 |
+
skip_key,
|
| 90 |
+
depth=depth + 1,
|
| 91 |
+
)
|
| 92 |
+
all_encoder_weights.remove(module_name + "/" + encoder_name)
|
| 93 |
+
|
| 94 |
+
uninitialized_encoder_weights += list(all_encoder_weights)
|
| 95 |
+
|
| 96 |
+
# tie weights recursively
|
| 97 |
+
tie_encoder_to_decoder_recursively(
|
| 98 |
+
decoder, encoder, base_model_prefix, uninitialized_encoder_weights, skip_key
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class GroupWiseLinear(nn.Module):
|
| 103 |
+
# could be changed to:
|
| 104 |
+
# output = torch.einsum('ijk,zjk->ij', x, self.W)
|
| 105 |
+
# or output = torch.einsum('ijk,jk->ij', x, self.W[0])
|
| 106 |
+
def __init__(self, num_class, hidden_dim, bias=True):
|
| 107 |
+
super().__init__()
|
| 108 |
+
self.num_class = num_class
|
| 109 |
+
self.hidden_dim = hidden_dim
|
| 110 |
+
self.bias = bias
|
| 111 |
+
|
| 112 |
+
self.W = nn.Parameter(torch.Tensor(1, num_class, hidden_dim))
|
| 113 |
+
if bias:
|
| 114 |
+
self.b = nn.Parameter(torch.Tensor(1, num_class))
|
| 115 |
+
self.reset_parameters()
|
| 116 |
+
|
| 117 |
+
def reset_parameters(self):
|
| 118 |
+
stdv = 1.0 / math.sqrt(self.W.size(2))
|
| 119 |
+
for i in range(self.num_class):
|
| 120 |
+
self.W[0][i].data.uniform_(-stdv, stdv)
|
| 121 |
+
if self.bias:
|
| 122 |
+
for i in range(self.num_class):
|
| 123 |
+
self.b[0][i].data.uniform_(-stdv, stdv)
|
| 124 |
+
|
| 125 |
+
def forward(self, x):
|
| 126 |
+
# x: B,K,d
|
| 127 |
+
x = (self.W * x).sum(-1)
|
| 128 |
+
if self.bias:
|
| 129 |
+
x = x + self.b
|
| 130 |
+
return x
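A quick check (not in the diff) of the einsum equivalence mentioned in the class comment above, with illustrative sizes:

import torch

layer = GroupWiseLinear(num_class=5, hidden_dim=8, bias=True)
x = torch.randn(2, 5, 8)  # B, K, d
out_forward = layer(x)  # (W * x).sum(-1) + b
out_einsum = torch.einsum("ijk,zjk->ij", x, layer.W) + layer.b
print(torch.allclose(out_forward, out_einsum))  # True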
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def init_tokenizer():
|
| 134 |
+
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
| 135 |
+
tokenizer.add_special_tokens({"bos_token": "[DEC]"})
|
| 136 |
+
tokenizer.add_special_tokens({"additional_special_tokens": ["[ENC]"]})
|
| 137 |
+
tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0]
|
| 138 |
+
return tokenizer
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def create_vit(
|
| 142 |
+
vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop_path_rate=0
|
| 143 |
+
):
|
| 144 |
+
assert vit in ["base", "large"], "vit parameter must be base or large"
|
| 145 |
+
if vit == "base":
|
| 146 |
+
vision_width = 768
|
| 147 |
+
visual_encoder = VisionTransformer(
|
| 148 |
+
img_size=image_size,
|
| 149 |
+
patch_size=16,
|
| 150 |
+
embed_dim=vision_width,
|
| 151 |
+
depth=12,
|
| 152 |
+
num_heads=12,
|
| 153 |
+
use_grad_checkpointing=use_grad_checkpointing,
|
| 154 |
+
ckpt_layer=ckpt_layer,
|
| 155 |
+
drop_path_rate=0 or drop_path_rate,
|
| 156 |
+
)
|
| 157 |
+
elif vit == "large":
|
| 158 |
+
vision_width = 1024
|
| 159 |
+
visual_encoder = VisionTransformer(
|
| 160 |
+
img_size=image_size,
|
| 161 |
+
patch_size=16,
|
| 162 |
+
embed_dim=vision_width,
|
| 163 |
+
depth=24,
|
| 164 |
+
num_heads=16,
|
| 165 |
+
use_grad_checkpointing=use_grad_checkpointing,
|
| 166 |
+
ckpt_layer=ckpt_layer,
|
| 167 |
+
drop_path_rate=0.1 or drop_path_rate,
|
| 168 |
+
)
|
| 169 |
+
return visual_encoder, vision_width
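A hedged usage sketch (not in the diff): create_vit returns the encoder together with its width so callers can size the text-side modules to match; the input size is an illustrative assumption:

import torch

encoder, width = create_vit("base", image_size=224)
print(width)  # 768
print(encoder(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 197, 768]) -> 196 patches + 1 cls token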
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def is_url(url_or_filename):
|
| 173 |
+
parsed = urlparse(url_or_filename)
|
| 174 |
+
return parsed.scheme in ("http", "https")
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def load_checkpoint(model, url_or_filename):
|
| 178 |
+
if is_url(url_or_filename):
|
| 179 |
+
cached_file = download_cached_file(
|
| 180 |
+
url_or_filename, check_hash=False, progress=True
|
| 181 |
+
)
|
| 182 |
+
checkpoint = torch.load(cached_file, map_location="cpu")
|
| 183 |
+
elif os.path.isfile(url_or_filename):
|
| 184 |
+
checkpoint = torch.load(url_or_filename, map_location="cpu")
|
| 185 |
+
else:
|
| 186 |
+
raise RuntimeError("checkpoint url or path is invalid")
|
| 187 |
+
|
| 188 |
+
state_dict = checkpoint["model"]
|
| 189 |
+
|
| 190 |
+
state_dict["visual_encoder.pos_embed"] = interpolate_pos_embed(
|
| 191 |
+
state_dict["visual_encoder.pos_embed"], model.visual_encoder
|
| 192 |
+
)
|
| 193 |
+
if "visual_encoder_m.pos_embed" in model.state_dict().keys():
|
| 194 |
+
state_dict["visual_encoder_m.pos_embed"] = interpolate_pos_embed(
|
| 195 |
+
state_dict["visual_encoder_m.pos_embed"], model.visual_encoder_m
|
| 196 |
+
)
|
| 197 |
+
for key in model.state_dict().keys():
|
| 198 |
+
if key in state_dict.keys():
|
| 199 |
+
if state_dict[key].shape != model.state_dict()[key].shape:
|
| 200 |
+
del state_dict[key]
|
| 201 |
+
|
| 202 |
+
msg = model.load_state_dict(state_dict, strict=False)
|
| 203 |
+
print("load checkpoint from %s" % url_or_filename)
|
| 204 |
+
return model, msg
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def load_checkpoint_swinbase(model, url_or_filename, kwargs):
|
| 208 |
+
if kwargs["image_size"] == 224:
|
| 209 |
+
vision_config_path = f"{CONFIG_PATH}/configs/swin/config_swinB_224.json"
|
| 210 |
+
elif kwargs["image_size"] == 384:
|
| 211 |
+
vision_config_path = f"{CONFIG_PATH}/configs/swin/config_swinB_384.json"
|
| 212 |
+
window_size = read_json(vision_config_path)["window_size"]
|
| 213 |
+
print("--------------")
|
| 214 |
+
print(url_or_filename)
|
| 215 |
+
print("--------------")
|
| 216 |
+
if is_url(url_or_filename):
|
| 217 |
+
cached_file = download_cached_file(
|
| 218 |
+
url_or_filename, check_hash=False, progress=True
|
| 219 |
+
)
|
| 220 |
+
checkpoint = torch.load(cached_file, map_location="cpu")
|
| 221 |
+
elif os.path.isfile(url_or_filename):
|
| 222 |
+
checkpoint = torch.load(url_or_filename, map_location="cpu")
|
| 223 |
+
else:
|
| 224 |
+
raise RuntimeError("checkpoint url or path is invalid")
|
| 225 |
+
|
| 226 |
+
state_dict = checkpoint["model"]
|
| 227 |
+
|
| 228 |
+
for k in list(state_dict.keys()):
|
| 229 |
+
if "relative_position_bias_table" in k:
|
| 230 |
+
dst_num_pos = (2 * window_size - 1) ** 2
|
| 231 |
+
state_dict[k] = interpolate_relative_pos_embed(
|
| 232 |
+
state_dict[k], dst_num_pos, param_name=k
|
| 233 |
+
)
|
| 234 |
+
elif ("relative_position_index" in k) or ("attn_mask" in k):
|
| 235 |
+
del state_dict[k]
|
| 236 |
+
elif "vision_multi" in k:
|
| 237 |
+
state_dict[k.replace("vision_multi", "tagging_head")] = state_dict.pop(k)
|
| 238 |
+
|
| 239 |
+
msg = model.load_state_dict(state_dict, strict=False)
|
| 240 |
+
print("load checkpoint from %s" % url_or_filename)
|
| 241 |
+
return model, msg
|
tag2text/models/vit.py
ADDED
|
@@ -0,0 +1,430 @@
| 1 |
+
"""
|
| 2 |
+
* Copyright (c) 2022, salesforce.com, inc.
|
| 3 |
+
* All rights reserved.
|
| 4 |
+
* SPDX-License-Identifier: BSD-3-Clause
|
| 5 |
+
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
|
| 6 |
+
* By Junnan Li
|
| 7 |
+
* Based on timm code base
|
| 8 |
+
* https://github.com/rwightman/pytorch-image-models/tree/master/timm
|
| 9 |
+
"""
|
| 10 |
+
from functools import partial
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torch.nn as nn
|
| 14 |
+
import torch.nn.functional as F
|
| 15 |
+
from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper
|
| 16 |
+
from timm.models.helpers import adapt_input_conv
|
| 17 |
+
from timm.models.helpers import named_apply
|
| 18 |
+
from timm.models.layers import DropPath
|
| 19 |
+
from timm.models.layers import trunc_normal_
|
| 20 |
+
from timm.models.registry import register_model
|
| 21 |
+
from timm.models.vision_transformer import _cfg
|
| 22 |
+
from timm.models.vision_transformer import PatchEmbed
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class Mlp(nn.Module):
|
| 26 |
+
"""MLP as used in Vision Transformer, MLP-Mixer and related networks."""
|
| 27 |
+
|
| 28 |
+
def __init__(
|
| 29 |
+
self,
|
| 30 |
+
in_features,
|
| 31 |
+
hidden_features=None,
|
| 32 |
+
out_features=None,
|
| 33 |
+
act_layer=nn.GELU,
|
| 34 |
+
drop=0.0,
|
| 35 |
+
):
|
| 36 |
+
super().__init__()
|
| 37 |
+
out_features = out_features or in_features
|
| 38 |
+
hidden_features = hidden_features or in_features
|
| 39 |
+
self.fc1 = nn.Linear(in_features, hidden_features)
|
| 40 |
+
self.act = act_layer()
|
| 41 |
+
self.fc2 = nn.Linear(hidden_features, out_features)
|
| 42 |
+
self.drop = nn.Dropout(drop)
|
| 43 |
+
|
| 44 |
+
def forward(self, x):
|
| 45 |
+
x = self.fc1(x)
|
| 46 |
+
x = self.act(x)
|
| 47 |
+
x = self.drop(x)
|
| 48 |
+
x = self.fc2(x)
|
| 49 |
+
x = self.drop(x)
|
| 50 |
+
return x
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class Attention(nn.Module):
|
| 54 |
+
def __init__(
|
| 55 |
+
self,
|
| 56 |
+
dim,
|
| 57 |
+
num_heads=8,
|
| 58 |
+
qkv_bias=False,
|
| 59 |
+
qk_scale=None,
|
| 60 |
+
attn_drop=0.0,
|
| 61 |
+
proj_drop=0.0,
|
| 62 |
+
):
|
| 63 |
+
super().__init__()
|
| 64 |
+
self.num_heads = num_heads
|
| 65 |
+
head_dim = dim // num_heads
|
| 66 |
+
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
|
| 67 |
+
self.scale = qk_scale or head_dim**-0.5
|
| 68 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 69 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
| 70 |
+
self.proj = nn.Linear(dim, dim)
|
| 71 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
| 72 |
+
self.attn_gradients = None
|
| 73 |
+
self.attention_map = None
|
| 74 |
+
|
| 75 |
+
def save_attn_gradients(self, attn_gradients):
|
| 76 |
+
self.attn_gradients = attn_gradients
|
| 77 |
+
|
| 78 |
+
def get_attn_gradients(self):
|
| 79 |
+
return self.attn_gradients
|
| 80 |
+
|
| 81 |
+
def save_attention_map(self, attention_map):
|
| 82 |
+
self.attention_map = attention_map
|
| 83 |
+
|
| 84 |
+
def get_attention_map(self):
|
| 85 |
+
return self.attention_map
|
| 86 |
+
|
| 87 |
+
def forward(self, x, register_hook=False):
|
| 88 |
+
B, N, C = x.shape
|
| 89 |
+
qkv = (
|
| 90 |
+
self.qkv(x)
|
| 91 |
+
.reshape(B, N, 3, self.num_heads, C // self.num_heads)
|
| 92 |
+
.permute(2, 0, 3, 1, 4)
|
| 93 |
+
)
|
| 94 |
+
q, k, v = (
|
| 95 |
+
qkv[0],
|
| 96 |
+
qkv[1],
|
| 97 |
+
qkv[2],
|
| 98 |
+
) # make torchscript happy (cannot use tensor as tuple)
|
| 99 |
+
|
| 100 |
+
attn = (q @ k.transpose(-2, -1)) * self.scale
|
| 101 |
+
attn = attn.softmax(dim=-1)
|
| 102 |
+
attn = self.attn_drop(attn)
|
| 103 |
+
|
| 104 |
+
if register_hook:
|
| 105 |
+
self.save_attention_map(attn)
|
| 106 |
+
attn.register_hook(self.save_attn_gradients)
|
| 107 |
+
|
| 108 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
| 109 |
+
x = self.proj(x)
|
| 110 |
+
x = self.proj_drop(x)
|
| 111 |
+
return x
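A hedged shape check (not in the diff) for the attention forward above; dimensions are illustrative ViT-Base values:

import torch

attn = Attention(dim=768, num_heads=12, qkv_bias=True)
x = torch.randn(2, 197, 768)  # B, N, C (196 patches + 1 cls token)
print(attn(x).shape)  # torch.Size([2, 197, 768]); per-head dim = 768 // 12 = 64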
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class Block(nn.Module):
|
| 115 |
+
def __init__(
|
| 116 |
+
self,
|
| 117 |
+
dim,
|
| 118 |
+
num_heads,
|
| 119 |
+
mlp_ratio=4.0,
|
| 120 |
+
qkv_bias=False,
|
| 121 |
+
qk_scale=None,
|
| 122 |
+
drop=0.0,
|
| 123 |
+
attn_drop=0.0,
|
| 124 |
+
drop_path=0.0,
|
| 125 |
+
act_layer=nn.GELU,
|
| 126 |
+
norm_layer=nn.LayerNorm,
|
| 127 |
+
use_grad_checkpointing=False,
|
| 128 |
+
):
|
| 129 |
+
super().__init__()
|
| 130 |
+
self.norm1 = norm_layer(dim)
|
| 131 |
+
self.attn = Attention(
|
| 132 |
+
dim,
|
| 133 |
+
num_heads=num_heads,
|
| 134 |
+
qkv_bias=qkv_bias,
|
| 135 |
+
qk_scale=qk_scale,
|
| 136 |
+
attn_drop=attn_drop,
|
| 137 |
+
proj_drop=drop,
|
| 138 |
+
)
|
| 139 |
+
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
|
| 140 |
+
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
| 141 |
+
self.norm2 = norm_layer(dim)
|
| 142 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
| 143 |
+
self.mlp = Mlp(
|
| 144 |
+
in_features=dim,
|
| 145 |
+
hidden_features=mlp_hidden_dim,
|
| 146 |
+
act_layer=act_layer,
|
| 147 |
+
drop=drop,
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
if use_grad_checkpointing:
|
| 151 |
+
self.attn = checkpoint_wrapper(self.attn)
|
| 152 |
+
self.mlp = checkpoint_wrapper(self.mlp)
|
| 153 |
+
|
| 154 |
+
def forward(self, x, register_hook=False):
|
| 155 |
+
x = x + self.drop_path(self.attn(self.norm1(x), register_hook=register_hook))
|
| 156 |
+
x = x + self.drop_path(self.mlp(self.norm2(x)))
|
| 157 |
+
return x
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
class VisionTransformer(nn.Module):
|
| 161 |
+
"""Vision Transformer
|
| 162 |
+
A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
|
| 163 |
+
https://arxiv.org/abs/2010.11929
|
| 164 |
+
"""
|
| 165 |
+
|
| 166 |
+
def __init__(
|
| 167 |
+
self,
|
| 168 |
+
img_size=224,
|
| 169 |
+
patch_size=16,
|
| 170 |
+
in_chans=3,
|
| 171 |
+
num_classes=1000,
|
| 172 |
+
embed_dim=768,
|
| 173 |
+
depth=12,
|
| 174 |
+
num_heads=12,
|
| 175 |
+
mlp_ratio=4.0,
|
| 176 |
+
qkv_bias=True,
|
| 177 |
+
qk_scale=None,
|
| 178 |
+
representation_size=None,
|
| 179 |
+
drop_rate=0.0,
|
| 180 |
+
attn_drop_rate=0.0,
|
| 181 |
+
drop_path_rate=0.0,
|
| 182 |
+
norm_layer=None,
|
| 183 |
+
use_grad_checkpointing=False,
|
| 184 |
+
ckpt_layer=0,
|
| 185 |
+
):
|
| 186 |
+
"""
|
| 187 |
+
Args:
|
| 188 |
+
img_size (int, tuple): input image size
|
| 189 |
+
patch_size (int, tuple): patch size
|
| 190 |
+
in_chans (int): number of input channels
|
| 191 |
+
num_classes (int): number of classes for classification head
|
| 192 |
+
embed_dim (int): embedding dimension
|
| 193 |
+
depth (int): depth of transformer
|
| 194 |
+
num_heads (int): number of attention heads
|
| 195 |
+
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
|
| 196 |
+
qkv_bias (bool): enable bias for qkv if True
|
| 197 |
+
qk_scale (float): override default qk scale of head_dim ** -0.5 if set
|
| 198 |
+
representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
|
| 199 |
+
drop_rate (float): dropout rate
|
| 200 |
+
attn_drop_rate (float): attention dropout rate
|
| 201 |
+
drop_path_rate (float): stochastic depth rate
|
| 202 |
+
norm_layer: (nn.Module): normalization layer
|
| 203 |
+
"""
|
| 204 |
+
super().__init__()
|
| 205 |
+
self.num_features = (
|
| 206 |
+
self.embed_dim
|
| 207 |
+
) = embed_dim # num_features for consistency with other models
|
| 208 |
+
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
|
| 209 |
+
|
| 210 |
+
self.patch_embed = PatchEmbed(
|
| 211 |
+
img_size=img_size,
|
| 212 |
+
patch_size=patch_size,
|
| 213 |
+
in_chans=in_chans,
|
| 214 |
+
embed_dim=embed_dim,
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
num_patches = self.patch_embed.num_patches
|
| 218 |
+
|
| 219 |
+
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
| 220 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
|
| 221 |
+
self.pos_drop = nn.Dropout(p=drop_rate)
|
| 222 |
+
|
| 223 |
+
dpr = [
|
| 224 |
+
x.item() for x in torch.linspace(0, drop_path_rate, depth)
|
| 225 |
+
] # stochastic depth decay rule
|
| 226 |
+
self.blocks = nn.ModuleList(
|
| 227 |
+
[
|
| 228 |
+
Block(
|
| 229 |
+
dim=embed_dim,
|
| 230 |
+
num_heads=num_heads,
|
| 231 |
+
mlp_ratio=mlp_ratio,
|
| 232 |
+
qkv_bias=qkv_bias,
|
| 233 |
+
qk_scale=qk_scale,
|
| 234 |
+
drop=drop_rate,
|
| 235 |
+
attn_drop=attn_drop_rate,
|
| 236 |
+
drop_path=dpr[i],
|
| 237 |
+
norm_layer=norm_layer,
|
| 238 |
+
use_grad_checkpointing=(
|
| 239 |
+
use_grad_checkpointing and i >= depth - ckpt_layer
|
| 240 |
+
),
|
| 241 |
+
)
|
| 242 |
+
for i in range(depth)
|
| 243 |
+
]
|
| 244 |
+
)
|
| 245 |
+
self.norm = norm_layer(embed_dim)
|
| 246 |
+
|
| 247 |
+
trunc_normal_(self.pos_embed, std=0.02)
|
| 248 |
+
trunc_normal_(self.cls_token, std=0.02)
|
| 249 |
+
self.apply(self._init_weights)
|
| 250 |
+
|
| 251 |
+
def _init_weights(self, m):
|
| 252 |
+
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {"pos_embed", "cls_token"}

    def forward(self, x, register_blk=-1):
        B = x.shape[0]
        x = self.patch_embed(x)

        cls_tokens = self.cls_token.expand(
            B, -1, -1
        )  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)

        x = x + self.pos_embed[:, : x.size(1), :]
        x = self.pos_drop(x)

        for i, blk in enumerate(self.blocks):
            x = blk(x, register_blk == i)
        x = self.norm(x)

        return x

    @torch.jit.ignore()
    def load_pretrained(self, checkpoint_path, prefix=""):
        _load_weights(self, checkpoint_path, prefix)


@torch.no_grad()
def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ""):
    """Load weights from .npz checkpoints for official Google Brain Flax implementation."""
    import numpy as np

    def _n2p(w, t=True):
        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
            w = w.flatten()
        if t:
            if w.ndim == 4:
                w = w.transpose([3, 2, 0, 1])
            elif w.ndim == 3:
                w = w.transpose([2, 0, 1])
            elif w.ndim == 2:
                w = w.transpose([1, 0])
        return torch.from_numpy(w)

    w = np.load(checkpoint_path)
    if not prefix and "opt/target/embedding/kernel" in w:
        prefix = "opt/target/"

    if hasattr(model.patch_embed, "backbone"):
        # hybrid
        backbone = model.patch_embed.backbone
        stem_only = not hasattr(backbone, "stem")
        stem = backbone if stem_only else backbone.stem
        stem.conv.weight.copy_(
            adapt_input_conv(
                stem.conv.weight.shape[1], _n2p(w[f"{prefix}conv_root/kernel"])
            )
        )
        stem.norm.weight.copy_(_n2p(w[f"{prefix}gn_root/scale"]))
        stem.norm.bias.copy_(_n2p(w[f"{prefix}gn_root/bias"]))
        if not stem_only:
            for i, stage in enumerate(backbone.stages):
                for j, block in enumerate(stage.blocks):
                    bp = f"{prefix}block{i + 1}/unit{j + 1}/"
                    for r in range(3):
                        getattr(block, f"conv{r + 1}").weight.copy_(
                            _n2p(w[f"{bp}conv{r + 1}/kernel"])
                        )
                        getattr(block, f"norm{r + 1}").weight.copy_(
                            _n2p(w[f"{bp}gn{r + 1}/scale"])
                        )
                        getattr(block, f"norm{r + 1}").bias.copy_(
                            _n2p(w[f"{bp}gn{r + 1}/bias"])
                        )
                    if block.downsample is not None:
                        block.downsample.conv.weight.copy_(
                            _n2p(w[f"{bp}conv_proj/kernel"])
                        )
                        block.downsample.norm.weight.copy_(
                            _n2p(w[f"{bp}gn_proj/scale"])
                        )
                        block.downsample.norm.bias.copy_(_n2p(w[f"{bp}gn_proj/bias"]))
        embed_conv_w = _n2p(w[f"{prefix}embedding/kernel"])
    else:
        embed_conv_w = adapt_input_conv(
            model.patch_embed.proj.weight.shape[1], _n2p(w[f"{prefix}embedding/kernel"])
        )
    model.patch_embed.proj.weight.copy_(embed_conv_w)
    model.patch_embed.proj.bias.copy_(_n2p(w[f"{prefix}embedding/bias"]))
    model.cls_token.copy_(_n2p(w[f"{prefix}cls"], t=False))
    pos_embed_w = _n2p(w[f"{prefix}Transformer/posembed_input/pos_embedding"], t=False)
    if pos_embed_w.shape != model.pos_embed.shape:
        pos_embed_w = resize_pos_embed(  # resize pos embedding when different size from pretrained weights
            pos_embed_w,
            model.pos_embed,
            getattr(model, "num_tokens", 1),
            model.patch_embed.grid_size,
        )
    model.pos_embed.copy_(pos_embed_w)
    model.norm.weight.copy_(_n2p(w[f"{prefix}Transformer/encoder_norm/scale"]))
    model.norm.bias.copy_(_n2p(w[f"{prefix}Transformer/encoder_norm/bias"]))
    # if isinstance(model.head, nn.Linear) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
    #     model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
    #     model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
    # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
    #     model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
    #     model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
    for i, block in enumerate(model.blocks.children()):
        block_prefix = f"{prefix}Transformer/encoderblock_{i}/"
        mha_prefix = block_prefix + "MultiHeadDotProductAttention_1/"
        block.norm1.weight.copy_(_n2p(w[f"{block_prefix}LayerNorm_0/scale"]))
        block.norm1.bias.copy_(_n2p(w[f"{block_prefix}LayerNorm_0/bias"]))
        block.attn.qkv.weight.copy_(
            torch.cat(
                [
                    _n2p(w[f"{mha_prefix}{n}/kernel"], t=False).flatten(1).T
                    for n in ("query", "key", "value")
                ]
            )
        )
        block.attn.qkv.bias.copy_(
            torch.cat(
                [
                    _n2p(w[f"{mha_prefix}{n}/bias"], t=False).reshape(-1)
                    for n in ("query", "key", "value")
                ]
            )
        )
        block.attn.proj.weight.copy_(_n2p(w[f"{mha_prefix}out/kernel"]).flatten(1))
        block.attn.proj.bias.copy_(_n2p(w[f"{mha_prefix}out/bias"]))
        for r in range(2):
            getattr(block.mlp, f"fc{r + 1}").weight.copy_(
                _n2p(w[f"{block_prefix}MlpBlock_3/Dense_{r}/kernel"])
            )
            getattr(block.mlp, f"fc{r + 1}").bias.copy_(
                _n2p(w[f"{block_prefix}MlpBlock_3/Dense_{r}/bias"])
            )
        block.norm2.weight.copy_(_n2p(w[f"{block_prefix}LayerNorm_2/scale"]))
        block.norm2.bias.copy_(_n2p(w[f"{block_prefix}LayerNorm_2/bias"]))


def interpolate_pos_embed(pos_embed_checkpoint, visual_encoder):
    # interpolate position embedding
    embedding_size = pos_embed_checkpoint.shape[-1]
    num_patches = visual_encoder.patch_embed.num_patches
    num_extra_tokens = visual_encoder.pos_embed.shape[-2] - num_patches
    # height (== width) for the checkpoint position embedding
    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
    # height (== width) for the new position embedding
    new_size = int(num_patches**0.5)

    if orig_size != new_size:
        # class_token and dist_token are kept unchanged
        extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
        # only the position tokens are interpolated
        pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
        pos_tokens = pos_tokens.reshape(
            -1, orig_size, orig_size, embedding_size
        ).permute(0, 3, 1, 2)
        pos_tokens = torch.nn.functional.interpolate(
            pos_tokens, size=(new_size, new_size), mode="bicubic", align_corners=False
        )
        pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
        new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
        print(
            "reshape position embedding from %d to %d" % (orig_size**2, new_size**2)
        )

        return new_pos_embed
    else:
        return pos_embed_checkpoint
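The file closes with interpolate_pos_embed, which bicubically resizes a checkpoint's patch position embeddings when the target encoder was built for a different input resolution, keeping the class/distillation tokens unchanged. Below is a minimal self-contained sketch of its behaviour; the SimpleNamespace stand-in and the 224px-to-384px shapes are illustrative only and are not part of the committed files.

# Illustrative sketch (not part of the committed files): resizing a 224x224
# checkpoint pos_embed to match an encoder built for 384x384 input.
import types

import torch

# Fake checkpoint pos_embed for a 224x224, patch-16 ViT: 1 cls token + 14*14 patches, dim 768.
ckpt_pos_embed = torch.randn(1, 1 + 14 * 14, 768)

# Stand-in for a visual encoder built for 384x384 input: 1 cls token + 24*24 patches.
visual_encoder = types.SimpleNamespace(
    patch_embed=types.SimpleNamespace(num_patches=24 * 24),
    pos_embed=torch.zeros(1, 1 + 24 * 24, 768),
)

new_pos_embed = interpolate_pos_embed(ckpt_pos_embed, visual_encoder)
print(new_pos_embed.shape)  # torch.Size([1, 577, 768])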
tag2text/requirements.txt
ADDED
@@ -0,0 +1,8 @@
timm==0.4.12
transformers==4.15.0
fairscale==0.4.4
pycocoevalcap
torch
torchvision
Pillow
scipy
utils.py
ADDED
@@ -0,0 +1,263 @@
import random
import sys
from typing import Dict
from typing import List

import numpy as np
import supervision as sv
import torch
import torchvision
import torchvision.transforms as T
from groundingdino.models import build_model
from groundingdino.util.inference import Model as DinoModel
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
from huggingface_hub import hf_hub_download
from PIL import Image
from segment_anything import SamPredictor

# segment anything

sys.path.append("tag2text")

from tag2text.inference import inference as tag2text_inference


def load_model_hf(repo_id, filename, ckpt_config_filename, device="cpu"):
    cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)

    args = SLConfig.fromfile(cache_config_file)
    args.device = device
    model = build_model(args)

    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
    checkpoint = torch.load(cache_file, map_location=device)
    model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    model.eval()
    return model


def download_file_hf(repo_id, filename, cache_dir="./cache"):
    cache_file = hf_hub_download(
        repo_id=repo_id, filename=filename, force_filename=filename, cache_dir=cache_dir
    )
    return cache_file


def transform_image_tag2text(image_pil: Image) -> torch.Tensor:
    transform = T.Compose(
        [
            T.Resize((384, 384)),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image = transform(image_pil)  # 3, h, w
    return image


def show_anns_sam(anns: List[Dict]):
    """Extracts the mask annotations from the Segment Anything model output and plots them.
    https://github.com/facebookresearch/segment-anything.

    Arguments:
        anns (List[Dict]): Segment Anything model output.

    Returns:
        (np.ndarray): Masked image.
        (np.ndarray): annotation encoding from https://github.com/LUSSeg/ImageNet-S
    """
    if len(anns) == 0:
        return
    sorted_anns = sorted(anns, key=(lambda x: x["area"]), reverse=True)
    full_img = None

    # for ann in sorted_anns:
    for i in range(len(sorted_anns)):
        ann = anns[i]
        m = ann["segmentation"]
        if full_img is None:
            full_img = np.zeros((m.shape[0], m.shape[1], 3))
            map = np.zeros((m.shape[0], m.shape[1]), dtype=np.uint16)
        map[m != 0] = i + 1
        color_mask = np.random.random((1, 3)).tolist()[0]
        full_img[m != 0] = color_mask
    full_img = full_img * 255

    # anno encoding from https://github.com/LUSSeg/ImageNet-S
    res = np.zeros((map.shape[0], map.shape[1], 3))
    res[:, :, 0] = map % 256
    res[:, :, 1] = map // 256
    res.astype(np.float32)
    full_img = np.uint8(full_img)
    return full_img, res


def show_anns_sv(detections: sv.Detections):
    """Extracts the mask annotations from the Supervision Detections object.
    https://roboflow.github.io/supervision/detection/core/.

    Arguments:
        detections (sv.Detections): Containing information about the detections.

    Returns:
        (np.ndarray): Masked image.
        (np.ndarray): annotation encoding from https://github.com/LUSSeg/ImageNet-S
    """
    if detections.mask is None:
        return
    full_img = None

    for i in np.flip(np.argsort(detections.area)):
        m = detections.mask[i]
        if full_img is None:
            full_img = np.zeros((m.shape[0], m.shape[1], 3))
            map = np.zeros((m.shape[0], m.shape[1]), dtype=np.uint16)
        map[m != 0] = i + 1
        color_mask = np.random.random((1, 3)).tolist()[0]
        full_img[m != 0] = color_mask
    full_img = full_img * 255

    # anno encoding from https://github.com/LUSSeg/ImageNet-S
    res = np.zeros((map.shape[0], map.shape[1], 3))
    res[:, :, 0] = map % 256
    res[:, :, 1] = map // 256
    res.astype(np.float32)
    full_img = np.uint8(full_img)
    return full_img, res


def generate_tags(tag2text_model, image, specified_tags, device="cpu"):
    """Generate image tags and caption using Tag2Text model.

    Arguments:
        tag2text_model (nn.Module): Tag2Text model to use for prediction.
        image (np.ndarray): The image for calculating. Expects an
            image in HWC uint8 format, with pixel values in [0, 255].
        specified_tags (str): User-specified input tags

    Returns:
        (List[str]): Predicted image tags.
        (str): Predicted image caption
    """
    image = transform_image_tag2text(image).unsqueeze(0).to(device)
    res = tag2text_inference(image, tag2text_model, specified_tags)
    tags = res[0].split(" | ")
    caption = res[2]
    return tags, caption


def detect(
    grounding_dino_model: DinoModel,
    image: np.ndarray,
    caption: str,
    box_threshold: float = 0.3,
    text_threshold: float = 0.25,
    iou_threshold: float = 0.5,
    post_process: bool = True,
):
    """Detect bounding boxes for the given image, using the input caption.

    Arguments:
        grounding_dino_model (DinoModel): The model to use for detection.
        image (np.ndarray): The image for calculating masks. Expects an
            image in HWC uint8 format, with pixel values in [0, 255].
        caption (str): Input caption containing the object names to detect. To detect multiple objects, separate each name with '.', like this: cat . dog . chair
        box_threshold (float): Box confidence threshold
        text_threshold (float): Text confidence threshold
        iou_threshold (float): IOU score threshold for post-processing
        post_process (bool): If True, run the NMS algorithm to remove duplicate segments.

    Returns:
        (sv.Detections): Containing information about the detections in the image.
        (str): Predicted phrases.
        (List[str]): Predicted classes.
    """
    detections, phrases = grounding_dino_model.predict_with_caption(
        image=image,
        caption=caption,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
    )
    classes = list(map(lambda x: x.strip(), caption.split(".")))
    detections.class_id = DinoModel.phrases2classes(phrases=phrases, classes=classes)

    # NMS post process
    if post_process:
        # print(f"Before NMS: {len(detections.xyxy)} boxes")
        nms_idx = (
            torchvision.ops.nms(
                torch.from_numpy(detections.xyxy),
                torch.from_numpy(detections.confidence),
                iou_threshold,
            )
            .numpy()
            .tolist()
        )

        phrases = [phrases[idx] for idx in nms_idx]
        detections.xyxy = detections.xyxy[nms_idx]
        detections.confidence = detections.confidence[nms_idx]
        detections.class_id = detections.class_id[nms_idx]

        # print(f"After NMS: {len(detections.xyxy)} boxes")

    return detections, phrases, classes


def segment(sam_model: SamPredictor, image: np.ndarray, boxes: np.ndarray):
    """Predict masks for the given input boxes, using the currently set image.

    Arguments:
        sam_model (SamPredictor): The model to use for mask prediction.
        image (np.ndarray): The image for calculating masks. Expects an
            image in HWC uint8 format, with pixel values in [0, 255].
        boxes (np.ndarray or None): A Bx4 array given a box prompt to the
            model, in XYXY format.
        return_logits (bool): If true, returns un-thresholded masks logits
            instead of a binary mask.

    Returns:
        (torch.Tensor): The output masks in BxCxHxW format, where C is the
            number of masks, and (H, W) is the original image size.
        (torch.Tensor): An array of shape BxC containing the model's
            predictions for the quality of each mask.
        (torch.Tensor): An array of shape BxCxHxW, where C is the number
            of masks and H=W=256. These low res logits can be passed to
            a subsequent iteration as mask input.
    """
    sam_model.set_image(image)
    transformed_boxes = None
    if boxes is not None:
        boxes = torch.from_numpy(boxes)

        transformed_boxes = sam_model.transform.apply_boxes_torch(
            boxes.to(sam_model.device), image.shape[:2]
        )

    masks, scores, _ = sam_model.predict_torch(
        point_coords=None,
        point_labels=None,
        boxes=transformed_boxes,
        multimask_output=False,
    )
    masks = masks[:, 0, :, :]
    scores = scores[:, 0]
    return masks.cpu().numpy(), scores.cpu().numpy()


def draw_mask(mask, draw, random_color=False):
    if random_color:
        color = (
            random.randint(0, 255),
            random.randint(0, 255),
            random.randint(0, 255),
            153,
        )
    else:
        color = (30, 144, 255, 153)

    nonzero_coords = np.transpose(np.nonzero(mask))

    for coord in nonzero_coords:
        draw.point(coord[::-1], fill=color)
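Taken together, these helpers make up the annotate-anything pipeline: Tag2Text proposes tags and a caption, Grounding DINO grounds the tags as boxes, and SAM turns the boxes into per-instance masks. The following rough sketch shows how they compose; it assumes that tag2text_model, grounding_dino_model (a GroundingDINO Model) and sam_predictor (a SamPredictor) have already been loaded elsewhere (e.g. in app.py), and the image path and variable names are illustrative only.

# Illustrative usage sketch (not part of the committed files); models assumed loaded.
import numpy as np
from PIL import Image

image_pil = Image.open("examples/dog.png").convert("RGB")
image_np = np.asarray(image_pil)

# 1. Tag2Text proposes tags and a caption ("None" means no user-specified tags).
tags, caption = generate_tags(tag2text_model, image_pil, specified_tags="None")

# 2. Grounding DINO grounds the tags as boxes; names are joined with " . " as the prompt.
detections, phrases, classes = detect(
    grounding_dino_model, image_np, caption=" . ".join(tags)
)

# 3. SAM converts the surviving boxes into per-instance masks.
masks, scores = segment(sam_predictor, image_np, boxes=detections.xyxy)
detections.mask = masks

# 4. Render the masks as a colored overlay plus an ImageNet-S style encoding.
overlay, encoding = show_anns_sv(detections)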