diff --git a/.gitignore b/.gitignore
index aa6c95de80d6ab950281ba9474ac52afb51c83a2..3d9069502f0495dc8bbd74cb34625cfadbc39c4d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 annotator/ckpts/**
 result/**
-trash/**
\ No newline at end of file
+trash/**
+data/**
\ No newline at end of file
diff --git a/README.md b/README.md
index 68cbc56a833651545370f4b4613022b2e188e94f..df712169a527e132b46308a70e7efda805d281ae 100644
--- a/README.md
+++ b/README.md
@@ -6,9 +6,9 @@ Our method is tested using cuda12.1, fp16 of accelerator and xformers on a singl
 conda create -n st-modulator python==3.10
 conda activate st-modulator
 
-# Step 2: Install PyTorch and CUDA
+# Step 2: Install PyTorch, CUDA and Xformers
 conda install pytorch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 pytorch-cuda=12.1 -c pytorch -c nvidia
-
+pip install --pre -U xformers==0.0.27
 # Step 3: Install additional dependencies with pip
 pip install -r requirements.txt
 ```
diff --git a/__pycache__/ptp_utils_null_text_inversion.cpython-310.pyc b/__pycache__/ptp_utils_null_text_inversion.cpython-310.pyc
deleted file mode 100644
index 72189c1cb33464c411420948eb04bc8ca21aa46f..0000000000000000000000000000000000000000
Binary files a/__pycache__/ptp_utils_null_text_inversion.cpython-310.pyc and /dev/null differ
diff --git a/__pycache__/ptp_utils_null_text_inversion.cpython-38.pyc b/__pycache__/ptp_utils_null_text_inversion.cpython-38.pyc
deleted file mode 100644
index 1dee2c11be89b6ce6fbbc5af9eb8a9754ad75a22..0000000000000000000000000000000000000000
Binary files a/__pycache__/ptp_utils_null_text_inversion.cpython-38.pyc and /dev/null differ
diff --git a/__pycache__/utils.cpython-310.pyc b/__pycache__/utils.cpython-310.pyc
deleted file mode 100644
index f75bdc530d590d77443e15595e2729e9b963fb50..0000000000000000000000000000000000000000
Binary files a/__pycache__/utils.cpython-310.pyc and /dev/null differ
diff --git a/__pycache__/xformers.cpython-310.pyc b/__pycache__/xformers.cpython-310.pyc
deleted file mode 100644
index da513aa0483e57a23de04c9e727851218968252c..0000000000000000000000000000000000000000
Binary files a/__pycache__/xformers.cpython-310.pyc and /dev/null differ
diff --git a/annotator/__pycache__/util.cpython-310.pyc b/annotator/__pycache__/util.cpython-310.pyc
index 46828fe6843f9bdb563490855849e73236ca06e0..e75cbd35fc16d4395597f9ec7f451eff071b3250 100644
Binary files a/annotator/__pycache__/util.cpython-310.pyc and b/annotator/__pycache__/util.cpython-310.pyc differ
diff --git a/annotator/dwpose/__pycache__/__init__.cpython-310.pyc b/annotator/dwpose/__pycache__/__init__.cpython-310.pyc
index d656faa76f0f3d529f555d00b509aec35f9df47a..a0e3cb20e6acebbc559d348bab498968bd7cd90b 100644
Binary files a/annotator/dwpose/__pycache__/__init__.cpython-310.pyc and b/annotator/dwpose/__pycache__/__init__.cpython-310.pyc differ
diff --git a/annotator/dwpose/__pycache__/onnxdet.cpython-310.pyc b/annotator/dwpose/__pycache__/onnxdet.cpython-310.pyc
index a826180393fc91e2aa959ed6b9de45ab952c1059..10be8e5aa1e3b6068c79ca6fbc25958d9446223a 100644
Binary files a/annotator/dwpose/__pycache__/onnxdet.cpython-310.pyc and b/annotator/dwpose/__pycache__/onnxdet.cpython-310.pyc differ
diff --git a/annotator/dwpose/__pycache__/onnxpose.cpython-310.pyc b/annotator/dwpose/__pycache__/onnxpose.cpython-310.pyc
index 38cdd0a2865ce8802c570028ef13bd0df70bd568..fec3b6c6838c2d0ca26a31659c560a07a53e804f 100644
Binary files a/annotator/dwpose/__pycache__/onnxpose.cpython-310.pyc and b/annotator/dwpose/__pycache__/onnxpose.cpython-310.pyc differ
diff --git
a/annotator/dwpose/__pycache__/util.cpython-310.pyc b/annotator/dwpose/__pycache__/util.cpython-310.pyc index e712199d91b577f54e8ffc92d2c5d42c85094e46..14402dc00c6302f4e3cdbb4d7cae112b7933e2d7 100644 Binary files a/annotator/dwpose/__pycache__/util.cpython-310.pyc and b/annotator/dwpose/__pycache__/util.cpython-310.pyc differ diff --git a/annotator/dwpose/__pycache__/wholebody.cpython-310.pyc b/annotator/dwpose/__pycache__/wholebody.cpython-310.pyc index 4faac40aa29e66e4fc6f5e5775db685fcd08438f..2b4ff40133cb6b15a3efa04f41b07c65405d3416 100644 Binary files a/annotator/dwpose/__pycache__/wholebody.cpython-310.pyc and b/annotator/dwpose/__pycache__/wholebody.cpython-310.pyc differ diff --git a/annotator/midas/__pycache__/__init__.cpython-310.pyc b/annotator/midas/__pycache__/__init__.cpython-310.pyc index 94d1e12de8722702cdab4272d186ca48a515d18c..4493a8bcce078c732d0c29e77309c7085ce5b96e 100644 Binary files a/annotator/midas/__pycache__/__init__.cpython-310.pyc and b/annotator/midas/__pycache__/__init__.cpython-310.pyc differ diff --git a/annotator/midas/__pycache__/api.cpython-310.pyc b/annotator/midas/__pycache__/api.cpython-310.pyc index a24ca8ec7916cbfb1899f3020490d90a6038a180..ca8551f1e68418147651af928176b51760cd7907 100644 Binary files a/annotator/midas/__pycache__/api.cpython-310.pyc and b/annotator/midas/__pycache__/api.cpython-310.pyc differ diff --git a/annotator/midas/midas/__pycache__/__init__.cpython-310.pyc b/annotator/midas/midas/__pycache__/__init__.cpython-310.pyc index 45650853feeb5f28179cde97c9bb9b6fc0f646f7..6c25f54b1097c113f0eb3c30a5214dba3dfb5c08 100644 Binary files a/annotator/midas/midas/__pycache__/__init__.cpython-310.pyc and b/annotator/midas/midas/__pycache__/__init__.cpython-310.pyc differ diff --git a/annotator/midas/midas/__pycache__/base_model.cpython-310.pyc b/annotator/midas/midas/__pycache__/base_model.cpython-310.pyc index 1223c99ae712592d5d21903147bb41fa995b0a0f..b0b9f5e6f88c07010b0fdd121808a1d02d3c563a 100644 Binary files a/annotator/midas/midas/__pycache__/base_model.cpython-310.pyc and b/annotator/midas/midas/__pycache__/base_model.cpython-310.pyc differ diff --git a/annotator/midas/midas/__pycache__/blocks.cpython-310.pyc b/annotator/midas/midas/__pycache__/blocks.cpython-310.pyc index d068a3c5ff6e412fd6c83e3314bc47d27f602ea7..7dd5b1de3b40f8c1253754ece74d73e3e088f906 100644 Binary files a/annotator/midas/midas/__pycache__/blocks.cpython-310.pyc and b/annotator/midas/midas/__pycache__/blocks.cpython-310.pyc differ diff --git a/annotator/midas/midas/__pycache__/dpt_depth.cpython-310.pyc b/annotator/midas/midas/__pycache__/dpt_depth.cpython-310.pyc index f5779c23707e01966d62af4e85283140c9e754c3..58b7d19b22a0277d853fbe3213707fe1b4162186 100644 Binary files a/annotator/midas/midas/__pycache__/dpt_depth.cpython-310.pyc and b/annotator/midas/midas/__pycache__/dpt_depth.cpython-310.pyc differ diff --git a/annotator/midas/midas/__pycache__/midas_net.cpython-310.pyc b/annotator/midas/midas/__pycache__/midas_net.cpython-310.pyc index cd53826e275318803912c2c170b4181d52ff316d..f2f305639e72c6ac6d0d04b67cc612d3abf59051 100644 Binary files a/annotator/midas/midas/__pycache__/midas_net.cpython-310.pyc and b/annotator/midas/midas/__pycache__/midas_net.cpython-310.pyc differ diff --git a/annotator/midas/midas/__pycache__/midas_net_custom.cpython-310.pyc b/annotator/midas/midas/__pycache__/midas_net_custom.cpython-310.pyc index 887e102d2f132dafb7105b090119cfd558bc8498..a0a3a4ad41403fb0fec6a074ac6ea3f5bfe3a622 100644 Binary files 
a/annotator/midas/midas/__pycache__/midas_net_custom.cpython-310.pyc and b/annotator/midas/midas/__pycache__/midas_net_custom.cpython-310.pyc differ diff --git a/annotator/midas/midas/__pycache__/transforms.cpython-310.pyc b/annotator/midas/midas/__pycache__/transforms.cpython-310.pyc index d8c56361e4a9afcb21a661e8e248193d586eda05..d33a0e3f6baa5ac0c30bd481f81f392f8559486b 100644 Binary files a/annotator/midas/midas/__pycache__/transforms.cpython-310.pyc and b/annotator/midas/midas/__pycache__/transforms.cpython-310.pyc differ diff --git a/annotator/midas/midas/__pycache__/vit.cpython-310.pyc b/annotator/midas/midas/__pycache__/vit.cpython-310.pyc index c127a0f38f4ff078a74ec10fb2713129ae14230d..275bc1dd49565e9cfdfb00089b9489734f4252f8 100644 Binary files a/annotator/midas/midas/__pycache__/vit.cpython-310.pyc and b/annotator/midas/midas/__pycache__/vit.cpython-310.pyc differ diff --git a/annotator/openpose/__pycache__/__init__.cpython-310.pyc b/annotator/openpose/__pycache__/__init__.cpython-310.pyc index 35bdd6c415fa270ec4465bb663962d5ff17bda49..288030f3a3a5fae902628a60f07879fe3281d1aa 100644 Binary files a/annotator/openpose/__pycache__/__init__.cpython-310.pyc and b/annotator/openpose/__pycache__/__init__.cpython-310.pyc differ diff --git a/annotator/openpose/__pycache__/body.cpython-310.pyc b/annotator/openpose/__pycache__/body.cpython-310.pyc index b1219368bb73be2b175e82a8fba271937f14013e..7763b1016dab668f8a49e9eadc87898d872efb58 100644 Binary files a/annotator/openpose/__pycache__/body.cpython-310.pyc and b/annotator/openpose/__pycache__/body.cpython-310.pyc differ diff --git a/annotator/openpose/__pycache__/face.cpython-310.pyc b/annotator/openpose/__pycache__/face.cpython-310.pyc index 65d0d2a1ad806d811dbb6cfd2a6a1b8aa5085489..8bba4c5d9534fca597560f644c166a8e898397d3 100644 Binary files a/annotator/openpose/__pycache__/face.cpython-310.pyc and b/annotator/openpose/__pycache__/face.cpython-310.pyc differ diff --git a/annotator/openpose/__pycache__/hand.cpython-310.pyc b/annotator/openpose/__pycache__/hand.cpython-310.pyc index c37c858b9637f3b5b7aab5e773a7ca36a7d0f054..061aa322868ec431891493a1b99eac1484872840 100644 Binary files a/annotator/openpose/__pycache__/hand.cpython-310.pyc and b/annotator/openpose/__pycache__/hand.cpython-310.pyc differ diff --git a/annotator/openpose/__pycache__/model.cpython-310.pyc b/annotator/openpose/__pycache__/model.cpython-310.pyc index 18b4424a74adb5e9e96e7a311603f9aed983a2f8..bdedd627ca8311659086191e90b4d43f2b61f0bf 100644 Binary files a/annotator/openpose/__pycache__/model.cpython-310.pyc and b/annotator/openpose/__pycache__/model.cpython-310.pyc differ diff --git a/annotator/openpose/__pycache__/util.cpython-310.pyc b/annotator/openpose/__pycache__/util.cpython-310.pyc index 6fd671d024ccd9493e5f835ae5f877f6e3ebe8e0..7ec964d9d831042c5fd0de4e506dcef5107486f0 100644 Binary files a/annotator/openpose/__pycache__/util.cpython-310.pyc and b/annotator/openpose/__pycache__/util.cpython-310.pyc differ diff --git a/annotator/zoe/__pycache__/__init__.cpython-310.pyc b/annotator/zoe/__pycache__/__init__.cpython-310.pyc index 31ef210fbed36848f56408097842dfaf90c433f1..694df310a1322d01b136fed92afe8845962b904e 100644 Binary files a/annotator/zoe/__pycache__/__init__.cpython-310.pyc and b/annotator/zoe/__pycache__/__init__.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/data/__init__.py b/annotator/zoe/zoedepth/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f2668792389157609abb2a0846fb620e7d67eb9 --- /dev/null +++ 
b/annotator/zoe/zoedepth/data/__init__.py @@ -0,0 +1,24 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + diff --git a/annotator/zoe/zoedepth/data/data_mono.py b/annotator/zoe/zoedepth/data/data_mono.py new file mode 100644 index 0000000000000000000000000000000000000000..80a8486f239a35331df553f490e213f9bf71e735 --- /dev/null +++ b/annotator/zoe/zoedepth/data/data_mono.py @@ -0,0 +1,573 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# File author: Shariq Farooq Bhat + +# This file is partly inspired from BTS (https://github.com/cleinc/bts/blob/master/pytorch/bts_dataloader.py); author: Jin Han Lee + +import itertools +import os +import random + +import numpy as np +import cv2 +import torch +import torch.nn as nn +import torch.utils.data.distributed +from zoedepth.utils.easydict import EasyDict as edict +from PIL import Image, ImageOps +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + +from zoedepth.utils.config import change_dataset + +from .ddad import get_ddad_loader +from .diml_indoor_test import get_diml_indoor_loader +from .diml_outdoor_test import get_diml_outdoor_loader +from .diode import get_diode_loader +from .hypersim import get_hypersim_loader +from .ibims import get_ibims_loader +from .sun_rgbd_loader import get_sunrgbd_loader +from .vkitti import get_vkitti_loader +from .vkitti2 import get_vkitti2_loader + +from .preprocess import CropParams, get_white_border, get_black_border + + +def _is_pil_image(img): + return isinstance(img, Image.Image) + + +def _is_numpy_image(img): + return isinstance(img, np.ndarray) and (img.ndim in {2, 3}) + + +def preprocessing_transforms(mode, **kwargs): + return transforms.Compose([ + ToTensor(mode=mode, **kwargs) + ]) + + +class DepthDataLoader(object): + def __init__(self, config, mode, device='cpu', transform=None, **kwargs): + """ + Data loader for depth datasets + + Args: + config (dict): Config dictionary. Refer to utils/config.py + mode (str): "train" or "online_eval" + device (str, optional): Device to load the data on. Defaults to 'cpu'. + transform (torchvision.transforms, optional): Transform to apply to the data. Defaults to None. + """ + + self.config = config + + if config.dataset == 'ibims': + self.data = get_ibims_loader(config, batch_size=1, num_workers=1) + return + + if config.dataset == 'sunrgbd': + self.data = get_sunrgbd_loader( + data_dir_root=config.sunrgbd_root, batch_size=1, num_workers=1) + return + + if config.dataset == 'diml_indoor': + self.data = get_diml_indoor_loader( + data_dir_root=config.diml_indoor_root, batch_size=1, num_workers=1) + return + + if config.dataset == 'diml_outdoor': + self.data = get_diml_outdoor_loader( + data_dir_root=config.diml_outdoor_root, batch_size=1, num_workers=1) + return + + if "diode" in config.dataset: + self.data = get_diode_loader( + config[config.dataset+"_root"], batch_size=1, num_workers=1) + return + + if config.dataset == 'hypersim_test': + self.data = get_hypersim_loader( + config.hypersim_test_root, batch_size=1, num_workers=1) + return + + if config.dataset == 'vkitti': + self.data = get_vkitti_loader( + config.vkitti_root, batch_size=1, num_workers=1) + return + + if config.dataset == 'vkitti2': + self.data = get_vkitti2_loader( + config.vkitti2_root, batch_size=1, num_workers=1) + return + + if config.dataset == 'ddad': + self.data = get_ddad_loader(config.ddad_root, resize_shape=( + 352, 1216), batch_size=1, num_workers=1) + return + + img_size = self.config.get("img_size", None) + img_size = img_size if self.config.get( + "do_input_resize", False) else None + + if transform is None: + transform = preprocessing_transforms(mode, size=img_size) + + if mode == 'train': + + Dataset = DataLoadPreprocess + self.training_samples = Dataset( + config, mode, transform=transform, device=device) + + if config.distributed: + self.train_sampler = torch.utils.data.distributed.DistributedSampler( + self.training_samples) + else: + self.train_sampler = None + + self.data 
= DataLoader(self.training_samples, + batch_size=config.batch_size, + shuffle=(self.train_sampler is None), + num_workers=config.workers, + pin_memory=True, + persistent_workers=True, + # prefetch_factor=2, + sampler=self.train_sampler) + + elif mode == 'online_eval': + self.testing_samples = DataLoadPreprocess( + config, mode, transform=transform) + if config.distributed: # redundant. here only for readability and to be more explicit + # Give whole test set to all processes (and report evaluation only on one) regardless + self.eval_sampler = None + else: + self.eval_sampler = None + self.data = DataLoader(self.testing_samples, 1, + shuffle=kwargs.get("shuffle_test", False), + num_workers=1, + pin_memory=False, + sampler=self.eval_sampler) + + elif mode == 'test': + self.testing_samples = DataLoadPreprocess( + config, mode, transform=transform) + self.data = DataLoader(self.testing_samples, + 1, shuffle=False, num_workers=1) + + else: + print( + 'mode should be one of \'train, test, online_eval\'. Got {}'.format(mode)) + + +def repetitive_roundrobin(*iterables): + """ + cycles through iterables but sample wise + first yield first sample from first iterable then first sample from second iterable and so on + then second sample from first iterable then second sample from second iterable and so on + + If one iterable is shorter than the others, it is repeated until all iterables are exhausted + repetitive_roundrobin('ABC', 'D', 'EF') --> A D E B D F C D E + """ + # Repetitive roundrobin + iterables_ = [iter(it) for it in iterables] + exhausted = [False] * len(iterables) + while not all(exhausted): + for i, it in enumerate(iterables_): + try: + yield next(it) + except StopIteration: + exhausted[i] = True + iterables_[i] = itertools.cycle(iterables[i]) + # First elements may get repeated if one iterable is shorter than the others + yield next(iterables_[i]) + + +class RepetitiveRoundRobinDataLoader(object): + def __init__(self, *dataloaders): + self.dataloaders = dataloaders + + def __iter__(self): + return repetitive_roundrobin(*self.dataloaders) + + def __len__(self): + # First samples get repeated, thats why the plus one + return len(self.dataloaders) * (max(len(dl) for dl in self.dataloaders) + 1) + + +class MixedNYUKITTI(object): + def __init__(self, config, mode, device='cpu', **kwargs): + config = edict(config) + config.workers = config.workers // 2 + self.config = config + nyu_conf = change_dataset(edict(config), 'nyu') + kitti_conf = change_dataset(edict(config), 'kitti') + + # make nyu default for testing + self.config = config = nyu_conf + img_size = self.config.get("img_size", None) + img_size = img_size if self.config.get( + "do_input_resize", False) else None + if mode == 'train': + nyu_loader = DepthDataLoader( + nyu_conf, mode, device=device, transform=preprocessing_transforms(mode, size=img_size)).data + kitti_loader = DepthDataLoader( + kitti_conf, mode, device=device, transform=preprocessing_transforms(mode, size=img_size)).data + # It has been changed to repetitive roundrobin + self.data = RepetitiveRoundRobinDataLoader( + nyu_loader, kitti_loader) + else: + self.data = DepthDataLoader(nyu_conf, mode, device=device).data + + +def remove_leading_slash(s): + if s[0] == '/' or s[0] == '\\': + return s[1:] + return s + + +class CachedReader: + def __init__(self, shared_dict=None): + if shared_dict: + self._cache = shared_dict + else: + self._cache = {} + + def open(self, fpath): + im = self._cache.get(fpath, None) + if im is None: + im = self._cache[fpath] = Image.open(fpath) + 
return im + + +class ImReader: + def __init__(self): + pass + + # @cache + def open(self, fpath): + return Image.open(fpath) + + +class DataLoadPreprocess(Dataset): + def __init__(self, config, mode, transform=None, is_for_online_eval=False, **kwargs): + self.config = config + if mode == 'online_eval': + with open(config.filenames_file_eval, 'r') as f: + self.filenames = f.readlines() + else: + with open(config.filenames_file, 'r') as f: + self.filenames = f.readlines() + + self.mode = mode + self.transform = transform + self.to_tensor = ToTensor(mode) + self.is_for_online_eval = is_for_online_eval + if config.use_shared_dict: + self.reader = CachedReader(config.shared_dict) + else: + self.reader = ImReader() + + def postprocess(self, sample): + return sample + + def __getitem__(self, idx): + sample_path = self.filenames[idx] + focal = float(sample_path.split()[2]) + sample = {} + + if self.mode == 'train': + if self.config.dataset == 'kitti' and self.config.use_right and random.random() > 0.5: + image_path = os.path.join( + self.config.data_path, remove_leading_slash(sample_path.split()[3])) + depth_path = os.path.join( + self.config.gt_path, remove_leading_slash(sample_path.split()[4])) + else: + image_path = os.path.join( + self.config.data_path, remove_leading_slash(sample_path.split()[0])) + depth_path = os.path.join( + self.config.gt_path, remove_leading_slash(sample_path.split()[1])) + + image = self.reader.open(image_path) + depth_gt = self.reader.open(depth_path) + w, h = image.size + + if self.config.do_kb_crop: + height = image.height + width = image.width + top_margin = int(height - 352) + left_margin = int((width - 1216) / 2) + depth_gt = depth_gt.crop( + (left_margin, top_margin, left_margin + 1216, top_margin + 352)) + image = image.crop( + (left_margin, top_margin, left_margin + 1216, top_margin + 352)) + + # Avoid blank boundaries due to pixel registration? + # Train images have white border. Test images have black border. 
+ if self.config.dataset == 'nyu' and self.config.avoid_boundary: + # print("Avoiding Blank Boundaries!") + # We just crop and pad again with reflect padding to original size + # original_size = image.size + crop_params = get_white_border(np.array(image, dtype=np.uint8)) + image = image.crop((crop_params.left, crop_params.top, crop_params.right, crop_params.bottom)) + depth_gt = depth_gt.crop((crop_params.left, crop_params.top, crop_params.right, crop_params.bottom)) + + # Use reflect padding to fill the blank + image = np.array(image) + image = np.pad(image, ((crop_params.top, h - crop_params.bottom), (crop_params.left, w - crop_params.right), (0, 0)), mode='reflect') + image = Image.fromarray(image) + + depth_gt = np.array(depth_gt) + depth_gt = np.pad(depth_gt, ((crop_params.top, h - crop_params.bottom), (crop_params.left, w - crop_params.right)), 'constant', constant_values=0) + depth_gt = Image.fromarray(depth_gt) + + + if self.config.do_random_rotate and (self.config.aug): + random_angle = (random.random() - 0.5) * 2 * self.config.degree + image = self.rotate_image(image, random_angle) + depth_gt = self.rotate_image( + depth_gt, random_angle, flag=Image.NEAREST) + + image = np.asarray(image, dtype=np.float32) / 255.0 + depth_gt = np.asarray(depth_gt, dtype=np.float32) + depth_gt = np.expand_dims(depth_gt, axis=2) + + if self.config.dataset == 'nyu': + depth_gt = depth_gt / 1000.0 + else: + depth_gt = depth_gt / 256.0 + + if self.config.aug and (self.config.random_crop): + image, depth_gt = self.random_crop( + image, depth_gt, self.config.input_height, self.config.input_width) + + if self.config.aug and self.config.random_translate: + # print("Random Translation!") + image, depth_gt = self.random_translate(image, depth_gt, self.config.max_translation) + + image, depth_gt = self.train_preprocess(image, depth_gt) + mask = np.logical_and(depth_gt > self.config.min_depth, + depth_gt < self.config.max_depth).squeeze()[None, ...] + sample = {'image': image, 'depth': depth_gt, 'focal': focal, + 'mask': mask, **sample} + + else: + if self.mode == 'online_eval': + data_path = self.config.data_path_eval + else: + data_path = self.config.data_path + + image_path = os.path.join( + data_path, remove_leading_slash(sample_path.split()[0])) + image = np.asarray(self.reader.open(image_path), + dtype=np.float32) / 255.0 + + if self.mode == 'online_eval': + gt_path = self.config.gt_path_eval + depth_path = os.path.join( + gt_path, remove_leading_slash(sample_path.split()[1])) + has_valid_depth = False + try: + depth_gt = self.reader.open(depth_path) + has_valid_depth = True + except IOError: + depth_gt = False + # print('Missing gt for {}'.format(image_path)) + + if has_valid_depth: + depth_gt = np.asarray(depth_gt, dtype=np.float32) + depth_gt = np.expand_dims(depth_gt, axis=2) + if self.config.dataset == 'nyu': + depth_gt = depth_gt / 1000.0 + else: + depth_gt = depth_gt / 256.0 + + mask = np.logical_and( + depth_gt >= self.config.min_depth, depth_gt <= self.config.max_depth).squeeze()[None, ...] 
+ else: + mask = False + + if self.config.do_kb_crop: + height = image.shape[0] + width = image.shape[1] + top_margin = int(height - 352) + left_margin = int((width - 1216) / 2) + image = image[top_margin:top_margin + 352, + left_margin:left_margin + 1216, :] + if self.mode == 'online_eval' and has_valid_depth: + depth_gt = depth_gt[top_margin:top_margin + + 352, left_margin:left_margin + 1216, :] + + if self.mode == 'online_eval': + sample = {'image': image, 'depth': depth_gt, 'focal': focal, 'has_valid_depth': has_valid_depth, + 'image_path': sample_path.split()[0], 'depth_path': sample_path.split()[1], + 'mask': mask} + else: + sample = {'image': image, 'focal': focal} + + if (self.mode == 'train') or ('has_valid_depth' in sample and sample['has_valid_depth']): + mask = np.logical_and(depth_gt > self.config.min_depth, + depth_gt < self.config.max_depth).squeeze()[None, ...] + sample['mask'] = mask + + if self.transform: + sample = self.transform(sample) + + sample = self.postprocess(sample) + sample['dataset'] = self.config.dataset + sample = {**sample, 'image_path': sample_path.split()[0], 'depth_path': sample_path.split()[1]} + + return sample + + def rotate_image(self, image, angle, flag=Image.BILINEAR): + result = image.rotate(angle, resample=flag) + return result + + def random_crop(self, img, depth, height, width): + assert img.shape[0] >= height + assert img.shape[1] >= width + assert img.shape[0] == depth.shape[0] + assert img.shape[1] == depth.shape[1] + x = random.randint(0, img.shape[1] - width) + y = random.randint(0, img.shape[0] - height) + img = img[y:y + height, x:x + width, :] + depth = depth[y:y + height, x:x + width, :] + + return img, depth + + def random_translate(self, img, depth, max_t=20): + assert img.shape[0] == depth.shape[0] + assert img.shape[1] == depth.shape[1] + p = self.config.translate_prob + do_translate = random.random() + if do_translate > p: + return img, depth + x = random.randint(-max_t, max_t) + y = random.randint(-max_t, max_t) + M = np.float32([[1, 0, x], [0, 1, y]]) + # print(img.shape, depth.shape) + img = cv2.warpAffine(img, M, (img.shape[1], img.shape[0])) + depth = cv2.warpAffine(depth, M, (depth.shape[1], depth.shape[0])) + depth = depth.squeeze()[..., None] # add channel dim back. 
Affine warp removes it + # print("after", img.shape, depth.shape) + return img, depth + + def train_preprocess(self, image, depth_gt): + if self.config.aug: + # Random flipping + do_flip = random.random() + if do_flip > 0.5: + image = (image[:, ::-1, :]).copy() + depth_gt = (depth_gt[:, ::-1, :]).copy() + + # Random gamma, brightness, color augmentation + do_augment = random.random() + if do_augment > 0.5: + image = self.augment_image(image) + + return image, depth_gt + + def augment_image(self, image): + # gamma augmentation + gamma = random.uniform(0.9, 1.1) + image_aug = image ** gamma + + # brightness augmentation + if self.config.dataset == 'nyu': + brightness = random.uniform(0.75, 1.25) + else: + brightness = random.uniform(0.9, 1.1) + image_aug = image_aug * brightness + + # color augmentation + colors = np.random.uniform(0.9, 1.1, size=3) + white = np.ones((image.shape[0], image.shape[1])) + color_image = np.stack([white * colors[i] for i in range(3)], axis=2) + image_aug *= color_image + image_aug = np.clip(image_aug, 0, 1) + + return image_aug + + def __len__(self): + return len(self.filenames) + + +class ToTensor(object): + def __init__(self, mode, do_normalize=False, size=None): + self.mode = mode + self.normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) if do_normalize else nn.Identity() + self.size = size + if size is not None: + self.resize = transforms.Resize(size=size) + else: + self.resize = nn.Identity() + + def __call__(self, sample): + image, focal = sample['image'], sample['focal'] + image = self.to_tensor(image) + image = self.normalize(image) + image = self.resize(image) + + if self.mode == 'test': + return {'image': image, 'focal': focal} + + depth = sample['depth'] + if self.mode == 'train': + depth = self.to_tensor(depth) + return {**sample, 'image': image, 'depth': depth, 'focal': focal} + else: + has_valid_depth = sample['has_valid_depth'] + image = self.resize(image) + return {**sample, 'image': image, 'depth': depth, 'focal': focal, 'has_valid_depth': has_valid_depth, + 'image_path': sample['image_path'], 'depth_path': sample['depth_path']} + + def to_tensor(self, pic): + if not (_is_pil_image(pic) or _is_numpy_image(pic)): + raise TypeError( + 'pic should be PIL Image or ndarray. 
Got {}'.format(type(pic))) + + if isinstance(pic, np.ndarray): + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img + + # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float() + else: + return img diff --git a/annotator/zoe/zoedepth/data/ddad.py b/annotator/zoe/zoedepth/data/ddad.py new file mode 100644 index 0000000000000000000000000000000000000000..4bd0492bdec767685d3a21992b4a26e62d002d97 --- /dev/null +++ b/annotator/zoe/zoedepth/data/ddad.py @@ -0,0 +1,117 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# File author: Shariq Farooq Bhat + +import os + +import numpy as np +import torch +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + + +class ToTensor(object): + def __init__(self, resize_shape): + # self.normalize = transforms.Normalize( + # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + self.normalize = lambda x : x + self.resize = transforms.Resize(resize_shape) + + def __call__(self, sample): + image, depth = sample['image'], sample['depth'] + image = self.to_tensor(image) + image = self.normalize(image) + depth = self.to_tensor(depth) + + image = self.resize(image) + + return {'image': image, 'depth': depth, 'dataset': "ddad"} + + def to_tensor(self, pic): + + if isinstance(pic, np.ndarray): + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img + + # # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + + img = img.transpose(0, 1).transpose(0, 2).contiguous() + + if isinstance(img, torch.ByteTensor): + return img.float() + else: + return img + + +class DDAD(Dataset): + def __init__(self, data_dir_root, resize_shape): + import glob + + # image paths are of the form /{outleft, depthmap}/*.png + self.image_files = glob.glob(os.path.join(data_dir_root, '*.png')) + self.depth_files = [r.replace("_rgb.png", "_depth.npy") + for r in self.image_files] + self.transform = ToTensor(resize_shape) + + def __getitem__(self, idx): + + image_path = self.image_files[idx] + depth_path = self.depth_files[idx] + + image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 + depth = np.load(depth_path) # meters + + # depth[depth > 8] = -1 + depth = depth[..., None] + + sample = dict(image=image, depth=depth) + sample = self.transform(sample) + + if idx == 0: + print(sample["image"].shape) + + return sample + + def __len__(self): + return len(self.image_files) + + +def get_ddad_loader(data_dir_root, resize_shape, batch_size=1, **kwargs): + dataset = DDAD(data_dir_root, resize_shape) + return DataLoader(dataset, batch_size, **kwargs) diff --git a/annotator/zoe/zoedepth/data/diml_indoor_test.py b/annotator/zoe/zoedepth/data/diml_indoor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..f720ad9aefaee78ef4ec363dfef0f82ace850a6d --- /dev/null +++ b/annotator/zoe/zoedepth/data/diml_indoor_test.py @@ -0,0 +1,125 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import os + +import numpy as np +import torch +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + + +class ToTensor(object): + def __init__(self): + # self.normalize = transforms.Normalize( + # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + self.normalize = lambda x : x + self.resize = transforms.Resize((480, 640)) + + def __call__(self, sample): + image, depth = sample['image'], sample['depth'] + image = self.to_tensor(image) + image = self.normalize(image) + depth = self.to_tensor(depth) + + image = self.resize(image) + + return {'image': image, 'depth': depth, 'dataset': "diml_indoor"} + + def to_tensor(self, pic): + + if isinstance(pic, np.ndarray): + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img + + # # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float() + else: + return img + + +class DIML_Indoor(Dataset): + def __init__(self, data_dir_root): + import glob + + # image paths are of the form /{HR, LR}//{color, depth_filled}/*.png + self.image_files = glob.glob(os.path.join( + data_dir_root, "LR", '*', 'color', '*.png')) + self.depth_files = [r.replace("color", "depth_filled").replace( + "_c.png", "_depth_filled.png") for r in self.image_files] + self.transform = ToTensor() + + def __getitem__(self, idx): + image_path = self.image_files[idx] + depth_path = self.depth_files[idx] + + image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 + depth = np.asarray(Image.open(depth_path), + dtype='uint16') / 1000.0 # mm to meters + + # print(np.shape(image)) + # print(np.shape(depth)) + + # depth[depth > 8] = -1 + depth = depth[..., None] + + sample = dict(image=image, depth=depth) + + # return sample + sample = self.transform(sample) + + if idx == 0: + print(sample["image"].shape) + + return sample + + def __len__(self): + return len(self.image_files) + + +def get_diml_indoor_loader(data_dir_root, batch_size=1, **kwargs): + dataset = DIML_Indoor(data_dir_root) + return DataLoader(dataset, batch_size, **kwargs) + +# get_diml_indoor_loader(data_dir_root="datasets/diml/indoor/test/HR") +# get_diml_indoor_loader(data_dir_root="datasets/diml/indoor/test/LR") diff --git a/annotator/zoe/zoedepth/data/diml_outdoor_test.py b/annotator/zoe/zoedepth/data/diml_outdoor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..8670b48f5febafb819dac22848ad79ccb5dd5ae4 --- /dev/null +++ 
b/annotator/zoe/zoedepth/data/diml_outdoor_test.py @@ -0,0 +1,114 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import os + +import numpy as np +import torch +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + + +class ToTensor(object): + def __init__(self): + # self.normalize = transforms.Normalize( + # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + self.normalize = lambda x : x + + def __call__(self, sample): + image, depth = sample['image'], sample['depth'] + image = self.to_tensor(image) + image = self.normalize(image) + depth = self.to_tensor(depth) + + return {'image': image, 'depth': depth, 'dataset': "diml_outdoor"} + + def to_tensor(self, pic): + + if isinstance(pic, np.ndarray): + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img + + # # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float() + else: + return img + + +class DIML_Outdoor(Dataset): + def __init__(self, data_dir_root): + import glob + + # image paths are of the form /{outleft, depthmap}/*.png + self.image_files = glob.glob(os.path.join( + data_dir_root, "*", 'outleft', '*.png')) + self.depth_files = [r.replace("outleft", "depthmap") + for r in self.image_files] + self.transform = ToTensor() + + def __getitem__(self, idx): + image_path = self.image_files[idx] + depth_path = self.depth_files[idx] + + image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 + depth = np.asarray(Image.open(depth_path), + dtype='uint16') / 1000.0 # mm to meters + + # depth[depth > 8] = -1 + depth = depth[..., None] + + sample = dict(image=image, depth=depth, dataset="diml_outdoor") + + # return sample + return self.transform(sample) + + def __len__(self): + return len(self.image_files) + + +def get_diml_outdoor_loader(data_dir_root, batch_size=1, **kwargs): + dataset = 
DIML_Outdoor(data_dir_root) + return DataLoader(dataset, batch_size, **kwargs) + +# get_diml_outdoor_loader(data_dir_root="datasets/diml/outdoor/test/HR") +# get_diml_outdoor_loader(data_dir_root="datasets/diml/outdoor/test/LR") diff --git a/annotator/zoe/zoedepth/data/diode.py b/annotator/zoe/zoedepth/data/diode.py new file mode 100644 index 0000000000000000000000000000000000000000..1510c87116b8f70ce2e1428873a8e4da042bee23 --- /dev/null +++ b/annotator/zoe/zoedepth/data/diode.py @@ -0,0 +1,125 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import os + +import numpy as np +import torch +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + + +class ToTensor(object): + def __init__(self): + # self.normalize = transforms.Normalize( + # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + self.normalize = lambda x : x + self.resize = transforms.Resize(480) + + def __call__(self, sample): + image, depth = sample['image'], sample['depth'] + image = self.to_tensor(image) + image = self.normalize(image) + depth = self.to_tensor(depth) + + image = self.resize(image) + + return {'image': image, 'depth': depth, 'dataset': "diode"} + + def to_tensor(self, pic): + + if isinstance(pic, np.ndarray): + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img + + # # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + + img = img.transpose(0, 1).transpose(0, 2).contiguous() + + if isinstance(img, torch.ByteTensor): + return img.float() + else: + return img + + +class DIODE(Dataset): + def __init__(self, data_dir_root): + import glob + + # image paths are of the form /scene_#/scan_#/*.png + self.image_files = glob.glob( + os.path.join(data_dir_root, '*', '*', '*.png')) + self.depth_files = [r.replace(".png", "_depth.npy") + for r in self.image_files] + self.depth_mask_files = [ + r.replace(".png", "_depth_mask.npy") for r in self.image_files] + self.transform = ToTensor() + + def 
__getitem__(self, idx): + image_path = self.image_files[idx] + depth_path = self.depth_files[idx] + depth_mask_path = self.depth_mask_files[idx] + + image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 + depth = np.load(depth_path) # in meters + valid = np.load(depth_mask_path) # binary + + # depth[depth > 8] = -1 + # depth = depth[..., None] + + sample = dict(image=image, depth=depth, valid=valid) + + # return sample + sample = self.transform(sample) + + if idx == 0: + print(sample["image"].shape) + + return sample + + def __len__(self): + return len(self.image_files) + + +def get_diode_loader(data_dir_root, batch_size=1, **kwargs): + dataset = DIODE(data_dir_root) + return DataLoader(dataset, batch_size, **kwargs) + +# get_diode_loader(data_dir_root="datasets/diode/val/outdoor") diff --git a/annotator/zoe/zoedepth/data/hypersim.py b/annotator/zoe/zoedepth/data/hypersim.py new file mode 100644 index 0000000000000000000000000000000000000000..4334198971830200f72ea2910d03f4c7d6a43334 --- /dev/null +++ b/annotator/zoe/zoedepth/data/hypersim.py @@ -0,0 +1,138 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# File author: Shariq Farooq Bhat + +import glob +import os + +import h5py +import numpy as np +import torch +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + + +def hypersim_distance_to_depth(npyDistance): + intWidth, intHeight, fltFocal = 1024, 768, 886.81 + + npyImageplaneX = np.linspace((-0.5 * intWidth) + 0.5, (0.5 * intWidth) - 0.5, intWidth).reshape( + 1, intWidth).repeat(intHeight, 0).astype(np.float32)[:, :, None] + npyImageplaneY = np.linspace((-0.5 * intHeight) + 0.5, (0.5 * intHeight) - 0.5, + intHeight).reshape(intHeight, 1).repeat(intWidth, 1).astype(np.float32)[:, :, None] + npyImageplaneZ = np.full([intHeight, intWidth, 1], fltFocal, np.float32) + npyImageplane = np.concatenate( + [npyImageplaneX, npyImageplaneY, npyImageplaneZ], 2) + + npyDepth = npyDistance / np.linalg.norm(npyImageplane, 2, 2) * fltFocal + return npyDepth + + +class ToTensor(object): + def __init__(self): + # self.normalize = transforms.Normalize( + # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + self.normalize = lambda x: x + self.resize = transforms.Resize((480, 640)) + + def __call__(self, sample): + image, depth = sample['image'], sample['depth'] + image = self.to_tensor(image) + image = self.normalize(image) + depth = self.to_tensor(depth) + + image = self.resize(image) + + return {'image': image, 'depth': depth, 'dataset': "hypersim"} + + def to_tensor(self, pic): + + if isinstance(pic, np.ndarray): + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img + + # # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float() + else: + return img + + +class HyperSim(Dataset): + def __init__(self, data_dir_root): + # image paths are of the form //images/scene_cam_#_final_preview/*.tonemap.jpg + # depth paths are of the form //images/scene_cam_#_final_preview/*.depth_meters.hdf5 + self.image_files = glob.glob(os.path.join( + data_dir_root, '*', 'images', 'scene_cam_*_final_preview', '*.tonemap.jpg')) + self.depth_files = [r.replace("_final_preview", "_geometry_hdf5").replace( + ".tonemap.jpg", ".depth_meters.hdf5") for r in self.image_files] + self.transform = ToTensor() + + def __getitem__(self, idx): + image_path = self.image_files[idx] + depth_path = self.depth_files[idx] + + image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 + + # depth from hdf5 + depth_fd = h5py.File(depth_path, "r") + # in meters (Euclidean distance) + distance_meters = np.array(depth_fd['dataset']) + depth = hypersim_distance_to_depth( + distance_meters) # in meters (planar depth) + + # depth[depth > 8] = -1 + depth = depth[..., None] + + sample = dict(image=image, depth=depth) + sample = self.transform(sample) + + if idx == 0: + print(sample["image"].shape) + + return sample + + def __len__(self): + return len(self.image_files) + + +def get_hypersim_loader(data_dir_root, batch_size=1, **kwargs): + dataset = HyperSim(data_dir_root) + return DataLoader(dataset, batch_size, **kwargs) 
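The `hypersim_distance_to_depth` helper added above converts Hypersim's per-pixel Euclidean ray distances into planar depth by projecting each ray onto the optical axis: divide by the ray length on the image plane, then rescale by the focal length. A minimal single-pixel sketch of that geometry, using the camera constants hard-coded in the helper (the pixel index and distance value are illustrative only):

```python
import numpy as np

# Camera constants hard-coded in hypersim_distance_to_depth above.
intWidth, intHeight, fltFocal = 1024, 768, 886.81

# One illustrative pixel (top-left corner) observed at 10 m Euclidean distance.
u, v, distance = 0, 0, 10.0

# Image-plane coordinates of that pixel, matching the linspace grids in the helper.
x = (-0.5 * intWidth) + 0.5 + u    # horizontal offset from the principal point
y = (-0.5 * intHeight) + 0.5 + v   # vertical offset from the principal point
ray = np.array([x, y, fltFocal], dtype=np.float32)

# Planar depth is the ray distance projected onto the optical axis.
depth = distance / np.linalg.norm(ray) * fltFocal
print(depth)  # ~8.1 m: the oblique corner ray is foreshortened relative to its 10 m length
```

At the image centre the ray is nearly parallel to the optical axis, so planar depth and Euclidean distance almost coincide there; the correction matters most towards the image borders.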
diff --git a/annotator/zoe/zoedepth/data/ibims.py b/annotator/zoe/zoedepth/data/ibims.py new file mode 100644 index 0000000000000000000000000000000000000000..b66abfabcf4cfc617d4a60ec818780c3548d9920 --- /dev/null +++ b/annotator/zoe/zoedepth/data/ibims.py @@ -0,0 +1,81 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import os + +import numpy as np +import torch +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms as T + + +class iBims(Dataset): + def __init__(self, config): + root_folder = config.ibims_root + with open(os.path.join(root_folder, "imagelist.txt"), 'r') as f: + imglist = f.read().split() + + samples = [] + for basename in imglist: + img_path = os.path.join(root_folder, 'rgb', basename + ".png") + depth_path = os.path.join(root_folder, 'depth', basename + ".png") + valid_mask_path = os.path.join( + root_folder, 'mask_invalid', basename+".png") + transp_mask_path = os.path.join( + root_folder, 'mask_transp', basename+".png") + + samples.append( + (img_path, depth_path, valid_mask_path, transp_mask_path)) + + self.samples = samples + # self.normalize = T.Normalize( + # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + self.normalize = lambda x : x + + def __getitem__(self, idx): + img_path, depth_path, valid_mask_path, transp_mask_path = self.samples[idx] + + img = np.asarray(Image.open(img_path), dtype=np.float32) / 255.0 + depth = np.asarray(Image.open(depth_path), + dtype=np.uint16).astype('float')*50.0/65535 + + mask_valid = np.asarray(Image.open(valid_mask_path)) + mask_transp = np.asarray(Image.open(transp_mask_path)) + + # depth = depth * mask_valid * mask_transp + depth = np.where(mask_valid * mask_transp, depth, -1) + + img = torch.from_numpy(img).permute(2, 0, 1) + img = self.normalize(img) + depth = torch.from_numpy(depth).unsqueeze(0) + return dict(image=img, depth=depth, image_path=img_path, depth_path=depth_path, dataset='ibims') + + def __len__(self): + return len(self.samples) + + +def get_ibims_loader(config, batch_size=1, **kwargs): + dataloader = DataLoader(iBims(config), batch_size=batch_size, **kwargs) + return dataloader diff --git a/annotator/zoe/zoedepth/data/preprocess.py b/annotator/zoe/zoedepth/data/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..e08cc309dc823ae6efd7cda8db9eb37130dc5499 --- /dev/null +++ 
b/annotator/zoe/zoedepth/data/preprocess.py @@ -0,0 +1,154 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import numpy as np +from dataclasses import dataclass +from typing import Tuple, List + +# dataclass to store the crop parameters +@dataclass +class CropParams: + top: int + bottom: int + left: int + right: int + + + +def get_border_params(rgb_image, tolerance=0.1, cut_off=20, value=0, level_diff_threshold=5, channel_axis=-1, min_border=5) -> CropParams: + gray_image = np.mean(rgb_image, axis=channel_axis) + h, w = gray_image.shape + + + def num_value_pixels(arr): + return np.sum(np.abs(arr - value) < level_diff_threshold) + + def is_above_tolerance(arr, total_pixels): + return (num_value_pixels(arr) / total_pixels) > tolerance + + # Crop top border until number of value pixels become below tolerance + top = min_border + while is_above_tolerance(gray_image[top, :], w) and top < h-1: + top += 1 + if top > cut_off: + break + + # Crop bottom border until number of value pixels become below tolerance + bottom = h - min_border + while is_above_tolerance(gray_image[bottom, :], w) and bottom > 0: + bottom -= 1 + if h - bottom > cut_off: + break + + # Crop left border until number of value pixels become below tolerance + left = min_border + while is_above_tolerance(gray_image[:, left], h) and left < w-1: + left += 1 + if left > cut_off: + break + + # Crop right border until number of value pixels become below tolerance + right = w - min_border + while is_above_tolerance(gray_image[:, right], h) and right > 0: + right -= 1 + if w - right > cut_off: + break + + + return CropParams(top, bottom, left, right) + + +def get_white_border(rgb_image, value=255, **kwargs) -> CropParams: + """Crops the white border of the RGB. + + Args: + rgb: RGB image, shape (H, W, 3). + Returns: + Crop parameters. + """ + if value == 255: + # assert range of values in rgb image is [0, 255] + assert np.max(rgb_image) <= 255 and np.min(rgb_image) >= 0, "RGB image values are not in range [0, 255]." + assert rgb_image.max() > 1, "RGB image values are not in range [0, 255]." + elif value == 1: + # assert range of values in rgb image is [0, 1] + assert np.max(rgb_image) <= 1 and np.min(rgb_image) >= 0, "RGB image values are not in range [0, 1]." + + return get_border_params(rgb_image, value=value, **kwargs) + +def get_black_border(rgb_image, **kwargs) -> CropParams: + """Crops the black border of the RGB. 
+ + Args: + rgb: RGB image, shape (H, W, 3). + + Returns: + Crop parameters. + """ + + return get_border_params(rgb_image, value=0, **kwargs) + +def crop_image(image: np.ndarray, crop_params: CropParams) -> np.ndarray: + """Crops the image according to the crop parameters. + + Args: + image: RGB or depth image, shape (H, W, 3) or (H, W). + crop_params: Crop parameters. + + Returns: + Cropped image. + """ + return image[crop_params.top:crop_params.bottom, crop_params.left:crop_params.right] + +def crop_images(*images: np.ndarray, crop_params: CropParams) -> Tuple[np.ndarray]: + """Crops the images according to the crop parameters. + + Args: + images: RGB or depth images, shape (H, W, 3) or (H, W). + crop_params: Crop parameters. + + Returns: + Cropped images. + """ + return tuple(crop_image(image, crop_params) for image in images) + +def crop_black_or_white_border(rgb_image, *other_images: np.ndarray, tolerance=0.1, cut_off=20, level_diff_threshold=5) -> Tuple[np.ndarray]: + """Crops the white and black border of the RGB and depth images. + + Args: + rgb: RGB image, shape (H, W, 3). This image is used to determine the border. + other_images: The other images to crop according to the border of the RGB image. + Returns: + Cropped RGB and other images. + """ + # crop black border + crop_params = get_black_border(rgb_image, tolerance=tolerance, cut_off=cut_off, level_diff_threshold=level_diff_threshold) + cropped_images = crop_images(rgb_image, *other_images, crop_params=crop_params) + + # crop white border + crop_params = get_white_border(cropped_images[0], tolerance=tolerance, cut_off=cut_off, level_diff_threshold=level_diff_threshold) + cropped_images = crop_images(*cropped_images, crop_params=crop_params) + + return cropped_images + \ No newline at end of file diff --git a/annotator/zoe/zoedepth/data/sun_rgbd_loader.py b/annotator/zoe/zoedepth/data/sun_rgbd_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..9e2bdb9aefe68ca4439f41eff3bba722c49fb976 --- /dev/null +++ b/annotator/zoe/zoedepth/data/sun_rgbd_loader.py @@ -0,0 +1,106 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
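Editorial sketch of the border-cropping helpers in `preprocess.py` above: the RGB image drives the crop estimate, and any aligned maps passed after it are cropped with the same parameters. The synthetic padded arrays below are assumptions used purely to illustrate the call pattern.

```python
# Sketch: estimate and remove a black border from a synthetic RGB image and an
# aligned depth map (note the helpers also trim a small min_border safety margin).
import numpy as np

from annotator.zoe.zoedepth.data.preprocess import (crop_black_or_white_border,
                                                    get_black_border)

rgb = np.full((100, 120, 3), 128, dtype=np.uint8)
depth = np.ones((100, 120), dtype=np.float32)
rgb_pad = np.pad(rgb, ((16, 16), (16, 16), (0, 0)))      # add a 16-px black frame
depth_pad = np.pad(depth, ((16, 16), (16, 16)))

print(get_black_border(rgb_pad))                         # CropParams(top, bottom, left, right)
rgb_c, depth_c = crop_black_or_white_border(rgb_pad, depth_pad)
print(rgb_c.shape, depth_c.shape)                        # both cropped with the same parameters
```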
+ +# File author: Shariq Farooq Bhat + +import os + +import numpy as np +import torch +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + + +class ToTensor(object): + def __init__(self): + # self.normalize = transforms.Normalize( + # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + self.normalize = lambda x : x + + def __call__(self, sample): + image, depth = sample['image'], sample['depth'] + image = self.to_tensor(image) + image = self.normalize(image) + depth = self.to_tensor(depth) + + return {'image': image, 'depth': depth, 'dataset': "sunrgbd"} + + def to_tensor(self, pic): + + if isinstance(pic, np.ndarray): + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img + + # # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float() + else: + return img + + +class SunRGBD(Dataset): + def __init__(self, data_dir_root): + # test_file_dirs = loadmat(train_test_file)['alltest'].squeeze() + # all_test = [t[0].replace("/n/fs/sun3d/data/", "") for t in test_file_dirs] + # self.all_test = [os.path.join(data_dir_root, t) for t in all_test] + import glob + self.image_files = glob.glob( + os.path.join(data_dir_root, 'rgb', 'rgb', '*')) + self.depth_files = [ + r.replace("rgb/rgb", "gt/gt").replace("jpg", "png") for r in self.image_files] + self.transform = ToTensor() + + def __getitem__(self, idx): + image_path = self.image_files[idx] + depth_path = self.depth_files[idx] + + image = np.asarray(Image.open(image_path), dtype=np.float32) / 255.0 + depth = np.asarray(Image.open(depth_path), dtype='uint16') / 1000.0 + depth[depth > 8] = -1 + depth = depth[..., None] + return self.transform(dict(image=image, depth=depth)) + + def __len__(self): + return len(self.image_files) + + +def get_sunrgbd_loader(data_dir_root, batch_size=1, **kwargs): + dataset = SunRGBD(data_dir_root) + return DataLoader(dataset, batch_size, **kwargs) diff --git a/annotator/zoe/zoedepth/data/transforms.py b/annotator/zoe/zoedepth/data/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..374416dff24fb4fd55598f3946d6d6b091ddefc9 --- /dev/null +++ b/annotator/zoe/zoedepth/data/transforms.py @@ -0,0 +1,481 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
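Editorial sketch for the SUN RGB-D loader above. The `data/sunrgbd_test` path is an assumption; the directory layout (`<root>/rgb/rgb/*.jpg`, `<root>/gt/gt/*.png`) is the one implied by the glob/replace logic in `SunRGBD.__init__`.

```python
# Sketch: iterate the SUN RGB-D test loader (the path is an assumed local extraction).
from annotator.zoe.zoedepth.data.sun_rgbd_loader import get_sunrgbd_loader

loader = get_sunrgbd_loader("data/sunrgbd_test", batch_size=1)
for sample in loader:
    image, depth = sample["image"], sample["depth"]   # (1, 3, H, W) and (1, 1, H, W)
    valid = depth > 0                                 # depths beyond 8 m were set to -1
    print(image.shape, float(depth[valid].mean()), sample["dataset"])
    break
```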
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import math +import random + +import cv2 +import numpy as np + + +class RandomFliplr(object): + """Horizontal flip of the sample with given probability. + """ + + def __init__(self, probability=0.5): + """Init. + + Args: + probability (float, optional): Flip probability. Defaults to 0.5. + """ + self.__probability = probability + + def __call__(self, sample): + prob = random.random() + + if prob < self.__probability: + for k, v in sample.items(): + if len(v.shape) >= 2: + sample[k] = np.fliplr(v).copy() + + return sample + + +def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): + """Rezise the sample to ensure the given size. Keeps aspect ratio. + + Args: + sample (dict): sample + size (tuple): image size + + Returns: + tuple: new size + """ + shape = list(sample["disparity"].shape) + + if shape[0] >= size[0] and shape[1] >= size[1]: + return sample + + scale = [0, 0] + scale[0] = size[0] / shape[0] + scale[1] = size[1] / shape[1] + + scale = max(scale) + + shape[0] = math.ceil(scale * shape[0]) + shape[1] = math.ceil(scale * shape[1]) + + # resize + sample["image"] = cv2.resize( + sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method + ) + + sample["disparity"] = cv2.resize( + sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST + ) + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + tuple(shape[::-1]), + interpolation=cv2.INTER_NEAREST, + ) + sample["mask"] = sample["mask"].astype(bool) + + return tuple(shape) + + +class RandomCrop(object): + """Get a random crop of the sample with the given size (width, height). + """ + + def __init__( + self, + width, + height, + resize_if_needed=False, + image_interpolation_method=cv2.INTER_AREA, + ): + """Init. + + Args: + width (int): output width + height (int): output height + resize_if_needed (bool, optional): If True, sample might be upsampled to ensure + that a crop of size (width, height) is possbile. Defaults to False. + """ + self.__size = (height, width) + self.__resize_if_needed = resize_if_needed + self.__image_interpolation_method = image_interpolation_method + + def __call__(self, sample): + + shape = sample["disparity"].shape + + if self.__size[0] > shape[0] or self.__size[1] > shape[1]: + if self.__resize_if_needed: + shape = apply_min_size( + sample, self.__size, self.__image_interpolation_method + ) + else: + raise Exception( + "Output size {} bigger than input size {}.".format( + self.__size, shape + ) + ) + + offset = ( + np.random.randint(shape[0] - self.__size[0] + 1), + np.random.randint(shape[1] - self.__size[1] + 1), + ) + + for k, v in sample.items(): + if k == "code" or k == "basis": + continue + + if len(sample[k].shape) >= 2: + sample[k] = v[ + offset[0]: offset[0] + self.__size[0], + offset[1]: offset[1] + self.__size[1], + ] + + return sample + + +class Resize(object): + """Resize sample to given size (width, height). 
+ """ + + def __init__( + self, + width, + height, + resize_target=True, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resize_method="lower_bound", + image_interpolation_method=cv2.INTER_AREA, + letter_box=False, + ): + """Init. + + Args: + width (int): desired output width + height (int): desired output height + resize_target (bool, optional): + True: Resize the full sample (image, mask, target). + False: Resize image only. + Defaults to True. + keep_aspect_ratio (bool, optional): + True: Keep the aspect ratio of the input sample. + Output sample might not have the given width and height, and + resize behaviour depends on the parameter 'resize_method'. + Defaults to False. + ensure_multiple_of (int, optional): + Output width and height is constrained to be multiple of this parameter. + Defaults to 1. + resize_method (str, optional): + "lower_bound": Output will be at least as large as the given size. + "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) + "minimal": Scale as least as possible. (Output size might be smaller than given size.) + Defaults to "lower_bound". + """ + self.__width = width + self.__height = height + + self.__resize_target = resize_target + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + self.__image_interpolation_method = image_interpolation_method + self.__letter_box = letter_box + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) + * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) + * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented" + ) + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, min_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, min_val=self.__width + ) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, max_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, max_val=self.__width + ) + elif self.__resize_method == "minimal": + new_height = self.constrain_to_multiple_of(scale_height * height) + new_width = self.constrain_to_multiple_of(scale_width * width) + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented") + + return (new_width, new_height) + + 
def make_letter_box(self, sample): + top = bottom = (self.__height - sample.shape[0]) // 2 + left = right = (self.__width - sample.shape[1]) // 2 + sample = cv2.copyMakeBorder( + sample, top, bottom, left, right, cv2.BORDER_CONSTANT, None, 0) + return sample + + def __call__(self, sample): + width, height = self.get_size( + sample["image"].shape[1], sample["image"].shape[0] + ) + + # resize sample + sample["image"] = cv2.resize( + sample["image"], + (width, height), + interpolation=self.__image_interpolation_method, + ) + + if self.__letter_box: + sample["image"] = self.make_letter_box(sample["image"]) + + if self.__resize_target: + if "disparity" in sample: + sample["disparity"] = cv2.resize( + sample["disparity"], + (width, height), + interpolation=cv2.INTER_NEAREST, + ) + + if self.__letter_box: + sample["disparity"] = self.make_letter_box( + sample["disparity"]) + + if "depth" in sample: + sample["depth"] = cv2.resize( + sample["depth"], (width, + height), interpolation=cv2.INTER_NEAREST + ) + + if self.__letter_box: + sample["depth"] = self.make_letter_box(sample["depth"]) + + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + (width, height), + interpolation=cv2.INTER_NEAREST, + ) + + if self.__letter_box: + sample["mask"] = self.make_letter_box(sample["mask"]) + + sample["mask"] = sample["mask"].astype(bool) + + return sample + + +class ResizeFixed(object): + def __init__(self, size): + self.__size = size + + def __call__(self, sample): + sample["image"] = cv2.resize( + sample["image"], self.__size[::-1], interpolation=cv2.INTER_LINEAR + ) + + sample["disparity"] = cv2.resize( + sample["disparity"], self.__size[::- + 1], interpolation=cv2.INTER_NEAREST + ) + + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + self.__size[::-1], + interpolation=cv2.INTER_NEAREST, + ) + sample["mask"] = sample["mask"].astype(bool) + + return sample + + +class Rescale(object): + """Rescale target values to the interval [0, max_val]. + If input is constant, values are set to max_val / 2. + """ + + def __init__(self, max_val=1.0, use_mask=True): + """Init. + + Args: + max_val (float, optional): Max output value. Defaults to 1.0. + use_mask (bool, optional): Only operate on valid pixels (mask == True). Defaults to True. + """ + self.__max_val = max_val + self.__use_mask = use_mask + + def __call__(self, sample): + disp = sample["disparity"] + + if self.__use_mask: + mask = sample["mask"] + else: + mask = np.ones_like(disp, dtype=np.bool) + + if np.sum(mask) == 0: + return sample + + min_val = np.min(disp[mask]) + max_val = np.max(disp[mask]) + + if max_val > min_val: + sample["disparity"][mask] = ( + (disp[mask] - min_val) / (max_val - min_val) * self.__max_val + ) + else: + sample["disparity"][mask] = np.ones_like( + disp[mask]) * self.__max_val / 2.0 + + return sample + + +# mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] +class NormalizeImage(object): + """Normlize image by given mean and std. + """ + + def __init__(self, mean, std): + self.__mean = mean + self.__std = std + + def __call__(self, sample): + sample["image"] = (sample["image"] - self.__mean) / self.__std + + return sample + + +class DepthToDisparity(object): + """Convert depth to disparity. Removes depth from sample. 
+ """ + + def __init__(self, eps=1e-4): + self.__eps = eps + + def __call__(self, sample): + assert "depth" in sample + + sample["mask"][sample["depth"] < self.__eps] = False + + sample["disparity"] = np.zeros_like(sample["depth"]) + sample["disparity"][sample["depth"] >= self.__eps] = ( + 1.0 / sample["depth"][sample["depth"] >= self.__eps] + ) + + del sample["depth"] + + return sample + + +class DisparityToDepth(object): + """Convert disparity to depth. Removes disparity from sample. + """ + + def __init__(self, eps=1e-4): + self.__eps = eps + + def __call__(self, sample): + assert "disparity" in sample + + disp = np.abs(sample["disparity"]) + sample["mask"][disp < self.__eps] = False + + # print(sample["disparity"]) + # print(sample["mask"].sum()) + # exit() + + sample["depth"] = np.zeros_like(disp) + sample["depth"][disp >= self.__eps] = ( + 1.0 / disp[disp >= self.__eps] + ) + + del sample["disparity"] + + return sample + + +class PrepareForNet(object): + """Prepare sample for usage as network input. + """ + + def __init__(self): + pass + + def __call__(self, sample): + image = np.transpose(sample["image"], (2, 0, 1)) + sample["image"] = np.ascontiguousarray(image).astype(np.float32) + + if "mask" in sample: + sample["mask"] = sample["mask"].astype(np.float32) + sample["mask"] = np.ascontiguousarray(sample["mask"]) + + if "disparity" in sample: + disparity = sample["disparity"].astype(np.float32) + sample["disparity"] = np.ascontiguousarray(disparity) + + if "depth" in sample: + depth = sample["depth"].astype(np.float32) + sample["depth"] = np.ascontiguousarray(depth) + + return sample diff --git a/annotator/zoe/zoedepth/data/vkitti.py b/annotator/zoe/zoedepth/data/vkitti.py new file mode 100644 index 0000000000000000000000000000000000000000..72a2e5a8346f6e630ede0e28d6959725af8d7e72 --- /dev/null +++ b/annotator/zoe/zoedepth/data/vkitti.py @@ -0,0 +1,151 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# File author: Shariq Farooq Bhat + +import torch +from torch.utils.data import Dataset, DataLoader +from torchvision import transforms +import os + +from PIL import Image +import numpy as np +import cv2 + + +class ToTensor(object): + def __init__(self): + self.normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + # self.resize = transforms.Resize((375, 1242)) + + def __call__(self, sample): + image, depth = sample['image'], sample['depth'] + + image = self.to_tensor(image) + image = self.normalize(image) + depth = self.to_tensor(depth) + + # image = self.resize(image) + + return {'image': image, 'depth': depth, 'dataset': "vkitti"} + + def to_tensor(self, pic): + + if isinstance(pic, np.ndarray): + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img + + # # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float() + else: + return img + + +class VKITTI(Dataset): + def __init__(self, data_dir_root, do_kb_crop=True): + import glob + # image paths are of the form /{HR, LR}//{color, depth_filled}/*.png + self.image_files = glob.glob(os.path.join( + data_dir_root, "test_color", '*.png')) + self.depth_files = [r.replace("test_color", "test_depth") + for r in self.image_files] + self.do_kb_crop = True + self.transform = ToTensor() + + def __getitem__(self, idx): + image_path = self.image_files[idx] + depth_path = self.depth_files[idx] + + image = Image.open(image_path) + depth = Image.open(depth_path) + depth = cv2.imread(depth_path, cv2.IMREAD_ANYCOLOR | + cv2.IMREAD_ANYDEPTH) + print("dpeth min max", depth.min(), depth.max()) + + # print(np.shape(image)) + # print(np.shape(depth)) + + # depth[depth > 8] = -1 + + if self.do_kb_crop and False: + height = image.height + width = image.width + top_margin = int(height - 352) + left_margin = int((width - 1216) / 2) + depth = depth.crop( + (left_margin, top_margin, left_margin + 1216, top_margin + 352)) + image = image.crop( + (left_margin, top_margin, left_margin + 1216, top_margin + 352)) + # uv = uv[:, top_margin:top_margin + 352, left_margin:left_margin + 1216] + + image = np.asarray(image, dtype=np.float32) / 255.0 + # depth = np.asarray(depth, dtype=np.uint16) /1. 
+ depth = depth[..., None] + sample = dict(image=image, depth=depth) + + # return sample + sample = self.transform(sample) + + if idx == 0: + print(sample["image"].shape) + + return sample + + def __len__(self): + return len(self.image_files) + + +def get_vkitti_loader(data_dir_root, batch_size=1, **kwargs): + dataset = VKITTI(data_dir_root) + return DataLoader(dataset, batch_size, **kwargs) + + +if __name__ == "__main__": + loader = get_vkitti_loader( + data_dir_root="/home/bhatsf/shortcuts/datasets/vkitti_test") + print("Total files", len(loader.dataset)) + for i, sample in enumerate(loader): + print(sample["image"].shape) + print(sample["depth"].shape) + print(sample["dataset"]) + print(sample['depth'].min(), sample['depth'].max()) + if i > 5: + break diff --git a/annotator/zoe/zoedepth/data/vkitti2.py b/annotator/zoe/zoedepth/data/vkitti2.py new file mode 100644 index 0000000000000000000000000000000000000000..9bcfb0414b7f3f21859f30ae34bd71689516a3e7 --- /dev/null +++ b/annotator/zoe/zoedepth/data/vkitti2.py @@ -0,0 +1,187 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# File author: Shariq Farooq Bhat + +import os + +import cv2 +import numpy as np +import torch +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms + + +class ToTensor(object): + def __init__(self): + # self.normalize = transforms.Normalize( + # mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + self.normalize = lambda x: x + # self.resize = transforms.Resize((375, 1242)) + + def __call__(self, sample): + image, depth = sample['image'], sample['depth'] + + image = self.to_tensor(image) + image = self.normalize(image) + depth = self.to_tensor(depth) + + # image = self.resize(image) + + return {'image': image, 'depth': depth, 'dataset': "vkitti"} + + def to_tensor(self, pic): + + if isinstance(pic, np.ndarray): + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img + + # # handle PIL Image + if pic.mode == 'I': + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == 'I;16': + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes())) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == 'YCbCr': + nchannel = 3 + elif pic.mode == 'I;16': + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float() + else: + return img + + +class VKITTI2(Dataset): + def __init__(self, data_dir_root, do_kb_crop=True, split="test"): + import glob + + # image paths are of the form /rgb///frames//Camera<0,1>/rgb_{}.jpg + self.image_files = glob.glob(os.path.join( + data_dir_root, "rgb", "**", "frames", "rgb", "Camera_0", '*.jpg'), recursive=True) + self.depth_files = [r.replace("/rgb/", "/depth/").replace( + "rgb_", "depth_").replace(".jpg", ".png") for r in self.image_files] + self.do_kb_crop = True + self.transform = ToTensor() + + # If train test split is not created, then create one. + # Split is such that 8% of the frames from each scene are used for testing. 
+ if not os.path.exists(os.path.join(data_dir_root, "train.txt")): + import random + scenes = set([os.path.basename(os.path.dirname( + os.path.dirname(os.path.dirname(f)))) for f in self.image_files]) + train_files = [] + test_files = [] + for scene in scenes: + scene_files = [f for f in self.image_files if os.path.basename( + os.path.dirname(os.path.dirname(os.path.dirname(f)))) == scene] + random.shuffle(scene_files) + train_files.extend(scene_files[:int(len(scene_files) * 0.92)]) + test_files.extend(scene_files[int(len(scene_files) * 0.92):]) + with open(os.path.join(data_dir_root, "train.txt"), "w") as f: + f.write("\n".join(train_files)) + with open(os.path.join(data_dir_root, "test.txt"), "w") as f: + f.write("\n".join(test_files)) + + if split == "train": + with open(os.path.join(data_dir_root, "train.txt"), "r") as f: + self.image_files = f.read().splitlines() + self.depth_files = [r.replace("/rgb/", "/depth/").replace( + "rgb_", "depth_").replace(".jpg", ".png") for r in self.image_files] + elif split == "test": + with open(os.path.join(data_dir_root, "test.txt"), "r") as f: + self.image_files = f.read().splitlines() + self.depth_files = [r.replace("/rgb/", "/depth/").replace( + "rgb_", "depth_").replace(".jpg", ".png") for r in self.image_files] + + def __getitem__(self, idx): + image_path = self.image_files[idx] + depth_path = self.depth_files[idx] + + image = Image.open(image_path) + # depth = Image.open(depth_path) + depth = cv2.imread(depth_path, cv2.IMREAD_ANYCOLOR | + cv2.IMREAD_ANYDEPTH) / 100.0 # cm to m + depth = Image.fromarray(depth) + # print("dpeth min max", depth.min(), depth.max()) + + # print(np.shape(image)) + # print(np.shape(depth)) + + if self.do_kb_crop: + if idx == 0: + print("Using KB input crop") + height = image.height + width = image.width + top_margin = int(height - 352) + left_margin = int((width - 1216) / 2) + depth = depth.crop( + (left_margin, top_margin, left_margin + 1216, top_margin + 352)) + image = image.crop( + (left_margin, top_margin, left_margin + 1216, top_margin + 352)) + # uv = uv[:, top_margin:top_margin + 352, left_margin:left_margin + 1216] + + image = np.asarray(image, dtype=np.float32) / 255.0 + # depth = np.asarray(depth, dtype=np.uint16) /1. + depth = np.asarray(depth, dtype=np.float32) / 1. 
+ depth[depth > 80] = -1 + + depth = depth[..., None] + sample = dict(image=image, depth=depth) + + # return sample + sample = self.transform(sample) + + if idx == 0: + print(sample["image"].shape) + + return sample + + def __len__(self): + return len(self.image_files) + + +def get_vkitti2_loader(data_dir_root, batch_size=1, **kwargs): + dataset = VKITTI2(data_dir_root) + return DataLoader(dataset, batch_size, **kwargs) + + +if __name__ == "__main__": + loader = get_vkitti2_loader( + data_dir_root="/home/bhatsf/shortcuts/datasets/vkitti2") + print("Total files", len(loader.dataset)) + for i, sample in enumerate(loader): + print(sample["image"].shape) + print(sample["depth"].shape) + print(sample["dataset"]) + print(sample['depth'].min(), sample['depth'].max()) + if i > 5: + break diff --git a/annotator/zoe/zoedepth/models/__init__.py b/annotator/zoe/zoedepth/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f2668792389157609abb2a0846fb620e7d67eb9 --- /dev/null +++ b/annotator/zoe/zoedepth/models/__init__.py @@ -0,0 +1,24 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# File author: Shariq Farooq Bhat + diff --git a/annotator/zoe/zoedepth/models/__pycache__/__init__.cpython-310.pyc b/annotator/zoe/zoedepth/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e722780827d14d75be8fbe71199c66a839c81c62 Binary files /dev/null and b/annotator/zoe/zoedepth/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/__pycache__/__init__.cpython-38.pyc b/annotator/zoe/zoedepth/models/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a241bf304eab10e73755614544e60ee940914eaf Binary files /dev/null and b/annotator/zoe/zoedepth/models/__pycache__/__init__.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/__pycache__/__init__.cpython-39.pyc b/annotator/zoe/zoedepth/models/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bec0ff331c5fbd6b4ba77ca5a8e6a7cdd40b6aa5 Binary files /dev/null and b/annotator/zoe/zoedepth/models/__pycache__/__init__.cpython-39.pyc differ diff --git a/annotator/zoe/zoedepth/models/__pycache__/depth_model.cpython-310.pyc b/annotator/zoe/zoedepth/models/__pycache__/depth_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9eb4cf3ebb80b713954e632c350fbb4050acd67e Binary files /dev/null and b/annotator/zoe/zoedepth/models/__pycache__/depth_model.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/__pycache__/depth_model.cpython-38.pyc b/annotator/zoe/zoedepth/models/__pycache__/depth_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7000ca5b59ef284fec3d8c54a9133ebeeba35e3f Binary files /dev/null and b/annotator/zoe/zoedepth/models/__pycache__/depth_model.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/__pycache__/depth_model.cpython-39.pyc b/annotator/zoe/zoedepth/models/__pycache__/depth_model.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4375313235fdfe5b1791851b0ee60ccd311ee3b0 Binary files /dev/null and b/annotator/zoe/zoedepth/models/__pycache__/depth_model.cpython-39.pyc differ diff --git a/annotator/zoe/zoedepth/models/__pycache__/model_io.cpython-310.pyc b/annotator/zoe/zoedepth/models/__pycache__/model_io.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9e71b2f9870bc2ce03b1f8c716f0df1d78ec571 Binary files /dev/null and b/annotator/zoe/zoedepth/models/__pycache__/model_io.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/__pycache__/model_io.cpython-38.pyc b/annotator/zoe/zoedepth/models/__pycache__/model_io.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d1f3adf9c6e4711d88a288c7b24a5b3c1f615b9 Binary files /dev/null and b/annotator/zoe/zoedepth/models/__pycache__/model_io.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/__pycache__/model_io.cpython-39.pyc b/annotator/zoe/zoedepth/models/__pycache__/model_io.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7152452b7147210ba2671d64612d1f73a06fa0d6 Binary files /dev/null and b/annotator/zoe/zoedepth/models/__pycache__/model_io.cpython-39.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/__init__.py b/annotator/zoe/zoedepth/models/base_models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f2668792389157609abb2a0846fb620e7d67eb9 --- /dev/null 
+++ b/annotator/zoe/zoedepth/models/base_models/__init__.py @@ -0,0 +1,24 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + diff --git a/annotator/zoe/zoedepth/models/base_models/__pycache__/__init__.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..208b73d68aa744eb07d1adb256a3f9f917338ba6 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/__pycache__/__init__.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/__pycache__/__init__.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6884e9a44f1b82c4199365f159125b1cc3e99f9 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/__pycache__/__init__.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/__pycache__/__init__.cpython-39.pyc b/annotator/zoe/zoedepth/models/base_models/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a69e23a609931e33ae65eeb4c74633a2e0a741e4 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/__pycache__/__init__.cpython-39.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/__pycache__/midas.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/__pycache__/midas.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1af317fe68273b019b9451578e71dfadbbfe7230 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/__pycache__/midas.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/__pycache__/midas.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/__pycache__/midas.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfead3647d4bc7a100ca35ad77c49d8e1112734f Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/__pycache__/midas.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/__pycache__/midas.cpython-39.pyc b/annotator/zoe/zoedepth/models/base_models/__pycache__/midas.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c6f51d4522b689215d141e9b33c4e74acaeb50a Binary files /dev/null and 
b/annotator/zoe/zoedepth/models/base_models/__pycache__/midas.cpython-39.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas.py b/annotator/zoe/zoedepth/models/base_models/midas.py new file mode 100644 index 0000000000000000000000000000000000000000..ee660bc93d44c28efe8d8c674e715ea2ecb4c183 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas.py @@ -0,0 +1,379 @@ +# MIT License +import os + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import torch +import torch.nn as nn +import numpy as np +from torchvision.transforms import Normalize + + +def denormalize(x): + """Reverses the imagenet normalization applied to the input. + + Args: + x (torch.Tensor - shape(N,3,H,W)): input tensor + + Returns: + torch.Tensor - shape(N,3,H,W): Denormalized input + """ + mean = torch.Tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1).to(x.device) + std = torch.Tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1).to(x.device) + return x * std + mean + +def get_activation(name, bank): + def hook(model, input, output): + bank[name] = output + return hook + + +class Resize(object): + """Resize sample to given size (width, height). + """ + + def __init__( + self, + width, + height, + resize_target=True, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resize_method="lower_bound", + ): + """Init. + Args: + width (int): desired output width + height (int): desired output height + resize_target (bool, optional): + True: Resize the full sample (image, mask, target). + False: Resize image only. + Defaults to True. + keep_aspect_ratio (bool, optional): + True: Keep the aspect ratio of the input sample. + Output sample might not have the given width and height, and + resize behaviour depends on the parameter 'resize_method'. + Defaults to False. + ensure_multiple_of (int, optional): + Output width and height is constrained to be multiple of this parameter. + Defaults to 1. + resize_method (str, optional): + "lower_bound": Output will be at least as large as the given size. + "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) + "minimal": Scale as least as possible. (Output size might be smaller than given size.) + Defaults to "lower_bound". 
+ """ + print("Params passed to Resize transform:") + print("\twidth: ", width) + print("\theight: ", height) + print("\tresize_target: ", resize_target) + print("\tkeep_aspect_ratio: ", keep_aspect_ratio) + print("\tensure_multiple_of: ", ensure_multiple_of) + print("\tresize_method: ", resize_method) + + self.__width = width + self.__height = height + + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) + * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) + * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented" + ) + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, min_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, min_val=self.__width + ) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, max_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, max_val=self.__width + ) + elif self.__resize_method == "minimal": + new_height = self.constrain_to_multiple_of(scale_height * height) + new_width = self.constrain_to_multiple_of(scale_width * width) + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented") + + return (new_width, new_height) + + def __call__(self, x): + width, height = self.get_size(*x.shape[-2:][::-1]) + return nn.functional.interpolate(x, (height, width), mode='bilinear', align_corners=True) + +class PrepForMidas(object): + def __init__(self, resize_mode="minimal", keep_aspect_ratio=True, img_size=384, do_resize=True): + if isinstance(img_size, int): + img_size = (img_size, img_size) + net_h, net_w = img_size + self.normalization = Normalize( + mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + self.resizer = Resize(net_w, net_h, keep_aspect_ratio=keep_aspect_ratio, ensure_multiple_of=32, resize_method=resize_mode) \ + if do_resize else nn.Identity() + + def __call__(self, x): + return self.normalization(self.resizer(x)) + + +class MidasCore(nn.Module): + def __init__(self, midas, trainable=False, fetch_features=True, layer_names=('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'), freeze_bn=False, keep_aspect_ratio=True, + img_size=384, **kwargs): + """Midas Base model used for multi-scale feature extraction. 
+ + Args: + midas (torch.nn.Module): Midas model. + trainable (bool, optional): Train midas model. Defaults to False. + fetch_features (bool, optional): Extract multi-scale features. Defaults to True. + layer_names (tuple, optional): Layers used for feature extraction. Order = (head output features, last layer features, ...decoder features). Defaults to ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1'). + freeze_bn (bool, optional): Freeze BatchNorm. Generally results in better finetuning performance. Defaults to False. + keep_aspect_ratio (bool, optional): Keep the aspect ratio of input images while resizing. Defaults to True. + img_size (int, tuple, optional): Input resolution. Defaults to 384. + """ + super().__init__() + self.core = midas + self.output_channels = None + self.core_out = {} + self.trainable = trainable + self.fetch_features = fetch_features + # midas.scratch.output_conv = nn.Identity() + self.handles = [] + # self.layer_names = ['out_conv','l4_rn', 'r4', 'r3', 'r2', 'r1'] + self.layer_names = layer_names + + self.set_trainable(trainable) + self.set_fetch_features(fetch_features) + + self.prep = PrepForMidas(keep_aspect_ratio=keep_aspect_ratio, + img_size=img_size, do_resize=kwargs.get('do_resize', True)) + + if freeze_bn: + self.freeze_bn() + + def set_trainable(self, trainable): + self.trainable = trainable + if trainable: + self.unfreeze() + else: + self.freeze() + return self + + def set_fetch_features(self, fetch_features): + self.fetch_features = fetch_features + if fetch_features: + if len(self.handles) == 0: + self.attach_hooks(self.core) + else: + self.remove_hooks() + return self + + def freeze(self): + for p in self.parameters(): + p.requires_grad = False + self.trainable = False + return self + + def unfreeze(self): + for p in self.parameters(): + p.requires_grad = True + self.trainable = True + return self + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + return self + + def forward(self, x, denorm=False, return_rel_depth=False): + with torch.no_grad(): + if denorm: + x = denormalize(x) + x = self.prep(x) + # print("Shape after prep: ", x.shape) + + with torch.set_grad_enabled(self.trainable): + + # print("Input size to Midascore", x.shape) + rel_depth = self.core(x) + # print("Output from midas shape", rel_depth.shape) + if not self.fetch_features: + return rel_depth + out = [self.core_out[k] for k in self.layer_names] + + if return_rel_depth: + return rel_depth, out + return out + + def get_rel_pos_params(self): + for name, p in self.core.pretrained.named_parameters(): + if "relative_position" in name: + yield p + + def get_enc_params_except_rel_pos(self): + for name, p in self.core.pretrained.named_parameters(): + if "relative_position" not in name: + yield p + + def freeze_encoder(self, freeze_rel_pos=False): + if freeze_rel_pos: + for p in self.core.pretrained.parameters(): + p.requires_grad = False + else: + for p in self.get_enc_params_except_rel_pos(): + p.requires_grad = False + return self + + def attach_hooks(self, midas): + if len(self.handles) > 0: + self.remove_hooks() + if "out_conv" in self.layer_names: + self.handles.append(list(midas.scratch.output_conv.children())[ + 3].register_forward_hook(get_activation("out_conv", self.core_out))) + if "r4" in self.layer_names: + self.handles.append(midas.scratch.refinenet4.register_forward_hook( + get_activation("r4", self.core_out))) + if "r3" in self.layer_names: + self.handles.append(midas.scratch.refinenet3.register_forward_hook( + get_activation("r3", 
self.core_out))) + if "r2" in self.layer_names: + self.handles.append(midas.scratch.refinenet2.register_forward_hook( + get_activation("r2", self.core_out))) + if "r1" in self.layer_names: + self.handles.append(midas.scratch.refinenet1.register_forward_hook( + get_activation("r1", self.core_out))) + if "l4_rn" in self.layer_names: + self.handles.append(midas.scratch.layer4_rn.register_forward_hook( + get_activation("l4_rn", self.core_out))) + + return self + + def remove_hooks(self): + for h in self.handles: + h.remove() + return self + + def __del__(self): + self.remove_hooks() + + def set_output_channels(self, model_type): + self.output_channels = MIDAS_SETTINGS[model_type] + + @staticmethod + def build(midas_model_type="DPT_BEiT_L_384", train_midas=False, use_pretrained_midas=True, fetch_features=False, freeze_bn=True, force_keep_ar=False, force_reload=False, **kwargs): + if midas_model_type not in MIDAS_SETTINGS: + raise ValueError( + f"Invalid model type: {midas_model_type}. Must be one of {list(MIDAS_SETTINGS.keys())}") + if "img_size" in kwargs: + kwargs = MidasCore.parse_img_size(kwargs) + img_size = kwargs.pop("img_size", [384, 384]) + print("img_size", img_size) + midas_path = os.path.join(os.path.dirname(__file__), 'midas_repo') + midas = torch.hub.load(midas_path, midas_model_type, + pretrained=use_pretrained_midas, force_reload=force_reload, source='local') + kwargs.update({'keep_aspect_ratio': force_keep_ar}) + midas_core = MidasCore(midas, trainable=train_midas, fetch_features=fetch_features, + freeze_bn=freeze_bn, img_size=img_size, **kwargs) + midas_core.set_output_channels(midas_model_type) + return midas_core + + @staticmethod + def build_from_config(config): + return MidasCore.build(**config) + + @staticmethod + def parse_img_size(config): + assert 'img_size' in config + if isinstance(config['img_size'], str): + assert "," in config['img_size'], "img_size should be a string with comma separated img_size=H,W" + config['img_size'] = list(map(int, config['img_size'].split(","))) + assert len( + config['img_size']) == 2, "img_size should be a string with comma separated img_size=H,W" + elif isinstance(config['img_size'], int): + config['img_size'] = [config['img_size'], config['img_size']] + else: + assert isinstance(config['img_size'], list) and len( + config['img_size']) == 2, "img_size should be a list of H,W" + return config + + +nchannels2models = { + tuple([256]*5): ["DPT_BEiT_L_384", "DPT_BEiT_L_512", "DPT_BEiT_B_384", "DPT_SwinV2_L_384", "DPT_SwinV2_B_384", "DPT_SwinV2_T_256", "DPT_Large", "DPT_Hybrid"], + (512, 256, 128, 64, 64): ["MiDaS_small"] +} + +# Model name to number of output channels +MIDAS_SETTINGS = {m: k for k, v in nchannels2models.items() + for m in v + } diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/README.md b/annotator/zoe/zoedepth/models/base_models/midas_repo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9568ea71c755b6938ee5482ba9f09be722e75943 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/README.md @@ -0,0 +1,259 @@ +## Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer + +This repository contains code to compute depth from a single image. 
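Editorial note before the vendored MiDaS README continues: a sketch of how the `MidasCore` wrapper defined in `midas.py` above is typically built and queried. Running it requires the bundled `midas_repo` checkout and the pretrained DPT weights (fetched through torch.hub), so treat this as illustrative rather than a self-contained test; the input resolution is an arbitrary choice.

```python
# Sketch: build the MiDaS backbone through the local hub checkout and fetch
# multi-scale decoder features from a dummy batch (pretrained weights required).
import torch

from annotator.zoe.zoedepth.models.base_models.midas import MidasCore

core = MidasCore.build(midas_model_type="DPT_BEiT_L_384", train_midas=False,
                       use_pretrained_midas=True, fetch_features=True,
                       freeze_bn=True, img_size=384)

x = torch.rand(1, 3, 384, 512)                       # raw [0, 1] input; denorm stays False
rel_depth, feats = core(x, return_rel_depth=True)
print(rel_depth.shape)                               # relative inverse-depth map
print([f.shape for f in feats])                      # ('out_conv', 'l4_rn', 'r4', 'r3', 'r2', 'r1') activations
```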
It accompanies our [paper](https://arxiv.org/abs/1907.01341v3): + +>Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer +René Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, Vladlen Koltun + + +and our [preprint](https://arxiv.org/abs/2103.13413): + +> Vision Transformers for Dense Prediction +> René Ranftl, Alexey Bochkovskiy, Vladlen Koltun + + +MiDaS was trained on up to 12 datasets (ReDWeb, DIML, Movies, MegaDepth, WSVD, TartanAir, HRWSI, ApolloScape, BlendedMVS, IRS, KITTI, NYU Depth V2) with +multi-objective optimization. +The original model that was trained on 5 datasets (`MIX 5` in the paper) can be found [here](https://github.com/isl-org/MiDaS/releases/tag/v2). +The figure below shows an overview of the different MiDaS models; the bubble size scales with number of parameters. + +![](figures/Improvement_vs_FPS.png) + +### Setup + +1) Pick one or more models and download the corresponding weights to the `weights` folder: + +MiDaS 3.1 +- For highest quality: [dpt_beit_large_512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) +- For moderately less quality, but better speed-performance trade-off: [dpt_swin2_large_384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt) +- For embedded devices: [dpt_swin2_tiny_256](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt), [dpt_levit_224](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt) +- For inference on Intel CPUs, OpenVINO may be used for the small legacy model: openvino_midas_v21_small [.xml](https://github.com/isl-org/MiDaS/releases/download/v3_1/openvino_midas_v21_small_256.xml), [.bin](https://github.com/isl-org/MiDaS/releases/download/v3_1/openvino_midas_v21_small_256.bin) + +MiDaS 3.0: Legacy transformer models [dpt_large_384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt) and [dpt_hybrid_384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt) + +MiDaS 2.1: Legacy convolutional models [midas_v21_384](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt) and [midas_v21_small_256](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt) + +1) Set up dependencies: + + ```shell + conda env create -f environment.yaml + conda activate midas-py310 + ``` + +#### optional + +For the Next-ViT model, execute + +```shell +git submodule add https://github.com/isl-org/Next-ViT midas/external/next_vit +``` + +For the OpenVINO model, install + +```shell +pip install openvino +``` + +### Usage + +1) Place one or more input images in the folder `input`. + +2) Run the model with + + ```shell + python run.py --model_type --input_path input --output_path output + ``` + where `````` is chosen from [dpt_beit_large_512](#model_type), [dpt_beit_large_384](#model_type), + [dpt_beit_base_384](#model_type), [dpt_swin2_large_384](#model_type), [dpt_swin2_base_384](#model_type), + [dpt_swin2_tiny_256](#model_type), [dpt_swin_large_384](#model_type), [dpt_next_vit_large_384](#model_type), + [dpt_levit_224](#model_type), [dpt_large_384](#model_type), [dpt_hybrid_384](#model_type), + [midas_v21_384](#model_type), [midas_v21_small_256](#model_type), [openvino_midas_v21_small_256](#model_type). + +3) The resulting depth maps are written to the `output` folder. + +#### optional + +1) By default, the inference resizes the height of input images to the size of a model to fit into the encoder. 
This + size is given by the numbers in the model names of the [accuracy table](#accuracy). Some models do not only support a single + inference height but a range of different heights. Feel free to explore different heights by appending the extra + command line argument `--height`. Unsupported height values will throw an error. Note that using this argument may + decrease the model accuracy. +2) By default, the inference keeps the aspect ratio of input images when feeding them into the encoder if this is + supported by a model (all models except for Swin, Swin2, LeViT). In order to resize to a square resolution, + disregarding the aspect ratio while preserving the height, use the command line argument `--square`. + +#### via Camera + + If you want the input images to be grabbed from the camera and shown in a window, leave the input and output paths + away and choose a model type as shown above: + + ```shell + python run.py --model_type --side + ``` + + The argument `--side` is optional and causes both the input RGB image and the output depth map to be shown + side-by-side for comparison. + +#### via Docker + +1) Make sure you have installed Docker and the + [NVIDIA Docker runtime](https://github.com/NVIDIA/nvidia-docker/wiki/Installation-\(Native-GPU-Support\)). + +2) Build the Docker image: + + ```shell + docker build -t midas . + ``` + +3) Run inference: + + ```shell + docker run --rm --gpus all -v $PWD/input:/opt/MiDaS/input -v $PWD/output:/opt/MiDaS/output -v $PWD/weights:/opt/MiDaS/weights midas + ``` + + This command passes through all of your NVIDIA GPUs to the container, mounts the + `input` and `output` directories and then runs the inference. + +#### via PyTorch Hub + +The pretrained model is also available on [PyTorch Hub](https://pytorch.org/hub/intelisl_midas_v2/) + +#### via TensorFlow or ONNX + +See [README](https://github.com/isl-org/MiDaS/tree/master/tf) in the `tf` subdirectory. + +Currently only supports MiDaS v2.1. + + +#### via Mobile (iOS / Android) + +See [README](https://github.com/isl-org/MiDaS/tree/master/mobile) in the `mobile` subdirectory. + +#### via ROS1 (Robot Operating System) + +See [README](https://github.com/isl-org/MiDaS/tree/master/ros) in the `ros` subdirectory. + +Currently only supports MiDaS v2.1. DPT-based models to be added. + + +### Accuracy + +We provide a **zero-shot error** $\epsilon_d$ which is evaluated for 6 different datasets +(see [paper](https://arxiv.org/abs/1907.01341v3)). **Lower error values are better**. +$\color{green}{\textsf{Overall model quality is represented by the improvement}}$ ([Imp.](#improvement)) with respect to +MiDaS 3.0 DPTL-384. The models are grouped by the height used for inference, whereas the square training resolution is given by +the numbers in the model names. The table also shows the **number of parameters** (in millions) and the +**frames per second** for inference at the training resolution (for GPU RTX 3090): + +| MiDaS Model | DIW
+
+#### via Camera
+
+    If you want the input images to be grabbed from the camera and shown in a window, omit the input and output paths
+    and choose a model type as shown above:
+
+    ```shell
+    python run.py --model_type <model_type> --side
+    ```
+
+    The argument `--side` is optional and causes both the input RGB image and the output depth map to be shown
+    side-by-side for comparison.
+
+#### via Docker
+
+1) Make sure you have installed Docker and the
+   [NVIDIA Docker runtime](https://github.com/NVIDIA/nvidia-docker/wiki/Installation-\(Native-GPU-Support\)).
+
+2) Build the Docker image:
+
+    ```shell
+    docker build -t midas .
+    ```
+
+3) Run inference:
+
+    ```shell
+    docker run --rm --gpus all -v $PWD/input:/opt/MiDaS/input -v $PWD/output:/opt/MiDaS/output -v $PWD/weights:/opt/MiDaS/weights midas
+    ```
+
+    This command passes through all of your NVIDIA GPUs to the container, mounts the
+    `input` and `output` directories and then runs the inference.
+
+#### via PyTorch Hub
+
+The pretrained models are also available on [PyTorch Hub](https://pytorch.org/hub/intelisl_midas_v2/).
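+
+A minimal sketch of the PyTorch Hub route is shown below. The entry points `DPT_Large` and `transforms` follow the `hubconf.py` added elsewhere in this diff; the image path is a placeholder.
+
+```python
+import cv2
+import torch
+
+# Load a MiDaS model and its matching input transform from PyTorch Hub.
+midas = torch.hub.load("intel-isl/MiDaS", "DPT_Large")
+midas.eval()
+midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
+transform = midas_transforms.dpt_transform  # use small_transform for MiDaS_small
+
+# Placeholder input image; MiDaS expects RGB.
+img = cv2.cvtColor(cv2.imread("input/example.jpg"), cv2.COLOR_BGR2RGB)
+
+with torch.no_grad():
+    prediction = midas(transform(img))  # relative inverse depth, shape (1, H', W')
+    # Resize the prediction back to the original image resolution.
+    depth = torch.nn.functional.interpolate(
+        prediction.unsqueeze(1),
+        size=img.shape[:2],
+        mode="bicubic",
+        align_corners=False,
+    ).squeeze()
+```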
+
+#### via TensorFlow or ONNX
+
+See [README](https://github.com/isl-org/MiDaS/tree/master/tf) in the `tf` subdirectory.
+
+Currently, only MiDaS v2.1 is supported.
+
+#### via Mobile (iOS / Android)
+
+See [README](https://github.com/isl-org/MiDaS/tree/master/mobile) in the `mobile` subdirectory.
+
+#### via ROS1 (Robot Operating System)
+
+See [README](https://github.com/isl-org/MiDaS/tree/master/ros) in the `ros` subdirectory.
+
+Currently, only MiDaS v2.1 is supported; DPT-based models are yet to be added.
+
+### Accuracy
+
+We provide a **zero-shot error** $\epsilon_d$ which is evaluated for 6 different datasets
+(see [paper](https://arxiv.org/abs/1907.01341v3)). **Lower error values are better**.
+$\color{green}{\textsf{Overall model quality is represented by the improvement}}$ ([Imp.](#improvement)) with respect to
+MiDaS 3.0 DPTL-384. The models are grouped by the height used for inference, whereas the square training resolution is given by
+the numbers in the model names. The table also shows the **number of parameters** (in millions) and the
+**frames per second** for inference at the training resolution (for GPU RTX 3090):
+
+| MiDaS Model | DIW<br>WHDR | Eth3d<br>AbsRel | Sintel<br>AbsRel | TUM<br>δ1 | KITTI<br>δ1 | NYUv2<br>δ1 | $\color{green}{\textsf{Imp.}}$<br>% | Par.<br>M | FPS
  | +|-----------------------------------------------------------------------------------------------------------------------|-------------------------:|-----------------------------:|------------------------------:|-------------------------:|-------------------------:|-------------------------:|-------------------------------------------------:|----------------------:|--------------------------:| +| **Inference height 512** | | | | | | | | | | +| [v3.1 BEiTL-512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) | 0.1137 | 0.0659 | 0.2366 | **6.13** | 11.56* | **1.86*** | $\color{green}{\textsf{19}}$ | **345** | **5.7** | +| [v3.1 BEiTL-512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt)$\tiny{\square}$ | **0.1121** | **0.0614** | **0.2090** | 6.46 | **5.00*** | 1.90* | $\color{green}{\textsf{34}}$ | **345** | **5.7** | +| | | | | | | | | | | +| **Inference height 384** | | | | | | | | | | +| [v3.1 BEiTL-512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) | 0.1245 | 0.0681 | **0.2176** | **6.13** | 6.28* | **2.16*** | $\color{green}{\textsf{28}}$ | 345 | 12 | +| [v3.1 Swin2L-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt)$\tiny{\square}$ | 0.1106 | 0.0732 | 0.2442 | 8.87 | **5.84*** | 2.92* | $\color{green}{\textsf{22}}$ | 213 | 41 | +| [v3.1 Swin2B-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt)$\tiny{\square}$ | 0.1095 | 0.0790 | 0.2404 | 8.93 | 5.97* | 3.28* | $\color{green}{\textsf{22}}$ | 102 | 39 | +| [v3.1 SwinL-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin_large_384.pt)$\tiny{\square}$ | 0.1126 | 0.0853 | 0.2428 | 8.74 | 6.60* | 3.34* | $\color{green}{\textsf{17}}$ | 213 | 49 | +| [v3.1 BEiTL-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt) | 0.1239 | **0.0667** | 0.2545 | 7.17 | 9.84* | 2.21* | $\color{green}{\textsf{17}}$ | 344 | 13 | +| [v3.1 Next-ViTL-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_next_vit_large_384.pt) | **0.1031** | 0.0954 | 0.2295 | 9.21 | 6.89* | 3.47* | $\color{green}{\textsf{16}}$ | **72** | 30 | +| [v3.1 BEiTB-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt) | 0.1159 | 0.0967 | 0.2901 | 9.88 | 26.60* | 3.91* | $\color{green}{\textsf{-31}}$ | 112 | 31 | +| [v3.0 DPTL-384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt) | 0.1082 | 0.0888 | 0.2697 | 9.97 | 8.46 | 8.32 | $\color{green}{\textsf{0}}$ | 344 | **61** | +| [v3.0 DPTH-384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt) | 0.1106 | 0.0934 | 0.2741 | 10.89 | 11.56 | 8.69 | $\color{green}{\textsf{-10}}$ | 123 | 50 | +| [v2.1 Large384](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt) | 0.1295 | 0.1155 | 0.3285 | 12.51 | 16.08 | 8.71 | $\color{green}{\textsf{-32}}$ | 105 | 47 | +| | | | | | | | | | | +| **Inference height 256** | | | | | | | | | | +| [v3.1 Swin2T-256](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt)$\tiny{\square}$ | **0.1211** | **0.1106** | **0.2868** | **13.43** | **10.13*** | **5.55*** | $\color{green}{\textsf{-11}}$ | 42 | 64 | +| [v2.1 Small256](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt) | 0.1344 | 0.1344 | 0.3370 | 14.53 | 29.27 | 13.43 | $\color{green}{\textsf{-76}}$ | **21** | **90** | +| | | | | | | | | | | +| **Inference height 224** | | | | | | | | | | +| [v3.1 
LeViT224](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt)$\tiny{\square}$ | **0.1314** | **0.1206** | **0.3148** | **18.21** | **15.27*** | **8.64*** | $\color{green}{\textsf{-40}}$ | **51** | **73** | + +* No zero-shot error, because models are also trained on KITTI and NYU Depth V2\ +$\square$ Validation performed at **square resolution**, either because the transformer encoder backbone of a model +does not support non-square resolutions (Swin, Swin2, LeViT) or for comparison with these models. All other +validations keep the aspect ratio. A difference in resolution limits the comparability of the zero-shot error and the +improvement, because these quantities are averages over the pixels of an image and do not take into account the +advantage of more details due to a higher resolution.\ +Best values per column and same validation height in bold + +#### Improvement + +The improvement in the above table is defined as the relative zero-shot error with respect to MiDaS v3.0 +DPTL-384 and averaging over the datasets. So, if $\epsilon_d$ is the zero-shot error for dataset $d$, then +the $\color{green}{\textsf{improvement}}$ is given by $100(1-(1/6)\sum_d\epsilon_d/\epsilon_{d,\rm{DPT_{L-384}}})$%. + +Note that the improvements of 10% for MiDaS v2.0 → v2.1 and 21% for MiDaS v2.1 → v3.0 are not visible from the +improvement column (Imp.) in the table but would require an evaluation with respect to MiDaS v2.1 Large384 +and v2.0 Large384 respectively instead of v3.0 DPTL-384. + +### Depth map comparison + +Zoom in for better visibility +![](figures/Comparison.png) + +### Speed on Camera Feed + +Test configuration +- Windows 10 +- 11th Gen Intel Core i7-1185G7 3.00GHz +- 16GB RAM +- Camera resolution 640x480 +- openvino_midas_v21_small_256 + +Speed: 22 FPS + +### Changelog + +* [Dec 2022] Released MiDaS v3.1: + - New models based on 5 different types of transformers ([BEiT](https://arxiv.org/pdf/2106.08254.pdf), [Swin2](https://arxiv.org/pdf/2111.09883.pdf), [Swin](https://arxiv.org/pdf/2103.14030.pdf), [Next-ViT](https://arxiv.org/pdf/2207.05501.pdf), [LeViT](https://arxiv.org/pdf/2104.01136.pdf)) + - Training datasets extended from 10 to 12, including also KITTI and NYU Depth V2 using [BTS](https://github.com/cleinc/bts) split + - Best model, BEiTLarge 512, with resolution 512x512, is on average about [28% more accurate](#Accuracy) than MiDaS v3.0 + - Integrated live depth estimation from camera feed +* [Sep 2021] Integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See [Gradio Web Demo](https://huggingface.co/spaces/akhaliq/DPT-Large). +* [Apr 2021] Released MiDaS v3.0: + - New models based on [Dense Prediction Transformers](https://arxiv.org/abs/2103.13413) are on average [21% more accurate](#Accuracy) than MiDaS v2.1 + - Additional models can be found [here](https://github.com/isl-org/DPT) +* [Nov 2020] Released MiDaS v2.1: + - New model that was trained on 10 datasets and is on average about [10% more accurate](#Accuracy) than [MiDaS v2.0](https://github.com/isl-org/MiDaS/releases/tag/v2) + - New light-weight model that achieves [real-time performance](https://github.com/isl-org/MiDaS/tree/master/mobile) on mobile platforms. 
+ - Sample applications for [iOS](https://github.com/isl-org/MiDaS/tree/master/mobile/ios) and [Android](https://github.com/isl-org/MiDaS/tree/master/mobile/android) + - [ROS package](https://github.com/isl-org/MiDaS/tree/master/ros) for easy deployment on robots +* [Jul 2020] Added TensorFlow and ONNX code. Added [online demo](http://35.202.76.57/). +* [Dec 2019] Released new version of MiDaS - the new model is significantly more accurate and robust +* [Jul 2019] Initial release of MiDaS ([Link](https://github.com/isl-org/MiDaS/releases/tag/v1)) + +### Citation + +Please cite our paper if you use this code or any of the models: +``` +@ARTICLE {Ranftl2022, + author = "Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun", + title = "Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-Shot Cross-Dataset Transfer", + journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence", + year = "2022", + volume = "44", + number = "3" +} +``` + +If you use a DPT-based model, please also cite: + +``` +@article{Ranftl2021, + author = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun}, + title = {Vision Transformers for Dense Prediction}, + journal = {ICCV}, + year = {2021}, +} +``` + +### Acknowledgements + +Our work builds on and uses code from [timm](https://github.com/rwightman/pytorch-image-models) and [Next-ViT](https://github.com/bytedance/Next-ViT). +We'd like to thank the authors for making these libraries available. + +### License + +MIT License diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/__pycache__/hubconf.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/__pycache__/hubconf.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c85002bea2c398167442d6c5e7e851e4ec8fa183 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/__pycache__/hubconf.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/__pycache__/hubconf.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/__pycache__/hubconf.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d56ff99ae59faa6ce5d3a65c334c7c4ec425b114 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/__pycache__/hubconf.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/environment.yaml b/annotator/zoe/zoedepth/models/base_models/midas_repo/environment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b9abe5693b9e0de56b7d20728f4d0e6333c5822d --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/environment.yaml @@ -0,0 +1,16 @@ +name: midas-py310 +channels: + - pytorch + - defaults +dependencies: + - nvidia::cudatoolkit=11.7 + - python=3.10.8 + - pytorch::pytorch=1.13.0 + - torchvision=0.14.0 + - pip=22.3.1 + - numpy=1.23.4 + - pip: + - opencv-python==4.6.0.66 + - imutils==0.5.4 + - timm==0.6.12 + - einops==0.6.0 \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/hubconf.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/hubconf.py new file mode 100644 index 0000000000000000000000000000000000000000..0d638be5151c4e305daff0c47d1ea3fc8066377d --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/hubconf.py @@ -0,0 +1,435 @@ +dependencies = ["torch"] + +import torch + +from midas.dpt_depth import DPTDepthModel +from midas.midas_net 
import MidasNet +from midas.midas_net_custom import MidasNet_small + +def DPT_BEiT_L_512(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_BEiT_L_512 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="beitl16_512", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_BEiT_L_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_BEiT_L_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="beitl16_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_BEiT_B_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_BEiT_B_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="beitb16_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_SwinV2_L_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_SwinV2_L_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="swin2l24_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_SwinV2_B_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_SwinV2_B_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="swin2b24_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_SwinV2_T_256(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_SwinV2_T_256 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="swin2t16_256", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + 
"https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_Swin_L_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_Swin_L_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="swinl12_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin_large_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_Next_ViT_L_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_Next_ViT_L_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="next_vit_large_6m", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_next_vit_large_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_LeViT_224(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_LeViT_224 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="levit_384", + non_negative=True, + head_features_1=64, + head_features_2=8, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_Large(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT-Large model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="vitl16_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_Hybrid(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT-Hybrid model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="vitb_rn50_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def MiDaS(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS v2.1 model for monocular depth estimation + pretrained (bool): load 
pretrained weights into model + """ + + model = MidasNet() + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def MiDaS_small(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS v2.1 small model for monocular depth estimation on resource-constrained devices + pretrained (bool): load pretrained weights into model + """ + + model = MidasNet_small(None, features=64, backbone="efficientnet_lite3", exportable=True, non_negative=True, blocks={'expand': True}) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + + +def transforms(): + import cv2 + from torchvision.transforms import Compose + from midas.transforms import Resize, NormalizeImage, PrepareForNet + from midas import transforms + + transforms.default_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 384, + 384, + resize_target=None, + keep_aspect_ratio=True, + ensure_multiple_of=32, + resize_method="upper_bound", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.small_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 256, + 256, + resize_target=None, + keep_aspect_ratio=True, + ensure_multiple_of=32, + resize_method="upper_bound", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.dpt_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 384, + 384, + resize_target=None, + keep_aspect_ratio=True, + ensure_multiple_of=32, + resize_method="minimal", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.beit512_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 512, + 512, + resize_target=None, + keep_aspect_ratio=True, + ensure_multiple_of=32, + resize_method="minimal", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.swin384_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 384, + 384, + resize_target=None, + keep_aspect_ratio=False, + ensure_multiple_of=32, + resize_method="minimal", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.swin256_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 256, + 256, + resize_target=None, + keep_aspect_ratio=False, + ensure_multiple_of=32, + resize_method="minimal", + 
image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.levit_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 224, + 224, + resize_target=None, + keep_aspect_ratio=False, + ensure_multiple_of=32, + resize_method="minimal", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + return transforms diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/base_model.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/base_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fa61645d89d86e68eec0c2bed11d0159ba07a32a Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/base_model.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/base_model.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/base_model.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef00688a031186c062604bb61ef83f2b653f03ac Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/base_model.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/blocks.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/blocks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4866684fa56f4718e67bc5fbbe4220b937aee3e1 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/blocks.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/blocks.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/blocks.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca11489fb1efdf485c7d3fe8e5b687ab25e1073e Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/blocks.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/dpt_depth.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/dpt_depth.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..693396d6909e1849c0e761f44dfcebfbee778eb6 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/dpt_depth.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/dpt_depth.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/dpt_depth.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..474316e9a791e8a8444541b171d6a90d09a8adf7 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/dpt_depth.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net.cpython-310.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..8d7a0de5dc6a7ce0a56081b1ff5752d3ad6b4d7d Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..503488fd76d58b2161ff0b8b76d0de18db2f7b0b Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net_custom.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net_custom.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40a8b72a3036c7310a70cbc7161f91e5087d915d Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net_custom.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net_custom.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net_custom.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24bbed4e7db7f7e7a1f6f85923b2efbc4ebcd362 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/__pycache__/midas_net_custom.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/beit.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/beit.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f61763264648984c4326f1254a140476a94f9d8b Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/beit.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/beit.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/beit.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cda8d9715e6cd328d0d4212cbe408eb93e43543a Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/beit.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/levit.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/levit.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2844d75ee7553d96e772e228c43ff0fe8232db35 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/levit.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/levit.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/levit.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1b2de6b170bfb6d10afaf47aa0514fa1329c500 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/levit.cpython-38.pyc differ diff --git 
a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24834a284bd2faf7d10b9bdd10d7ae397ad2e4c5 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7a57e14060bd8437ecbc4bae4ab6e43df5abd4bc Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin2.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2cf951be942fbe532210a69d692a10403f0c3ea2 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin2.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin2.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin2.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af1f19c59533e4a3e439b989993e5c65bc29d7a5 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin2.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin_common.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin_common.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..051e29a4133f5782143c082c48b9bb4d2b3b79f5 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin_common.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin_common.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin_common.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..941cc590ca288bec11134ae101c292d5d086f955 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/swin_common.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/utils.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0d8022c96db232d019833ae3700c92e097f1393 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/utils.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/utils.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/utils.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..311065874edacc1312a3ba2b8e0596b1905bbf1a Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/utils.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/vit.cpython-310.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/vit.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6674935b9eb5388abf3e592d225c8d9ff66fb7f0 Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/vit.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/vit.cpython-38.pyc b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/vit.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b918e48ad7ece3905c12469d6433afcd848f34ab Binary files /dev/null and b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/__pycache__/vit.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/beit.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/beit.py new file mode 100644 index 0000000000000000000000000000000000000000..7a24e02cd2b979844bf638b46ac60949ee9ce691 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/beit.py @@ -0,0 +1,196 @@ +import timm +import torch +import types + +import numpy as np +import torch.nn.functional as F + +from .utils import forward_adapted_unflatten, make_backbone_default +from timm.models.beit import gen_relative_position_index +from torch.utils.checkpoint import checkpoint +from typing import Optional + + +def forward_beit(pretrained, x): + return forward_adapted_unflatten(pretrained, x, "forward_features") + + +def patch_embed_forward(self, x): + """ + Modification of timm.models.layers.patch_embed.py: PatchEmbed.forward to support arbitrary window sizes. + """ + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + return x + + +def _get_rel_pos_bias(self, window_size): + """ + Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes. 
+ """ + old_height = 2 * self.window_size[0] - 1 + old_width = 2 * self.window_size[1] - 1 + + new_height = 2 * window_size[0] - 1 + new_width = 2 * window_size[1] - 1 + + old_relative_position_bias_table = self.relative_position_bias_table + + old_num_relative_distance = self.num_relative_distance + new_num_relative_distance = new_height * new_width + 3 + + old_sub_table = old_relative_position_bias_table[:old_num_relative_distance - 3] + + old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2) + new_sub_table = F.interpolate(old_sub_table, size=(new_height, new_width), mode="bilinear") + new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1) + + new_relative_position_bias_table = torch.cat( + [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3:]]) + + key = str(window_size[1]) + "," + str(window_size[0]) + if key not in self.relative_position_indices.keys(): + self.relative_position_indices[key] = gen_relative_position_index(window_size) + + relative_position_bias = new_relative_position_bias_table[ + self.relative_position_indices[key].view(-1)].view( + window_size[0] * window_size[1] + 1, + window_size[0] * window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + return relative_position_bias.unsqueeze(0) + + +def attention_forward(self, x, resolution, shared_rel_pos_bias: Optional[torch.Tensor] = None): + """ + Modification of timm.models.beit.py: Attention.forward to support arbitrary window sizes. + """ + B, N, C = x.shape + + qkv_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias)) if self.q_bias is not None else None + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.relative_position_bias_table is not None: + window_size = tuple(np.array(resolution) // 16) + attn = attn + self._get_rel_pos_bias(window_size) + if shared_rel_pos_bias is not None: + attn = attn + shared_rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +def block_forward(self, x, resolution, shared_rel_pos_bias: Optional[torch.Tensor] = None): + """ + Modification of timm.models.beit.py: Block.forward to support arbitrary window sizes. + """ + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x), resolution, shared_rel_pos_bias=shared_rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), resolution, + shared_rel_pos_bias=shared_rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +def beit_forward_features(self, x): + """ + Modification of timm.models.beit.py: Beit.forward_features to support arbitrary window sizes. 
+ """ + resolution = x.shape[2:] + + x = self.patch_embed(x) + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for blk in self.blocks: + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias) + else: + x = blk(x, resolution, shared_rel_pos_bias=rel_pos_bias) + x = self.norm(x) + return x + + +def _make_beit_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[0, 4, 8, 11], + vit_features=768, + use_readout="ignore", + start_index=1, + start_index_readout=1, +): + backbone = make_backbone_default(model, features, size, hooks, vit_features, use_readout, start_index, + start_index_readout) + + backbone.model.patch_embed.forward = types.MethodType(patch_embed_forward, backbone.model.patch_embed) + backbone.model.forward_features = types.MethodType(beit_forward_features, backbone.model) + + for block in backbone.model.blocks: + attn = block.attn + attn._get_rel_pos_bias = types.MethodType(_get_rel_pos_bias, attn) + attn.forward = types.MethodType(attention_forward, attn) + attn.relative_position_indices = {} + + block.forward = types.MethodType(block_forward, block) + + return backbone + + +def _make_pretrained_beitl16_512(pretrained, use_readout="ignore", hooks=None): + model = timm.create_model("beit_large_patch16_512", pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks is None else hooks + + features = [256, 512, 1024, 1024] + + return _make_beit_backbone( + model, + features=features, + size=[512, 512], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + ) + + +def _make_pretrained_beitl16_384(pretrained, use_readout="ignore", hooks=None): + model = timm.create_model("beit_large_patch16_384", pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks is None else hooks + return _make_beit_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + ) + + +def _make_pretrained_beitb16_384(pretrained, use_readout="ignore", hooks=None): + model = timm.create_model("beit_base_patch16_384", pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_beit_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout, + ) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/levit.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/levit.py new file mode 100644 index 0000000000000000000000000000000000000000..6d023a98702a0451806d26f33f8bccf931814f10 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/levit.py @@ -0,0 +1,106 @@ +import timm +import torch +import torch.nn as nn +import numpy as np + +from .utils import activations, get_activation, Transpose + + +def forward_levit(pretrained, x): + pretrained.model.forward_features(x) + + layer_1 = pretrained.activations["1"] + layer_2 = pretrained.activations["2"] + layer_3 = pretrained.activations["3"] + + layer_1 = pretrained.act_postprocess1(layer_1) + layer_2 = pretrained.act_postprocess2(layer_2) + layer_3 = pretrained.act_postprocess3(layer_3) + + return layer_1, layer_2, layer_3 + + +def _make_levit_backbone( + model, + hooks=[3, 11, 21], + patch_grid=[14, 14] +): + pretrained = nn.Module() + + pretrained.model = model + 
pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) + pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) + + pretrained.activations = activations + + patch_grid_size = np.array(patch_grid, dtype=int) + + pretrained.act_postprocess1 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) + ) + pretrained.act_postprocess2 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 2).astype(int)).tolist())) + ) + pretrained.act_postprocess3 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 4).astype(int)).tolist())) + ) + + return pretrained + + +class ConvTransposeNorm(nn.Sequential): + """ + Modification of + https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: ConvNorm + such that ConvTranspose2d is used instead of Conv2d. + """ + + def __init__( + self, in_chs, out_chs, kernel_size=1, stride=1, pad=0, dilation=1, + groups=1, bn_weight_init=1): + super().__init__() + self.add_module('c', + nn.ConvTranspose2d(in_chs, out_chs, kernel_size, stride, pad, dilation, groups, bias=False)) + self.add_module('bn', nn.BatchNorm2d(out_chs)) + + nn.init.constant_(self.bn.weight, bn_weight_init) + + @torch.no_grad() + def fuse(self): + c, bn = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = c.weight * w[:, None, None, None] + b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 + m = nn.ConvTranspose2d( + w.size(1), w.size(0), w.shape[2:], stride=self.c.stride, + padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + +def stem_b4_transpose(in_chs, out_chs, activation): + """ + Modification of + https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: stem_b16 + such that ConvTranspose2d is used instead of Conv2d and stem is also reduced to the half. 
+ """ + return nn.Sequential( + ConvTransposeNorm(in_chs, out_chs, 3, 2, 1), + activation(), + ConvTransposeNorm(out_chs, out_chs // 2, 3, 2, 1), + activation()) + + +def _make_pretrained_levit_384(pretrained, hooks=None): + model = timm.create_model("levit_384", pretrained=pretrained) + + hooks = [3, 11, 21] if hooks == None else hooks + return _make_levit_backbone( + model, + hooks=hooks + ) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/next_vit.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/next_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..8afdd8b743b5ab023a359dc3b721e601b1a40d11 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/next_vit.py @@ -0,0 +1,39 @@ +import timm + +import torch.nn as nn + +from pathlib import Path +from .utils import activations, forward_default, get_activation + +from ..external.next_vit.classification.nextvit import * + + +def forward_next_vit(pretrained, x): + return forward_default(pretrained, x, "forward") + + +def _make_next_vit_backbone( + model, + hooks=[2, 6, 36, 39], +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.features[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.features[hooks[1]].register_forward_hook(get_activation("2")) + pretrained.model.features[hooks[2]].register_forward_hook(get_activation("3")) + pretrained.model.features[hooks[3]].register_forward_hook(get_activation("4")) + + pretrained.activations = activations + + return pretrained + + +def _make_pretrained_next_vit_large_6m(hooks=None): + model = timm.create_model("nextvit_large") + + hooks = [2, 6, 36, 39] if hooks == None else hooks + return _make_next_vit_backbone( + model, + hooks=hooks, + ) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c71367e3e78b087f80b2ab3e2f495a9c372f1a --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin.py @@ -0,0 +1,13 @@ +import timm + +from .swin_common import _make_swin_backbone + + +def _make_pretrained_swinl12_384(pretrained, hooks=None): + model = timm.create_model("swin_large_patch4_window12_384", pretrained=pretrained) + + hooks = [1, 1, 17, 1] if hooks == None else hooks + return _make_swin_backbone( + model, + hooks=hooks + ) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin2.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin2.py new file mode 100644 index 0000000000000000000000000000000000000000..ce4c8f1d6fc1807a207dc6b9a261c6f7b14a87a3 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin2.py @@ -0,0 +1,34 @@ +import timm + +from .swin_common import _make_swin_backbone + + +def _make_pretrained_swin2l24_384(pretrained, hooks=None): + model = timm.create_model("swinv2_large_window12to24_192to384_22kft1k", pretrained=pretrained) + + hooks = [1, 1, 17, 1] if hooks == None else hooks + return _make_swin_backbone( + model, + hooks=hooks + ) + + +def _make_pretrained_swin2b24_384(pretrained, hooks=None): + model = timm.create_model("swinv2_base_window12to24_192to384_22kft1k", pretrained=pretrained) + + hooks = [1, 1, 17, 1] if hooks == None else hooks + return _make_swin_backbone( + model, + hooks=hooks + ) + + +def 
_make_pretrained_swin2t16_256(pretrained, hooks=None): + model = timm.create_model("swinv2_tiny_window16_256", pretrained=pretrained) + + hooks = [1, 1, 5, 1] if hooks == None else hooks + return _make_swin_backbone( + model, + hooks=hooks, + patch_grid=[64, 64] + ) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin_common.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin_common.py new file mode 100644 index 0000000000000000000000000000000000000000..94d63d408f18511179d90b3ac6f697385d1e556d --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/swin_common.py @@ -0,0 +1,52 @@ +import torch + +import torch.nn as nn +import numpy as np + +from .utils import activations, forward_default, get_activation, Transpose + + +def forward_swin(pretrained, x): + return forward_default(pretrained, x) + + +def _make_swin_backbone( + model, + hooks=[1, 1, 17, 1], + patch_grid=[96, 96] +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.layers[0].blocks[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.layers[1].blocks[hooks[1]].register_forward_hook(get_activation("2")) + pretrained.model.layers[2].blocks[hooks[2]].register_forward_hook(get_activation("3")) + pretrained.model.layers[3].blocks[hooks[3]].register_forward_hook(get_activation("4")) + + pretrained.activations = activations + + if hasattr(model, "patch_grid"): + used_patch_grid = model.patch_grid + else: + used_patch_grid = patch_grid + + patch_grid_size = np.array(used_patch_grid, dtype=int) + + pretrained.act_postprocess1 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) + ) + pretrained.act_postprocess2 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size((patch_grid_size // 2).tolist())) + ) + pretrained.act_postprocess3 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size((patch_grid_size // 4).tolist())) + ) + pretrained.act_postprocess4 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size((patch_grid_size // 8).tolist())) + ) + + return pretrained diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/utils.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0558899dddcfccec5f01a764d4f21738eb612149 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/utils.py @@ -0,0 +1,249 @@ +import torch + +import torch.nn as nn + + +class Slice(nn.Module): + def __init__(self, start_index=1): + super(Slice, self).__init__() + self.start_index = start_index + + def forward(self, x): + return x[:, self.start_index:] + + +class AddReadout(nn.Module): + def __init__(self, start_index=1): + super(AddReadout, self).__init__() + self.start_index = start_index + + def forward(self, x): + if self.start_index == 2: + readout = (x[:, 0] + x[:, 1]) / 2 + else: + readout = x[:, 0] + return x[:, self.start_index:] + readout.unsqueeze(1) + + +class ProjectReadout(nn.Module): + def __init__(self, in_features, start_index=1): + super(ProjectReadout, self).__init__() + self.start_index = start_index + + self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU()) + + def forward(self, x): + readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:]) + features = torch.cat((x[:, self.start_index:], readout), -1) + + return self.project(features) + + +class 
Transpose(nn.Module): + def __init__(self, dim0, dim1): + super(Transpose, self).__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + x = x.transpose(self.dim0, self.dim1) + return x + + +activations = {} + + +def get_activation(name): + def hook(model, input, output): + activations[name] = output + + return hook + + +def forward_default(pretrained, x, function_name="forward_features"): + exec(f"pretrained.model.{function_name}(x)") + + layer_1 = pretrained.activations["1"] + layer_2 = pretrained.activations["2"] + layer_3 = pretrained.activations["3"] + layer_4 = pretrained.activations["4"] + + if hasattr(pretrained, "act_postprocess1"): + layer_1 = pretrained.act_postprocess1(layer_1) + if hasattr(pretrained, "act_postprocess2"): + layer_2 = pretrained.act_postprocess2(layer_2) + if hasattr(pretrained, "act_postprocess3"): + layer_3 = pretrained.act_postprocess3(layer_3) + if hasattr(pretrained, "act_postprocess4"): + layer_4 = pretrained.act_postprocess4(layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def forward_adapted_unflatten(pretrained, x, function_name="forward_features"): + b, c, h, w = x.shape + + exec(f"glob = pretrained.model.{function_name}(x)") + + layer_1 = pretrained.activations["1"] + layer_2 = pretrained.activations["2"] + layer_3 = pretrained.activations["3"] + layer_4 = pretrained.activations["4"] + + layer_1 = pretrained.act_postprocess1[0:2](layer_1) + layer_2 = pretrained.act_postprocess2[0:2](layer_2) + layer_3 = pretrained.act_postprocess3[0:2](layer_3) + layer_4 = pretrained.act_postprocess4[0:2](layer_4) + + unflatten = nn.Sequential( + nn.Unflatten( + 2, + torch.Size( + [ + h // pretrained.model.patch_size[1], + w // pretrained.model.patch_size[0], + ] + ), + ) + ) + + if layer_1.ndim == 3: + layer_1 = unflatten(layer_1) + if layer_2.ndim == 3: + layer_2 = unflatten(layer_2) + if layer_3.ndim == 3: + layer_3 = unflatten(layer_3) + if layer_4.ndim == 3: + layer_4 = unflatten(layer_4) + + layer_1 = pretrained.act_postprocess1[3: len(pretrained.act_postprocess1)](layer_1) + layer_2 = pretrained.act_postprocess2[3: len(pretrained.act_postprocess2)](layer_2) + layer_3 = pretrained.act_postprocess3[3: len(pretrained.act_postprocess3)](layer_3) + layer_4 = pretrained.act_postprocess4[3: len(pretrained.act_postprocess4)](layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def get_readout_oper(vit_features, features, use_readout, start_index=1): + if use_readout == "ignore": + readout_oper = [Slice(start_index)] * len(features) + elif use_readout == "add": + readout_oper = [AddReadout(start_index)] * len(features) + elif use_readout == "project": + readout_oper = [ + ProjectReadout(vit_features, start_index) for out_feat in features + ] + else: + assert ( + False + ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" + + return readout_oper + + +def make_backbone_default( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + vit_features=768, + use_readout="ignore", + start_index=1, + start_index_readout=1, +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) + pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) + pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) + + pretrained.activations = activations + + readout_oper = 
get_readout_oper(vit_features, features, use_readout, start_index_readout) + + # 32, 48, 136, 384 + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + return pretrained diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/vit.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..413f9693bd4548342280e329c9128c1a52cea920 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/backbones/vit.py @@ -0,0 +1,221 @@ +import torch +import torch.nn as nn +import timm +import types +import math +import torch.nn.functional as F + +from .utils import (activations, forward_adapted_unflatten, get_activation, get_readout_oper, + make_backbone_default, Transpose) + + +def forward_vit(pretrained, x): + return forward_adapted_unflatten(pretrained, x, "forward_flex") + + +def _resize_pos_embed(self, posemb, gs_h, gs_w): + posemb_tok, posemb_grid = ( + posemb[:, : self.start_index], + posemb[0, self.start_index:], + ) + + gs_old = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear") + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + +def forward_flex(self, x): + b, c, h, w = x.shape + + pos_embed = self._resize_pos_embed( + self.pos_embed, h // self.patch_size[1], w // self.patch_size[0] + ) + + B = x.shape[0] + + if hasattr(self.patch_embed, "backbone"): + x = self.patch_embed.backbone(x) + if isinstance(x, (list, tuple)): + x = x[-1] # last feature if backbone outputs list/tuple of features + + x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) + + if getattr(self, "dist_token", None) is not None: + cls_tokens = self.cls_token.expand( + B, -1, -1 + ) # stole cls_tokens impl from Phil Wang, 
thanks + dist_token = self.dist_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, dist_token, x), dim=1) + else: + if self.no_embed_class: + x = x + pos_embed + cls_tokens = self.cls_token.expand( + B, -1, -1 + ) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + + if not self.no_embed_class: + x = x + pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + + return x + + +def _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + vit_features=768, + use_readout="ignore", + start_index=1, + start_index_readout=1, +): + pretrained = make_backbone_default(model, features, size, hooks, vit_features, use_readout, start_index, + start_index_readout) + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model + ) + + return pretrained + + +def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None): + model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks == None else hooks + return _make_vit_b16_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + ) + + +def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None): + model = timm.create_model("vit_base_patch16_384", pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks == None else hooks + return _make_vit_b16_backbone( + model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout + ) + + +def _make_vit_b_rn50_backbone( + model, + features=[256, 512, 768, 768], + size=[384, 384], + hooks=[0, 1, 8, 11], + vit_features=768, + patch_size=[16, 16], + number_stages=2, + use_vit_only=False, + use_readout="ignore", + start_index=1, +): + pretrained = nn.Module() + + pretrained.model = model + + used_number_stages = 0 if use_vit_only else number_stages + for s in range(used_number_stages): + pretrained.model.patch_embed.backbone.stages[s].register_forward_hook( + get_activation(str(s + 1)) + ) + for s in range(used_number_stages, 4): + pretrained.model.blocks[hooks[s]].register_forward_hook(get_activation(str(s + 1))) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) + + for s in range(used_number_stages): + value = nn.Sequential(nn.Identity(), nn.Identity(), nn.Identity()) + exec(f"pretrained.act_postprocess{s + 1}=value") + for s in range(used_number_stages, 4): + if s < number_stages: + final_layer = nn.ConvTranspose2d( + in_channels=features[s], + out_channels=features[s], + kernel_size=4 // (2 ** s), + stride=4 // (2 ** s), + padding=0, + bias=True, + dilation=1, + groups=1, + ) + elif s > number_stages: + final_layer = nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ) + else: + final_layer = None + + layers = [ + readout_oper[s], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[s], + kernel_size=1, + stride=1, + padding=0, + ), + ] + if final_layer is not None: + layers.append(final_layer) + + value = nn.Sequential(*layers) + 
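+ # Note: the attribute name (act_postprocess1..act_postprocess4) is built dynamically from the stage index, so the assignment below goes through exec() rather than a plain attribute access.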
exec(f"pretrained.act_postprocess{s + 1}=value") + + pretrained.model.start_index = start_index + pretrained.model.patch_size = patch_size + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model + ) + + return pretrained + + +def _make_pretrained_vitb_rn50_384( + pretrained, use_readout="ignore", hooks=None, use_vit_only=False +): + model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained) + + hooks = [0, 1, 8, 11] if hooks == None else hooks + return _make_vit_b_rn50_backbone( + model, + features=[256, 512, 768, 768], + size=[384, 384], + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/base_model.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..5cf430239b47ec5ec07531263f26f5c24a2311cd --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/base_model.py @@ -0,0 +1,16 @@ +import torch + + +class BaseModel(torch.nn.Module): + def load(self, path): + """Load model from file. + + Args: + path (str): file path + """ + parameters = torch.load(path, map_location=torch.device('cpu')) + + if "optimizer" in parameters: + parameters = parameters["model"] + + self.load_state_dict(parameters) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/blocks.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..6d87a00680bb6ed9a6d7c3043ea30a1e90361794 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/blocks.py @@ -0,0 +1,439 @@ +import torch +import torch.nn as nn + +from .backbones.beit import ( + _make_pretrained_beitl16_512, + _make_pretrained_beitl16_384, + _make_pretrained_beitb16_384, + forward_beit, +) +from .backbones.swin_common import ( + forward_swin, +) +from .backbones.swin2 import ( + _make_pretrained_swin2l24_384, + _make_pretrained_swin2b24_384, + _make_pretrained_swin2t16_256, +) +from .backbones.swin import ( + _make_pretrained_swinl12_384, +) +from .backbones.levit import ( + _make_pretrained_levit_384, + forward_levit, +) +from .backbones.vit import ( + _make_pretrained_vitb_rn50_384, + _make_pretrained_vitl16_384, + _make_pretrained_vitb16_384, + forward_vit, +) + +def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, + use_vit_only=False, use_readout="ignore", in_features=[96, 256, 512, 1024]): + if backbone == "beitl16_512": + pretrained = _make_pretrained_beitl16_512( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, expand=expand + ) # BEiT_512-L (backbone) + elif backbone == "beitl16_384": + pretrained = _make_pretrained_beitl16_384( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, expand=expand + ) # BEiT_384-L (backbone) + elif backbone == 
"beitb16_384": + pretrained = _make_pretrained_beitb16_384( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [96, 192, 384, 768], features, groups=groups, expand=expand + ) # BEiT_384-B (backbone) + elif backbone == "swin2l24_384": + pretrained = _make_pretrained_swin2l24_384( + use_pretrained, hooks=hooks + ) + scratch = _make_scratch( + [192, 384, 768, 1536], features, groups=groups, expand=expand + ) # Swin2-L/12to24 (backbone) + elif backbone == "swin2b24_384": + pretrained = _make_pretrained_swin2b24_384( + use_pretrained, hooks=hooks + ) + scratch = _make_scratch( + [128, 256, 512, 1024], features, groups=groups, expand=expand + ) # Swin2-B/12to24 (backbone) + elif backbone == "swin2t16_256": + pretrained = _make_pretrained_swin2t16_256( + use_pretrained, hooks=hooks + ) + scratch = _make_scratch( + [96, 192, 384, 768], features, groups=groups, expand=expand + ) # Swin2-T/16 (backbone) + elif backbone == "swinl12_384": + pretrained = _make_pretrained_swinl12_384( + use_pretrained, hooks=hooks + ) + scratch = _make_scratch( + [192, 384, 768, 1536], features, groups=groups, expand=expand + ) # Swin-L/12 (backbone) + elif backbone == "next_vit_large_6m": + from .backbones.next_vit import _make_pretrained_next_vit_large_6m + pretrained = _make_pretrained_next_vit_large_6m(hooks=hooks) + scratch = _make_scratch( + in_features, features, groups=groups, expand=expand + ) # Next-ViT-L on ImageNet-1K-6M (backbone) + elif backbone == "levit_384": + pretrained = _make_pretrained_levit_384( + use_pretrained, hooks=hooks + ) + scratch = _make_scratch( + [384, 512, 768], features, groups=groups, expand=expand + ) # LeViT 384 (backbone) + elif backbone == "vitl16_384": + pretrained = _make_pretrained_vitl16_384( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, expand=expand + ) # ViT-L/16 - 85.0% Top1 (backbone) + elif backbone == "vitb_rn50_384": + pretrained = _make_pretrained_vitb_rn50_384( + use_pretrained, + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) + scratch = _make_scratch( + [256, 512, 768, 768], features, groups=groups, expand=expand + ) # ViT-H/16 - 85.0% Top1 (backbone) + elif backbone == "vitb16_384": + pretrained = _make_pretrained_vitb16_384( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [96, 192, 384, 768], features, groups=groups, expand=expand + ) # ViT-B/16 - 84.6% Top1 (backbone) + elif backbone == "resnext101_wsl": + pretrained = _make_pretrained_resnext101_wsl(use_pretrained) + scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3 + elif backbone == "efficientnet_lite3": + pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable) + scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3 + else: + print(f"Backbone '{backbone}' not implemented") + assert False + + return pretrained, scratch + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + if len(in_shape) >= 4: + out_shape4 = out_shape + + if expand: + out_shape1 = out_shape + out_shape2 = out_shape*2 + out_shape3 = out_shape*4 + if len(in_shape) >= 4: + out_shape4 = out_shape*8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, 
bias=False, groups=groups + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + if len(in_shape) >= 4: + scratch.layer4_rn = nn.Conv2d( + in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + + return scratch + + +def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False): + efficientnet = torch.hub.load( + "rwightman/gen-efficientnet-pytorch", + "tf_efficientnet_lite3", + pretrained=use_pretrained, + exportable=exportable + ) + return _make_efficientnet_backbone(efficientnet) + + +def _make_efficientnet_backbone(effnet): + pretrained = nn.Module() + + pretrained.layer1 = nn.Sequential( + effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2] + ) + pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) + pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) + pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) + + return pretrained + + +def _make_resnet_backbone(resnet): + pretrained = nn.Module() + pretrained.layer1 = nn.Sequential( + resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1 + ) + + pretrained.layer2 = resnet.layer2 + pretrained.layer3 = resnet.layer3 + pretrained.layer4 = resnet.layer4 + + return pretrained + + +def _make_pretrained_resnext101_wsl(use_pretrained): + resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl") + return _make_resnet_backbone(resnet) + + + +class Interpolate(nn.Module): + """Interpolation module. + """ + + def __init__(self, scale_factor, mode, align_corners=False): + """Init. + + Args: + scale_factor (float): scaling + mode (str): interpolation mode + """ + super(Interpolate, self).__init__() + + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: interpolated data + """ + + x = self.interp( + x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners + ) + + return x + + +class ResidualConvUnit(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True + ) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True + ) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + out = self.relu(x) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + return out + x + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block. + """ + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.resConfUnit1 = ResidualConvUnit(features) + self.resConfUnit2 = ResidualConvUnit(features) + + def forward(self, *xs): + """Forward pass. 
+ + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + output += self.resConfUnit1(xs[1]) + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, scale_factor=2, mode="bilinear", align_corners=True + ) + + return output + + + + +class ResidualConvUnit_custom(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features, activation, bn): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups=1 + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups + ) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups + ) + + if self.bn==True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn==True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn==True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + # return out + x + + +class FeatureFusionBlock_custom(nn.Module): + """Feature fusion block. + """ + + def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock_custom, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups=1 + + self.expand = expand + out_features = features + if self.expand==True: + out_features = features//2 + + self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) + + self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + self.size=size + + def forward(self, *xs, size=None): + """Forward pass. 
+ + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + # output += res + + output = self.resConfUnit2(output) + + if (size is None) and (self.size is None): + modifier = {"scale_factor": 2} + elif size is None: + modifier = {"size": self.size} + else: + modifier = {"size": size} + + output = nn.functional.interpolate( + output, **modifier, mode="bilinear", align_corners=self.align_corners + ) + + output = self.out_conv(output) + + return output + diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/dpt_depth.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/dpt_depth.py new file mode 100644 index 0000000000000000000000000000000000000000..3129d09cb43a7c79b23916236991fabbedb78f55 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/dpt_depth.py @@ -0,0 +1,166 @@ +import torch +import torch.nn as nn + +from .base_model import BaseModel +from .blocks import ( + FeatureFusionBlock_custom, + Interpolate, + _make_encoder, + forward_beit, + forward_swin, + forward_levit, + forward_vit, +) +from .backbones.levit import stem_b4_transpose +from timm.models.layers import get_act_layer + + +def _make_fusion_block(features, use_bn, size = None): + return FeatureFusionBlock_custom( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + size=size, + ) + + +class DPT(BaseModel): + def __init__( + self, + head, + features=256, + backbone="vitb_rn50_384", + readout="project", + channels_last=False, + use_bn=False, + **kwargs + ): + + super(DPT, self).__init__() + + self.channels_last = channels_last + + # For the Swin, Swin 2, LeViT and Next-ViT Transformers, the hierarchical architectures prevent setting the + # hooks freely. Instead, the hooks have to be chosen according to the ranges specified in the comments. 
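+ # Each entry lists the backbone block/stage indices whose activations are captured via forward hooks; these become the feature maps (four in general, three for LeViT) that the scratch layers and refinenet fusion blocks consume.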
+ hooks = { + "beitl16_512": [5, 11, 17, 23], + "beitl16_384": [5, 11, 17, 23], + "beitb16_384": [2, 5, 8, 11], + "swin2l24_384": [1, 1, 17, 1], # Allowed ranges: [0, 1], [0, 1], [ 0, 17], [ 0, 1] + "swin2b24_384": [1, 1, 17, 1], # [0, 1], [0, 1], [ 0, 17], [ 0, 1] + "swin2t16_256": [1, 1, 5, 1], # [0, 1], [0, 1], [ 0, 5], [ 0, 1] + "swinl12_384": [1, 1, 17, 1], # [0, 1], [0, 1], [ 0, 17], [ 0, 1] + "next_vit_large_6m": [2, 6, 36, 39], # [0, 2], [3, 6], [ 7, 36], [37, 39] + "levit_384": [3, 11, 21], # [0, 3], [6, 11], [14, 21] + "vitb_rn50_384": [0, 1, 8, 11], + "vitb16_384": [2, 5, 8, 11], + "vitl16_384": [5, 11, 17, 23], + }[backbone] + + if "next_vit" in backbone: + in_features = { + "next_vit_large_6m": [96, 256, 512, 1024], + }[backbone] + else: + in_features = None + + # Instantiate backbone and reassemble blocks + self.pretrained, self.scratch = _make_encoder( + backbone, + features, + False, # Set to true of you want to train from scratch, uses ImageNet weights + groups=1, + expand=False, + exportable=False, + hooks=hooks, + use_readout=readout, + in_features=in_features, + ) + + self.number_layers = len(hooks) if hooks is not None else 4 + size_refinenet3 = None + self.scratch.stem_transpose = None + + if "beit" in backbone: + self.forward_transformer = forward_beit + elif "swin" in backbone: + self.forward_transformer = forward_swin + elif "next_vit" in backbone: + from .backbones.next_vit import forward_next_vit + self.forward_transformer = forward_next_vit + elif "levit" in backbone: + self.forward_transformer = forward_levit + size_refinenet3 = 7 + self.scratch.stem_transpose = stem_b4_transpose(256, 128, get_act_layer("hard_swish")) + else: + self.forward_transformer = forward_vit + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn, size_refinenet3) + if self.number_layers >= 4: + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + self.scratch.output_conv = head + + + def forward(self, x): + if self.channels_last == True: + x.contiguous(memory_format=torch.channels_last) + + layers = self.forward_transformer(self.pretrained, x) + if self.number_layers == 3: + layer_1, layer_2, layer_3 = layers + else: + layer_1, layer_2, layer_3, layer_4 = layers + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + if self.number_layers >= 4: + layer_4_rn = self.scratch.layer4_rn(layer_4) + + if self.number_layers == 3: + path_3 = self.scratch.refinenet3(layer_3_rn, size=layer_2_rn.shape[2:]) + else: + path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:]) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:]) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:]) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + if self.scratch.stem_transpose is not None: + path_1 = self.scratch.stem_transpose(path_1) + + out = self.scratch.output_conv(path_1) + + return out + + +class DPTDepthModel(DPT): + def __init__(self, path=None, non_negative=True, **kwargs): + features = kwargs["features"] if "features" in kwargs else 256 + head_features_1 = kwargs["head_features_1"] if "head_features_1" in kwargs else features + head_features_2 = kwargs["head_features_2"] if "head_features_2" in kwargs else 32 + kwargs.pop("head_features_1", None) + kwargs.pop("head_features_2", None) + + 
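+ # Depth head: a 3x3 conv halves the channels, 2x bilinear upsampling, a second 3x3 conv maps to head_features_2, and a final 1x1 conv produces a single-channel depth map; the trailing ReLU keeps the output non-negative when non_negative=True.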
head = nn.Sequential( + nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(True) if non_negative else nn.Identity(), + nn.Identity(), + ) + + super().__init__(head, **kwargs) + + if path is not None: + self.load(path) + + def forward(self, x): + return super().forward(x).squeeze(dim=1) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net.py new file mode 100644 index 0000000000000000000000000000000000000000..8a954977800b0a0f48807e80fa63041910e33c1f --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net.py @@ -0,0 +1,76 @@ +"""MidashNet: Network for monocular depth estimation trained by mixing several datasets. +This file contains code that is adapted from +https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py +""" +import torch +import torch.nn as nn + +from .base_model import BaseModel +from .blocks import FeatureFusionBlock, Interpolate, _make_encoder + + +class MidasNet(BaseModel): + """Network for monocular depth estimation. + """ + + def __init__(self, path=None, features=256, non_negative=True): + """Init. + + Args: + path (str, optional): Path to saved model. Defaults to None. + features (int, optional): Number of features. Defaults to 256. + backbone (str, optional): Backbone network for encoder. Defaults to resnet50 + """ + print("Loading weights: ", path) + + super(MidasNet, self).__init__() + + use_pretrained = False if path is None else True + + self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) + + self.scratch.refinenet4 = FeatureFusionBlock(features) + self.scratch.refinenet3 = FeatureFusionBlock(features) + self.scratch.refinenet2 = FeatureFusionBlock(features) + self.scratch.refinenet1 = FeatureFusionBlock(features) + + self.scratch.output_conv = nn.Sequential( + nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode="bilinear"), + nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(True) if non_negative else nn.Identity(), + ) + + if path: + self.load(path) + + def forward(self, x): + """Forward pass. 
+ + Args: + x (tensor): input data (image) + + Returns: + tensor: depth + """ + + layer_1 = self.pretrained.layer1(x) + layer_2 = self.pretrained.layer2(layer_1) + layer_3 = self.pretrained.layer3(layer_2) + layer_4 = self.pretrained.layer4(layer_3) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv(path_1) + + return torch.squeeze(out, dim=1) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net_custom.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net_custom.py new file mode 100644 index 0000000000000000000000000000000000000000..50e4acb5e53d5fabefe3dde16ab49c33c2b7797c --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/midas_net_custom.py @@ -0,0 +1,128 @@ +"""MidashNet: Network for monocular depth estimation trained by mixing several datasets. +This file contains code that is adapted from +https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py +""" +import torch +import torch.nn as nn + +from .base_model import BaseModel +from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder + + +class MidasNet_small(BaseModel): + """Network for monocular depth estimation. + """ + + def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True, + blocks={'expand': True}): + """Init. + + Args: + path (str, optional): Path to saved model. Defaults to None. + features (int, optional): Number of features. Defaults to 256. + backbone (str, optional): Backbone network for encoder. 
Defaults to resnet50 + """ + print("Loading weights: ", path) + + super(MidasNet_small, self).__init__() + + use_pretrained = False if path else True + + self.channels_last = channels_last + self.blocks = blocks + self.backbone = backbone + + self.groups = 1 + + features1=features + features2=features + features3=features + features4=features + self.expand = False + if "expand" in self.blocks and self.blocks['expand'] == True: + self.expand = True + features1=features + features2=features*2 + features3=features*4 + features4=features*8 + + self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable) + + self.scratch.activation = nn.ReLU(False) + + self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) + self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) + self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) + self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners) + + + self.scratch.output_conv = nn.Sequential( + nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups), + Interpolate(scale_factor=2, mode="bilinear"), + nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), + self.scratch.activation, + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(True) if non_negative else nn.Identity(), + nn.Identity(), + ) + + if path: + self.load(path) + + + def forward(self, x): + """Forward pass. 
+ + Args: + x (tensor): input data (image) + + Returns: + tensor: depth + """ + if self.channels_last==True: + print("self.channels_last = ", self.channels_last) + x.contiguous(memory_format=torch.channels_last) + + + layer_1 = self.pretrained.layer1(x) + layer_2 = self.pretrained.layer2(layer_1) + layer_3 = self.pretrained.layer3(layer_2) + layer_4 = self.pretrained.layer4(layer_3) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv(path_1) + + return torch.squeeze(out, dim=1) + + + +def fuse_model(m): + prev_previous_type = nn.Identity() + prev_previous_name = '' + previous_type = nn.Identity() + previous_name = '' + for name, module in m.named_modules(): + if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU: + # print("FUSED ", prev_previous_name, previous_name, name) + torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True) + elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d: + # print("FUSED ", prev_previous_name, previous_name) + torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True) + # elif previous_type == nn.Conv2d and type(module) == nn.ReLU: + # print("FUSED ", previous_name, name) + # torch.quantization.fuse_modules(m, [previous_name, name], inplace=True) + + prev_previous_type = previous_type + prev_previous_name = previous_name + previous_type = type(module) + previous_name = name \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/model_loader.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/model_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..f1cd1f2d43054bfd3d650587c7b2ed35f1347c9e --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/model_loader.py @@ -0,0 +1,242 @@ +import cv2 +import torch + +from midas.dpt_depth import DPTDepthModel +from midas.midas_net import MidasNet +from midas.midas_net_custom import MidasNet_small +from midas.transforms import Resize, NormalizeImage, PrepareForNet + +from torchvision.transforms import Compose + +default_models = { + "dpt_beit_large_512": "weights/dpt_beit_large_512.pt", + "dpt_beit_large_384": "weights/dpt_beit_large_384.pt", + "dpt_beit_base_384": "weights/dpt_beit_base_384.pt", + "dpt_swin2_large_384": "weights/dpt_swin2_large_384.pt", + "dpt_swin2_base_384": "weights/dpt_swin2_base_384.pt", + "dpt_swin2_tiny_256": "weights/dpt_swin2_tiny_256.pt", + "dpt_swin_large_384": "weights/dpt_swin_large_384.pt", + "dpt_next_vit_large_384": "weights/dpt_next_vit_large_384.pt", + "dpt_levit_224": "weights/dpt_levit_224.pt", + "dpt_large_384": "weights/dpt_large_384.pt", + "dpt_hybrid_384": "weights/dpt_hybrid_384.pt", + "midas_v21_384": "weights/midas_v21_384.pt", + "midas_v21_small_256": "weights/midas_v21_small_256.pt", + "openvino_midas_v21_small_256": "weights/openvino_midas_v21_small_256.xml", +} + + +def load_model(device, model_path, model_type="dpt_large_384", optimize=True, height=None, square=False): + """Load the specified network. 
+ + Args: + device (device): the torch device used + model_path (str): path to saved model + model_type (str): the type of the model to be loaded + optimize (bool): optimize the model to half-integer on CUDA? + height (int): inference encoder image height + square (bool): resize to a square resolution? + + Returns: + The loaded network, the transform which prepares images as input to the network and the dimensions of the + network input + """ + if "openvino" in model_type: + from openvino.runtime import Core + + keep_aspect_ratio = not square + + if model_type == "dpt_beit_large_512": + model = DPTDepthModel( + path=model_path, + backbone="beitl16_512", + non_negative=True, + ) + net_w, net_h = 512, 512 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_beit_large_384": + model = DPTDepthModel( + path=model_path, + backbone="beitl16_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_beit_base_384": + model = DPTDepthModel( + path=model_path, + backbone="beitb16_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_swin2_large_384": + model = DPTDepthModel( + path=model_path, + backbone="swin2l24_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + keep_aspect_ratio = False + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_swin2_base_384": + model = DPTDepthModel( + path=model_path, + backbone="swin2b24_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + keep_aspect_ratio = False + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_swin2_tiny_256": + model = DPTDepthModel( + path=model_path, + backbone="swin2t16_256", + non_negative=True, + ) + net_w, net_h = 256, 256 + keep_aspect_ratio = False + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_swin_large_384": + model = DPTDepthModel( + path=model_path, + backbone="swinl12_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + keep_aspect_ratio = False + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_next_vit_large_384": + model = DPTDepthModel( + path=model_path, + backbone="next_vit_large_6m", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + # We change the notation from dpt_levit_224 (MiDaS notation) to levit_384 (timm notation) here, where the 224 refers + # to the resolution 224x224 used by LeViT and 384 is the first entry of the embed_dim, see _cfg and model_cfgs of + # https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/levit.py + # (commit id: 927f031293a30afb940fff0bee34b85d9c059b0e) + elif model_type == "dpt_levit_224": + model = DPTDepthModel( + path=model_path, + backbone="levit_384", + non_negative=True, + head_features_1=64, + head_features_2=8, + ) + net_w, net_h = 224, 224 + keep_aspect_ratio = False + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == 
"dpt_large_384": + model = DPTDepthModel( + path=model_path, + backbone="vitl16_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_hybrid_384": + model = DPTDepthModel( + path=model_path, + backbone="vitb_rn50_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "midas_v21_384": + model = MidasNet(model_path, non_negative=True) + net_w, net_h = 384, 384 + resize_mode = "upper_bound" + normalization = NormalizeImage( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + elif model_type == "midas_v21_small_256": + model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True, + non_negative=True, blocks={'expand': True}) + net_w, net_h = 256, 256 + resize_mode = "upper_bound" + normalization = NormalizeImage( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + elif model_type == "openvino_midas_v21_small_256": + ie = Core() + uncompiled_model = ie.read_model(model=model_path) + model = ie.compile_model(uncompiled_model, "CPU") + net_w, net_h = 256, 256 + resize_mode = "upper_bound" + normalization = NormalizeImage( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + else: + print(f"model_type '{model_type}' not implemented, use: --model_type large") + assert False + + if not "openvino" in model_type: + print("Model loaded, number of parameters = {:.0f}M".format(sum(p.numel() for p in model.parameters()) / 1e6)) + else: + print("Model loaded, optimized with OpenVINO") + + if "openvino" in model_type: + keep_aspect_ratio = False + + if height is not None: + net_w, net_h = height, height + + transform = Compose( + [ + Resize( + net_w, + net_h, + resize_target=None, + keep_aspect_ratio=keep_aspect_ratio, + ensure_multiple_of=32, + resize_method=resize_mode, + image_interpolation_method=cv2.INTER_CUBIC, + ), + normalization, + PrepareForNet(), + ] + ) + + if not "openvino" in model_type: + model.eval() + + if optimize and (device == torch.device("cuda")): + if not "openvino" in model_type: + model = model.to(memory_format=torch.channels_last) + model = model.half() + else: + print("Error: OpenVINO models are already optimized. No optimization to half-float possible.") + exit() + + if not "openvino" in model_type: + model.to(device) + + return model, transform, net_w, net_h diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/transforms.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..350cbc11662633ad7f8968eb10be2e7de6e384e9 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/midas/transforms.py @@ -0,0 +1,234 @@ +import numpy as np +import cv2 +import math + + +def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): + """Rezise the sample to ensure the given size. Keeps aspect ratio. 
+ + Args: + sample (dict): sample + size (tuple): image size + + Returns: + tuple: new size + """ + shape = list(sample["disparity"].shape) + + if shape[0] >= size[0] and shape[1] >= size[1]: + return sample + + scale = [0, 0] + scale[0] = size[0] / shape[0] + scale[1] = size[1] / shape[1] + + scale = max(scale) + + shape[0] = math.ceil(scale * shape[0]) + shape[1] = math.ceil(scale * shape[1]) + + # resize + sample["image"] = cv2.resize( + sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method + ) + + sample["disparity"] = cv2.resize( + sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST + ) + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + tuple(shape[::-1]), + interpolation=cv2.INTER_NEAREST, + ) + sample["mask"] = sample["mask"].astype(bool) + + return tuple(shape) + + +class Resize(object): + """Resize sample to given size (width, height). + """ + + def __init__( + self, + width, + height, + resize_target=True, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resize_method="lower_bound", + image_interpolation_method=cv2.INTER_AREA, + ): + """Init. + + Args: + width (int): desired output width + height (int): desired output height + resize_target (bool, optional): + True: Resize the full sample (image, mask, target). + False: Resize image only. + Defaults to True. + keep_aspect_ratio (bool, optional): + True: Keep the aspect ratio of the input sample. + Output sample might not have the given width and height, and + resize behaviour depends on the parameter 'resize_method'. + Defaults to False. + ensure_multiple_of (int, optional): + Output width and height is constrained to be multiple of this parameter. + Defaults to 1. + resize_method (str, optional): + "lower_bound": Output will be at least as large as the given size. + "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) + "minimal": Scale as least as possible. (Output size might be smaller than given size.) + Defaults to "lower_bound". 
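+ image_interpolation_method (int, optional): OpenCV interpolation flag used when resizing the image (the disparity, depth and mask targets are always resized with nearest-neighbour). Defaults to cv2.INTER_AREA.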
+ """ + self.__width = width + self.__height = height + + self.__resize_target = resize_target + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + self.__image_interpolation_method = image_interpolation_method + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented" + ) + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, min_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, min_val=self.__width + ) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, max_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, max_val=self.__width + ) + elif self.__resize_method == "minimal": + new_height = self.constrain_to_multiple_of(scale_height * height) + new_width = self.constrain_to_multiple_of(scale_width * width) + else: + raise ValueError(f"resize_method {self.__resize_method} not implemented") + + return (new_width, new_height) + + def __call__(self, sample): + width, height = self.get_size( + sample["image"].shape[1], sample["image"].shape[0] + ) + + # resize sample + sample["image"] = cv2.resize( + sample["image"], + (width, height), + interpolation=self.__image_interpolation_method, + ) + + if self.__resize_target: + if "disparity" in sample: + sample["disparity"] = cv2.resize( + sample["disparity"], + (width, height), + interpolation=cv2.INTER_NEAREST, + ) + + if "depth" in sample: + sample["depth"] = cv2.resize( + sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST + ) + + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + (width, height), + interpolation=cv2.INTER_NEAREST, + ) + sample["mask"] = sample["mask"].astype(bool) + + return sample + + +class NormalizeImage(object): + """Normlize image by given mean and std. + """ + + def __init__(self, mean, std): + self.__mean = mean + self.__std = std + + def __call__(self, sample): + sample["image"] = (sample["image"] - self.__mean) / self.__std + + return sample + + +class PrepareForNet(object): + """Prepare sample for usage as network input. 
+ """ + + def __init__(self): + pass + + def __call__(self, sample): + image = np.transpose(sample["image"], (2, 0, 1)) + sample["image"] = np.ascontiguousarray(image).astype(np.float32) + + if "mask" in sample: + sample["mask"] = sample["mask"].astype(np.float32) + sample["mask"] = np.ascontiguousarray(sample["mask"]) + + if "disparity" in sample: + disparity = sample["disparity"].astype(np.float32) + sample["disparity"] = np.ascontiguousarray(disparity) + + if "depth" in sample: + depth = sample["depth"].astype(np.float32) + sample["depth"] = np.ascontiguousarray(depth) + + return sample diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/LICENSE b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..6606ec028d1c629986e7019fe3564f5b4bfe425d --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Alexey + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/README.md b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d43c2606767798ee46b34292e0483197424ec23 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/README.md @@ -0,0 +1,131 @@ +# MiDaS for ROS1 by using LibTorch in C++ + +### Requirements + +- Ubuntu 17.10 / 18.04 / 20.04, Debian Stretch +- ROS Melodic for Ubuntu (17.10 / 18.04) / Debian Stretch, ROS Noetic for Ubuntu 20.04 +- C++11 +- LibTorch >= 1.6 + +## Quick Start with a MiDaS Example + +MiDaS is a neural network to compute depth from a single image. 
+ +* input from `image_topic`: `sensor_msgs/Image` - `RGB8` image with any shape +* output to `midas_topic`: `sensor_msgs/Image` - `TYPE_32FC1` inverse relative depth maps in range [0 - 255] with original size and channels=1 + +### Install Dependencies + +* install ROS Melodic for Ubuntu 17.10 / 18.04: +```bash +wget https://raw.githubusercontent.com/isl-org/MiDaS/master/ros/additions/install_ros_melodic_ubuntu_17_18.sh +./install_ros_melodic_ubuntu_17_18.sh +``` + +or Noetic for Ubuntu 20.04: + +```bash +wget https://raw.githubusercontent.com/isl-org/MiDaS/master/ros/additions/install_ros_noetic_ubuntu_20.sh +./install_ros_noetic_ubuntu_20.sh +``` + + +* install LibTorch 1.7 with CUDA 11.0: + +On **Jetson (ARM)**: +```bash +wget https://nvidia.box.com/shared/static/wa34qwrwtk9njtyarwt5nvo6imenfy26.whl -O torch-1.7.0-cp36-cp36m-linux_aarch64.whl +sudo apt-get install python3-pip libopenblas-base libopenmpi-dev +pip3 install Cython +pip3 install numpy torch-1.7.0-cp36-cp36m-linux_aarch64.whl +``` +Or compile LibTorch from source: https://github.com/pytorch/pytorch#from-source + +On **Linux (x86_64)**: +```bash +cd ~/ +wget https://download.pytorch.org/libtorch/cu110/libtorch-cxx11-abi-shared-with-deps-1.7.0%2Bcu110.zip +unzip libtorch-cxx11-abi-shared-with-deps-1.7.0+cu110.zip +``` + +* create symlink for OpenCV: + +```bash +sudo ln -s /usr/include/opencv4 /usr/include/opencv +``` + +* download and install MiDaS: + +```bash +source ~/.bashrc +cd ~/ +mkdir catkin_ws +cd catkin_ws +git clone https://github.com/isl-org/MiDaS +mkdir src +cp -r MiDaS/ros/* src + +chmod +x src/additions/*.sh +chmod +x src/*.sh +chmod +x src/midas_cpp/scripts/*.py +cp src/additions/do_catkin_make.sh ./do_catkin_make.sh +./do_catkin_make.sh +./src/additions/downloads.sh +``` + +### Usage + +* run only `midas` node: `~/catkin_ws/src/launch_midas_cpp.sh` + +#### Test + +* Test - capture video and show result in the window: + * place any `test.mp4` video file in the directory `~/catkin_ws/src/` + * run `midas` node: `~/catkin_ws/src/launch_midas_cpp.sh` + * run test nodes in another terminal: `cd ~/catkin_ws/src && ./run_talker_listener_test.sh` and wait 30 seconds + + (to use Python 2, run the command `sed -i 's/python3/python2/' ~/catkin_ws/src/midas_cpp/scripts/*.py` ) + +## Mobile version of MiDaS - Monocular Depth Estimation + +### Accuracy + +* MiDaS v2 small - ResNet50 default-decoder 384x384 +* MiDaS v2.1 small - EfficientNet-Lite3 small-decoder 256x256 + +**Zero-shot error** (the lower - the better): + +| Model | DIW WHDR | Eth3d AbsRel | Sintel AbsRel | Kitti δ>1.25 | NyuDepthV2 δ>1.25 | TUM δ>1.25 | +|---|---|---|---|---|---|---| +| MiDaS v2 small 384x384 | **0.1248** | 0.1550 | **0.3300** | **21.81** | 15.73 | 17.00 | +| MiDaS v2.1 small 256x256 | 0.1344 | **0.1344** | 0.3370 | 29.27 | **13.43** | **14.53** | +| Relative improvement, % | -8 % | **+13 %** | -2 % | -34 % | **+15 %** | **+15 %** | + +None of the Train/Valid/Test subsets of the datasets (DIW, Eth3d, Sintel, Kitti, NyuDepthV2, TUM) were involved in training or fine-tuning. + +### Inference speed (FPS) on nVidia GPU + +Inference speed excluding pre- and post-processing, batch=1, **Frames Per Second** (the higher - the better): + +| Model | Jetson Nano, FPS | RTX 2080Ti, FPS | +|---|---|---| +| MiDaS v2 small 384x384 | 1.6 | 117 | +| MiDaS v2.1 small 256x256 | 8.1 | 232 | +| SpeedUp, X times | **5x** | **2x** | + +### Citation + +This repository contains code to compute depth from a single image.
It accompanies our [paper](https://arxiv.org/abs/1907.01341v3): + +>Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer +René Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, Vladlen Koltun + +Please cite our paper if you use this code or any of the models: +``` +@article{Ranftl2020, + author = {Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun}, + title = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)}, + year = {2020}, +} +``` diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/do_catkin_make.sh b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/do_catkin_make.sh new file mode 100644 index 0000000000000000000000000000000000000000..0d416fc00282aab146326bbba12a9274e1ba29b8 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/do_catkin_make.sh @@ -0,0 +1,5 @@ +mkdir src +catkin_make +source devel/setup.bash +echo $ROS_PACKAGE_PATH +chmod +x ./devel/setup.bash diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/downloads.sh b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/downloads.sh new file mode 100644 index 0000000000000000000000000000000000000000..9c967d4e2dc7997da26399a063b5a54ecc314eb1 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/downloads.sh @@ -0,0 +1,5 @@ +mkdir ~/.ros +wget https://github.com/isl-org/MiDaS/releases/download/v2_1/model-small-traced.pt +cp ./model-small-traced.pt ~/.ros/model-small-traced.pt + + diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_melodic_ubuntu_17_18.sh b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_melodic_ubuntu_17_18.sh new file mode 100644 index 0000000000000000000000000000000000000000..b868112631e9d9bc7bccb601407dfc857b8a99d5 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_melodic_ubuntu_17_18.sh @@ -0,0 +1,34 @@ +#@title { display-mode: "code" } + +#from http://wiki.ros.org/indigo/Installation/Ubuntu + +#1.2 Setup sources.list +sudo sh -c 'echo "deb http://packages.ros.org/ros/ubuntu $(lsb_release -sc) main" > /etc/apt/sources.list.d/ros-latest.list' + +# 1.3 Setup keys +sudo apt-key adv --keyserver 'hkp://keyserver.ubuntu.com:80' --recv-key C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654 +sudo apt-key adv --keyserver 'hkp://ha.pool.sks-keyservers.net:80' --recv-key 421C365BD9FF1F717815A3895523BAEEB01FA116 + +curl -sSL 'http://keyserver.ubuntu.com/pks/lookup?op=get&search=0xC1CF6E31E6BADE8868B172B4F42ED6FBAB17C654' | sudo apt-key add - + +# 1.4 Installation +sudo apt-get update +sudo apt-get upgrade + +# Desktop-Full Install: +sudo apt-get install ros-melodic-desktop-full + +printf "\nsource /opt/ros/melodic/setup.bash\n" >> ~/.bashrc + +# 1.5 Initialize rosdep +sudo rosdep init +rosdep update + + +# 1.7 Getting rosinstall (python) +sudo apt-get install python-rosinstall +sudo apt-get install python-catkin-tools +sudo apt-get install python-rospy +sudo apt-get install python-rosdep +sudo apt-get install python-roscd +sudo apt-get install python-pip \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_noetic_ubuntu_20.sh 
b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_noetic_ubuntu_20.sh new file mode 100644 index 0000000000000000000000000000000000000000..d73ea1a3d92359819167d735a92d2a650b9bc245 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/install_ros_noetic_ubuntu_20.sh @@ -0,0 +1,33 @@ +#@title { display-mode: "code" } + +#from http://wiki.ros.org/indigo/Installation/Ubuntu + +#1.2 Setup sources.list +sudo sh -c 'echo "deb http://packages.ros.org/ros/ubuntu $(lsb_release -sc) main" > /etc/apt/sources.list.d/ros-latest.list' + +# 1.3 Setup keys +sudo apt-key adv --keyserver 'hkp://keyserver.ubuntu.com:80' --recv-key C1CF6E31E6BADE8868B172B4F42ED6FBAB17C654 + +curl -sSL 'http://keyserver.ubuntu.com/pks/lookup?op=get&search=0xC1CF6E31E6BADE8868B172B4F42ED6FBAB17C654' | sudo apt-key add - + +# 1.4 Installation +sudo apt-get update +sudo apt-get upgrade + +# Desktop-Full Install: +sudo apt-get install ros-noetic-desktop-full + +printf "\nsource /opt/ros/noetic/setup.bash\n" >> ~/.bashrc + +# 1.5 Initialize rosdep +sudo rosdep init +rosdep update + + +# 1.7 Getting rosinstall (python) +sudo apt-get install python3-rosinstall +sudo apt-get install python3-catkin-tools +sudo apt-get install python3-rospy +sudo apt-get install python3-rosdep +sudo apt-get install python3-roscd +sudo apt-get install python3-pip \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/make_package_cpp.sh b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/make_package_cpp.sh new file mode 100644 index 0000000000000000000000000000000000000000..d0ef6073a9c9ce40744e1c81d557c1c68255b95e --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/additions/make_package_cpp.sh @@ -0,0 +1,16 @@ +cd ~/catkin_ws/src +catkin_create_pkg midas_cpp std_msgs roscpp cv_bridge sensor_msgs image_transport +cd ~/catkin_ws +catkin_make + +chmod +x ~/catkin_ws/devel/setup.bash +printf "\nsource ~/catkin_ws/devel/setup.bash" >> ~/.bashrc +source ~/catkin_ws/devel/setup.bash + + +sudo rosdep init +rosdep update +#rospack depends1 midas_cpp +roscd midas_cpp +#cat package.xml +#rospack depends midas_cpp \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/launch_midas_cpp.sh b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/launch_midas_cpp.sh new file mode 100644 index 0000000000000000000000000000000000000000..5a0d1583fffdc49216c625dfd07af2ae3b01a7a0 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/launch_midas_cpp.sh @@ -0,0 +1,2 @@ +source ~/catkin_ws/devel/setup.bash +roslaunch midas_cpp midas_cpp.launch model_name:="model-small-traced.pt" input_topic:="image_topic" output_topic:="midas_topic" out_orig_size:="true" \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/CMakeLists.txt b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..885341691d217f9c4c8fcb1e4ff568d87788c7b8 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/CMakeLists.txt @@ -0,0 +1,189 @@ +cmake_minimum_required(VERSION 3.0.2) +project(midas_cpp) + +## Compile as C++11, supported in ROS Kinetic and newer +# add_compile_options(-std=c++11) + +## Find catkin macros and libraries +## if COMPONENTS list like find_package(catkin REQUIRED COMPONENTS xyz) +## is 
used, also find other catkin packages +find_package(catkin REQUIRED COMPONENTS + cv_bridge + image_transport + roscpp + rospy + sensor_msgs + std_msgs +) + +## System dependencies are found with CMake's conventions +# find_package(Boost REQUIRED COMPONENTS system) + +list(APPEND CMAKE_PREFIX_PATH "~/libtorch") +list(APPEND CMAKE_PREFIX_PATH "/usr/local/lib/python3.6/dist-packages/torch/lib") +list(APPEND CMAKE_PREFIX_PATH "/usr/local/lib/python2.7/dist-packages/torch/lib") + +if(NOT EXISTS "~/libtorch") + if (EXISTS "/usr/local/lib/python3.6/dist-packages/torch") + include_directories(/usr/local/include) + include_directories(/usr/local/lib/python3.6/dist-packages/torch/include/torch/csrc/api/include) + include_directories(/usr/local/lib/python3.6/dist-packages/torch/include) + + link_directories(/usr/local/lib) + link_directories(/usr/local/lib/python3.6/dist-packages/torch/lib) + + set(CMAKE_PREFIX_PATH /usr/local/lib/python3.6/dist-packages/torch) + set(Boost_USE_MULTITHREADED ON) + set(Torch_DIR /usr/local/lib/python3.6/dist-packages/torch) + + elseif (EXISTS "/usr/local/lib/python2.7/dist-packages/torch") + + include_directories(/usr/local/include) + include_directories(/usr/local/lib/python2.7/dist-packages/torch/include/torch/csrc/api/include) + include_directories(/usr/local/lib/python2.7/dist-packages/torch/include) + + link_directories(/usr/local/lib) + link_directories(/usr/local/lib/python2.7/dist-packages/torch/lib) + + set(CMAKE_PREFIX_PATH /usr/local/lib/python2.7/dist-packages/torch) + set(Boost_USE_MULTITHREADED ON) + set(Torch_DIR /usr/local/lib/python2.7/dist-packages/torch) + endif() +endif() + + + +find_package(Torch REQUIRED) +find_package(OpenCV REQUIRED) +include_directories( ${OpenCV_INCLUDE_DIRS} ) + +add_executable(midas_cpp src/main.cpp) +target_link_libraries(midas_cpp "${TORCH_LIBRARIES}" "${OpenCV_LIBS} ${catkin_LIBRARIES}") +set_property(TARGET midas_cpp PROPERTY CXX_STANDARD 14) + + + +################################### +## catkin specific configuration ## +################################### +## The catkin_package macro generates cmake config files for your package +## Declare things to be passed to dependent projects +## INCLUDE_DIRS: uncomment this if your package contains header files +## LIBRARIES: libraries you create in this project that dependent projects also need +## CATKIN_DEPENDS: catkin_packages dependent projects also need +## DEPENDS: system dependencies of this project that dependent projects also need +catkin_package( +# INCLUDE_DIRS include +# LIBRARIES midas_cpp +# CATKIN_DEPENDS cv_bridge image_transport roscpp sensor_msgs std_msgs +# DEPENDS system_lib +) + +########### +## Build ## +########### + +## Specify additional locations of header files +## Your package locations should be listed before other locations +include_directories( +# include + ${catkin_INCLUDE_DIRS} +) + +## Declare a C++ library +# add_library(${PROJECT_NAME} +# src/${PROJECT_NAME}/midas_cpp.cpp +# ) + +## Add cmake target dependencies of the library +## as an example, code may need to be generated before libraries +## either from message generation or dynamic reconfigure +# add_dependencies(${PROJECT_NAME} ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS}) + +## Declare a C++ executable +## With catkin_make all packages are built within a single CMake context +## The recommended prefix ensures that target names across packages don't collide +# add_executable(${PROJECT_NAME}_node src/midas_cpp_node.cpp) + +## Rename C++ executable without prefix +## The 
above recommended prefix causes long target names, the following renames the +## target back to the shorter version for ease of user use +## e.g. "rosrun someones_pkg node" instead of "rosrun someones_pkg someones_pkg_node" +# set_target_properties(${PROJECT_NAME}_node PROPERTIES OUTPUT_NAME node PREFIX "") + +## Add cmake target dependencies of the executable +## same as for the library above +# add_dependencies(${PROJECT_NAME}_node ${${PROJECT_NAME}_EXPORTED_TARGETS} ${catkin_EXPORTED_TARGETS}) + +## Specify libraries to link a library or executable target against +# target_link_libraries(${PROJECT_NAME}_node +# ${catkin_LIBRARIES} +# ) + +############# +## Install ## +############# + +# all install targets should use catkin DESTINATION variables +# See http://ros.org/doc/api/catkin/html/adv_user_guide/variables.html + +## Mark executable scripts (Python etc.) for installation +## in contrast to setup.py, you can choose the destination +# catkin_install_python(PROGRAMS +# scripts/my_python_script +# DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} +# ) + +## Mark executables for installation +## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_executables.html +# install(TARGETS ${PROJECT_NAME}_node +# RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} +# ) + +## Mark libraries for installation +## See http://docs.ros.org/melodic/api/catkin/html/howto/format1/building_libraries.html +# install(TARGETS ${PROJECT_NAME} +# ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} +# LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} +# RUNTIME DESTINATION ${CATKIN_GLOBAL_BIN_DESTINATION} +# ) + +## Mark cpp header files for installation +# install(DIRECTORY include/${PROJECT_NAME}/ +# DESTINATION ${CATKIN_PACKAGE_INCLUDE_DESTINATION} +# FILES_MATCHING PATTERN "*.h" +# PATTERN ".svn" EXCLUDE +# ) + +## Mark other files for installation (e.g. launch and bag files, etc.) 
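+# As a concrete sketch (kept commented like the template below, since the helper
+# scripts under ros/ run everything from the devel space instead), the launch files
+# added for this package could be installed with:
+# install(FILES
+#   launch/midas_cpp.launch
+#   launch/midas_talker_listener.launch
+#   DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/launch
+# )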
+# install(FILES +# # myfile1 +# # myfile2 +# DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION} +# ) + +############# +## Testing ## +############# + +## Add gtest based cpp test target and link libraries +# catkin_add_gtest(${PROJECT_NAME}-test test/test_midas_cpp.cpp) +# if(TARGET ${PROJECT_NAME}-test) +# target_link_libraries(${PROJECT_NAME}-test ${PROJECT_NAME}) +# endif() + +## Add folders to be run by python nosetests +# catkin_add_nosetests(test) + +install(TARGETS ${PROJECT_NAME} + ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} + LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} + RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} +) + +add_custom_command( + TARGET midas_cpp POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_BINARY_DIR}/midas_cpp + ${CMAKE_SOURCE_DIR}/midas_cpp +) \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_cpp.launch b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_cpp.launch new file mode 100644 index 0000000000000000000000000000000000000000..88e86f42f668e76ad4976ec6794a8cb0f20cac65 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_cpp.launch @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_talker_listener.launch b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_talker_listener.launch new file mode 100644 index 0000000000000000000000000000000000000000..8817a4f4933c56986fe0edc0886b2fded3d3406d --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/launch/midas_talker_listener.launch @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/package.xml b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/package.xml new file mode 100644 index 0000000000000000000000000000000000000000..9cac90eba75409bd170f73531c54c83c52ff047a --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/package.xml @@ -0,0 +1,77 @@ + + + midas_cpp + 0.1.0 + The midas_cpp package + + Alexey Bochkovskiy + MIT + https://github.com/isl-org/MiDaS/tree/master/ros + + + + + + + TODO + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + catkin + cv_bridge + image_transport + roscpp + rospy + sensor_msgs + std_msgs + cv_bridge + image_transport + roscpp + rospy + sensor_msgs + std_msgs + cv_bridge + image_transport + roscpp + rospy + sensor_msgs + std_msgs + + + + + + + + diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener.py new file mode 100644 index 0000000000000000000000000000000000000000..6927ea7a83ac9309e5f883ee974a5dcfa8a2aa3b --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +from __future__ import print_function + +import roslib +#roslib.load_manifest('my_package') +import sys +import rospy +import cv2 +import numpy as np +from std_msgs.msg import String +from sensor_msgs.msg import Image +from cv_bridge import CvBridge, CvBridgeError + +class video_show: + + def __init__(self): + self.show_output = 
rospy.get_param('~show_output', True) + self.save_output = rospy.get_param('~save_output', False) + self.output_video_file = rospy.get_param('~output_video_file','result.mp4') + # rospy.loginfo(f"Listener - params: show_output={self.show_output}, save_output={self.save_output}, output_video_file={self.output_video_file}") + + self.bridge = CvBridge() + self.image_sub = rospy.Subscriber("midas_topic", Image, self.callback) + + def callback(self, data): + try: + cv_image = self.bridge.imgmsg_to_cv2(data) + except CvBridgeError as e: + print(e) + return + + if cv_image.size == 0: + return + + rospy.loginfo("Listener: Received new frame") + cv_image = cv_image.astype("uint8") + + if self.show_output==True: + cv2.imshow("video_show", cv_image) + cv2.waitKey(10) + + if self.save_output==True: + if self.video_writer_init==False: + fourcc = cv2.VideoWriter_fourcc(*'XVID') + self.out = cv2.VideoWriter(self.output_video_file, fourcc, 25, (cv_image.shape[1], cv_image.shape[0])) + + self.out.write(cv_image) + + + +def main(args): + rospy.init_node('listener', anonymous=True) + ic = video_show() + try: + rospy.spin() + except KeyboardInterrupt: + print("Shutting down") + cv2.destroyAllWindows() + +if __name__ == '__main__': + main(sys.argv) \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener_original.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener_original.py new file mode 100644 index 0000000000000000000000000000000000000000..20e235f6958d644b89383752ab18e9e2275f55e5 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/listener_original.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +from __future__ import print_function + +import roslib +#roslib.load_manifest('my_package') +import sys +import rospy +import cv2 +import numpy as np +from std_msgs.msg import String +from sensor_msgs.msg import Image +from cv_bridge import CvBridge, CvBridgeError + +class video_show: + + def __init__(self): + self.show_output = rospy.get_param('~show_output', True) + self.save_output = rospy.get_param('~save_output', False) + self.output_video_file = rospy.get_param('~output_video_file','result.mp4') + # rospy.loginfo(f"Listener original - params: show_output={self.show_output}, save_output={self.save_output}, output_video_file={self.output_video_file}") + + self.bridge = CvBridge() + self.image_sub = rospy.Subscriber("image_topic", Image, self.callback) + + def callback(self, data): + try: + cv_image = self.bridge.imgmsg_to_cv2(data) + except CvBridgeError as e: + print(e) + return + + if cv_image.size == 0: + return + + rospy.loginfo("Listener_original: Received new frame") + cv_image = cv_image.astype("uint8") + + if self.show_output==True: + cv2.imshow("video_show_orig", cv_image) + cv2.waitKey(10) + + if self.save_output==True: + if self.video_writer_init==False: + fourcc = cv2.VideoWriter_fourcc(*'XVID') + self.out = cv2.VideoWriter(self.output_video_file, fourcc, 25, (cv_image.shape[1], cv_image.shape[0])) + + self.out.write(cv_image) + + + +def main(args): + rospy.init_node('listener_original', anonymous=True) + ic = video_show() + try: + rospy.spin() + except KeyboardInterrupt: + print("Shutting down") + cv2.destroyAllWindows() + +if __name__ == '__main__': + main(sys.argv) \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/talker.py 
b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/talker.py new file mode 100644 index 0000000000000000000000000000000000000000..8219cc8632484a2efd02984347c615efad6b78b2 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/scripts/talker.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 + + +import roslib +#roslib.load_manifest('my_package') +import sys +import rospy +import cv2 +from std_msgs.msg import String +from sensor_msgs.msg import Image +from cv_bridge import CvBridge, CvBridgeError + + +def talker(): + rospy.init_node('talker', anonymous=True) + + use_camera = rospy.get_param('~use_camera', False) + input_video_file = rospy.get_param('~input_video_file','test.mp4') + # rospy.loginfo(f"Talker - params: use_camera={use_camera}, input_video_file={input_video_file}") + + # rospy.loginfo("Talker: Trying to open a video stream") + if use_camera == True: + cap = cv2.VideoCapture(0) + else: + cap = cv2.VideoCapture(input_video_file) + + pub = rospy.Publisher('image_topic', Image, queue_size=1) + rate = rospy.Rate(30) # 30hz + bridge = CvBridge() + + while not rospy.is_shutdown(): + ret, cv_image = cap.read() + if ret==False: + print("Talker: Video is over") + rospy.loginfo("Video is over") + return + + try: + image = bridge.cv2_to_imgmsg(cv_image, "bgr8") + except CvBridgeError as e: + rospy.logerr("Talker: cv2image conversion failed: ", e) + print(e) + continue + + rospy.loginfo("Talker: Publishing frame") + pub.publish(image) + rate.sleep() + +if __name__ == '__main__': + try: + talker() + except rospy.ROSInterruptException: + pass diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/src/main.cpp b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/src/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e4fc72c6955f66af71c9cb1fc7a7b1f643129685 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/midas_cpp/src/main.cpp @@ -0,0 +1,285 @@ +#include +#include +#include +#include + +#include + +#include // One-stop header. + +#include +#include +#include +#include + +#include +#include + +// includes for OpenCV >= 3.x +#ifndef CV_VERSION_EPOCH +#include +#include +#include +#endif + +// OpenCV includes for OpenCV 2.x +#ifdef CV_VERSION_EPOCH +#include +#include +#include +#include +#endif + +static const std::string OPENCV_WINDOW = "Image window"; + +class Midas +{ + ros::NodeHandle nh_; + image_transport::ImageTransport it_; + image_transport::Subscriber image_sub_; + image_transport::Publisher image_pub_; + + torch::jit::script::Module module; + torch::Device device; + + auto ToTensor(cv::Mat img, bool show_output = false, bool unsqueeze = false, int unsqueeze_dim = 0) + { + //std::cout << "image shape: " << img.size() << std::endl; + at::Tensor tensor_image = torch::from_blob(img.data, { img.rows, img.cols, 3 }, at::kByte); + + if (unsqueeze) + { + tensor_image.unsqueeze_(unsqueeze_dim); + //std::cout << "tensors new shape: " << tensor_image.sizes() << std::endl; + } + + if (show_output) + { + std::cout << tensor_image.slice(2, 0, 1) << std::endl; + } + //std::cout << "tenor shape: " << tensor_image.sizes() << std::endl; + return tensor_image; + } + + auto ToInput(at::Tensor tensor_image) + { + // Create a vector of inputs. 
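+        // module.forward() expects its arguments as a std::vector<torch::jit::IValue>,
+        // so the tensor is wrapped in such a vector here (at::Tensor converts to IValue implicitly).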
+ return std::vector{tensor_image}; + } + + auto ToCvImage(at::Tensor tensor, int cv_type = CV_8UC3) + { + int width = tensor.sizes()[0]; + int height = tensor.sizes()[1]; + try + { + cv::Mat output_mat; + if (cv_type == CV_8UC4 || cv_type == CV_8UC3 || cv_type == CV_8UC2 || cv_type == CV_8UC1) { + cv::Mat cv_image(cv::Size{ height, width }, cv_type, tensor.data_ptr()); + output_mat = cv_image; + } + else if (cv_type == CV_32FC4 || cv_type == CV_32FC3 || cv_type == CV_32FC2 || cv_type == CV_32FC1) { + cv::Mat cv_image(cv::Size{ height, width }, cv_type, tensor.data_ptr()); + output_mat = cv_image; + } + else if (cv_type == CV_64FC4 || cv_type == CV_64FC3 || cv_type == CV_64FC2 || cv_type == CV_64FC1) { + cv::Mat cv_image(cv::Size{ height, width }, cv_type, tensor.data_ptr()); + output_mat = cv_image; + } + + //show_image(output_mat, "converted image from tensor"); + return output_mat.clone(); + } + catch (const c10::Error& e) + { + std::cout << "an error has occured : " << e.msg() << std::endl; + } + return cv::Mat(height, width, CV_8UC3); + } + + std::string input_topic, output_topic, model_name; + bool out_orig_size; + int net_width, net_height; + torch::NoGradGuard guard; + at::Tensor mean, std; + at::Tensor output, tensor; + +public: + Midas() + : nh_(), it_(nh_), device(torch::Device(torch::kCPU)) + { + ros::param::param("~input_topic", input_topic, "image_topic"); + ros::param::param("~output_topic", output_topic, "midas_topic"); + ros::param::param("~model_name", model_name, "model-small-traced.pt"); + ros::param::param("~out_orig_size", out_orig_size, true); + ros::param::param("~net_width", net_width, 256); + ros::param::param("~net_height", net_height, 256); + + std::cout << ", input_topic = " << input_topic << + ", output_topic = " << output_topic << + ", model_name = " << model_name << + ", out_orig_size = " << out_orig_size << + ", net_width = " << net_width << + ", net_height = " << net_height << + std::endl; + + // Subscrive to input video feed and publish output video feed + image_sub_ = it_.subscribe(input_topic, 1, &Midas::imageCb, this); + image_pub_ = it_.advertise(output_topic, 1); + + std::cout << "Try to load torchscript model \n"; + + try { + // Deserialize the ScriptModule from a file using torch::jit::load(). 
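+            // A relative model_name is resolved against the node's working directory; under
+            // roslaunch this is $ROS_HOME (typically ~/.ros), which is presumably why
+            // additions/downloads.sh copies model-small-traced.pt into ~/.ros.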
+ module = torch::jit::load(model_name); + } + catch (const c10::Error& e) { + std::cerr << "error loading the model\n"; + exit(0); + } + + std::cout << "ok\n"; + + try { + module.eval(); + torch::jit::getProfilingMode() = false; + torch::jit::setGraphExecutorOptimize(true); + + mean = torch::tensor({ 0.485, 0.456, 0.406 }); + std = torch::tensor({ 0.229, 0.224, 0.225 }); + + if (torch::hasCUDA()) { + std::cout << "cuda is available" << std::endl; + at::globalContext().setBenchmarkCuDNN(true); + device = torch::Device(torch::kCUDA); + module.to(device); + mean = mean.to(device); + std = std.to(device); + } + } + catch (const c10::Error& e) + { + std::cerr << " module initialization: " << e.msg() << std::endl; + } + } + + ~Midas() + { + } + + void imageCb(const sensor_msgs::ImageConstPtr& msg) + { + cv_bridge::CvImagePtr cv_ptr; + try + { + // sensor_msgs::Image to cv::Mat + cv_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::RGB8); + } + catch (cv_bridge::Exception& e) + { + ROS_ERROR("cv_bridge exception: %s", e.what()); + return; + } + + // pre-processing + auto tensor_cpu = ToTensor(cv_ptr->image); // OpenCV-image -> Libtorch-tensor + + try { + tensor = tensor_cpu.to(device); // move to device (CPU or GPU) + + tensor = tensor.toType(c10::kFloat); + tensor = tensor.permute({ 2, 0, 1 }); // HWC -> CHW + tensor = tensor.unsqueeze(0); + tensor = at::upsample_bilinear2d(tensor, { net_height, net_width }, true); // resize + tensor = tensor.squeeze(0); + tensor = tensor.permute({ 1, 2, 0 }); // CHW -> HWC + + tensor = tensor.div(255).sub(mean).div(std); // normalization + tensor = tensor.permute({ 2, 0, 1 }); // HWC -> CHW + tensor.unsqueeze_(0); // CHW -> NCHW + } + catch (const c10::Error& e) + { + std::cerr << " pre-processing exception: " << e.msg() << std::endl; + return; + } + + auto input_to_net = ToInput(tensor); // input to the network + + // inference + output; + try { + output = module.forward(input_to_net).toTensor(); // run inference + } + catch (const c10::Error& e) + { + std::cerr << " module.forward() exception: " << e.msg() << std::endl; + return; + } + + output = output.detach().to(torch::kF32); + + // move to CPU temporary + at::Tensor output_tmp = output; + output_tmp = output_tmp.to(torch::kCPU); + + // normalization + float min_val = std::numeric_limits::max(); + float max_val = std::numeric_limits::min(); + + for (int i = 0; i < net_width * net_height; ++i) { + float val = output_tmp.data_ptr()[i]; + if (min_val > val) min_val = val; + if (max_val < val) max_val = val; + } + float range_val = max_val - min_val; + + output = output.sub(min_val).div(range_val).mul(255.0F).clamp(0, 255).to(torch::kF32); // .to(torch::kU8); + + // resize to the original size if required + if (out_orig_size) { + try { + output = at::upsample_bilinear2d(output.unsqueeze(0), { cv_ptr->image.size().height, cv_ptr->image.size().width }, true); + output = output.squeeze(0); + } + catch (const c10::Error& e) + { + std::cout << " upsample_bilinear2d() exception: " << e.msg() << std::endl; + return; + } + } + output = output.permute({ 1, 2, 0 }).to(torch::kCPU); + + int cv_type = CV_32FC1; // CV_8UC1; + auto cv_img = ToCvImage(output, cv_type); + + sensor_msgs::Image img_msg; + + try { + // cv::Mat -> sensor_msgs::Image + std_msgs::Header header; // empty header + header.seq = 0; // user defined counter + header.stamp = ros::Time::now();// time + //cv_bridge::CvImage img_bridge = cv_bridge::CvImage(header, sensor_msgs::image_encodings::MONO8, cv_img); + cv_bridge::CvImage img_bridge = 
cv_bridge::CvImage(header, sensor_msgs::image_encodings::TYPE_32FC1, cv_img); + + img_bridge.toImageMsg(img_msg); // cv_bridge -> sensor_msgs::Image + } + catch (cv_bridge::Exception& e) + { + ROS_ERROR("cv_bridge exception: %s", e.what()); + return; + } + + // Output modified video stream + image_pub_.publish(img_msg); + } +}; + +int main(int argc, char** argv) +{ + ros::init(argc, argv, "midas", ros::init_options::AnonymousName); + Midas ic; + ros::spin(); + return 0; +} \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/run_talker_listener_test.sh b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/run_talker_listener_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..a997c4261072d0d627598fe06a723fcc7522d347 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/ros/run_talker_listener_test.sh @@ -0,0 +1,16 @@ +# place any test.mp4 file near with this file + +# roscore +# rosnode kill -a + +source ~/catkin_ws/devel/setup.bash + +roscore & +P1=$! +rosrun midas_cpp talker.py & +P2=$! +rosrun midas_cpp listener_original.py & +P3=$! +rosrun midas_cpp listener.py & +P4=$! +wait $P1 $P2 $P3 $P4 \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/run.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/run.py new file mode 100644 index 0000000000000000000000000000000000000000..5696ef0547af093713ea416d18edd77d11879d0a --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/run.py @@ -0,0 +1,277 @@ +"""Compute depth maps for images in the input folder. +""" +import os +import glob +import torch +import utils +import cv2 +import argparse +import time + +import numpy as np + +from imutils.video import VideoStream +from midas.model_loader import default_models, load_model + +first_execution = True +def process(device, model, model_type, image, input_size, target_size, optimize, use_camera): + """ + Run the inference and interpolate. + + Args: + device (torch.device): the torch device used + model: the model used for inference + model_type: the type of the model + image: the image fed into the neural network + input_size: the size (width, height) of the neural network input (for OpenVINO) + target_size: the size (width, height) the neural network output is interpolated to + optimize: optimize the model to half-floats on CUDA? + use_camera: is the camera used? + + Returns: + the prediction + """ + global first_execution + + if "openvino" in model_type: + if first_execution or not use_camera: + print(f" Input resized to {input_size[0]}x{input_size[1]} before entering the encoder") + first_execution = False + + sample = [np.reshape(image, (1, 3, *input_size))] + prediction = model(sample)[model.output(0)][0] + prediction = cv2.resize(prediction, dsize=target_size, + interpolation=cv2.INTER_CUBIC) + else: + sample = torch.from_numpy(image).to(device).unsqueeze(0) + + if optimize and device == torch.device("cuda"): + if first_execution: + print(" Optimization to half-floats activated. 
Use with caution, because models like Swin require\n" + " float precision to work properly and may yield non-finite depth values to some extent for\n" + " half-floats.") + sample = sample.to(memory_format=torch.channels_last) + sample = sample.half() + + if first_execution or not use_camera: + height, width = sample.shape[2:] + print(f" Input resized to {width}x{height} before entering the encoder") + first_execution = False + + prediction = model.forward(sample) + prediction = ( + torch.nn.functional.interpolate( + prediction.unsqueeze(1), + size=target_size[::-1], + mode="bicubic", + align_corners=False, + ) + .squeeze() + .cpu() + .numpy() + ) + + return prediction + + +def create_side_by_side(image, depth, grayscale): + """ + Take an RGB image and depth map and place them side by side. This includes a proper normalization of the depth map + for better visibility. + + Args: + image: the RGB image + depth: the depth map + grayscale: use a grayscale colormap? + + Returns: + the image and depth map place side by side + """ + depth_min = depth.min() + depth_max = depth.max() + normalized_depth = 255 * (depth - depth_min) / (depth_max - depth_min) + normalized_depth *= 3 + + right_side = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3 + if not grayscale: + right_side = cv2.applyColorMap(np.uint8(right_side), cv2.COLORMAP_INFERNO) + + if image is None: + return right_side + else: + return np.concatenate((image, right_side), axis=1) + + +def run(input_path, output_path, model_path, model_type="dpt_beit_large_512", optimize=False, side=False, height=None, + square=False, grayscale=False): + """Run MonoDepthNN to compute depth maps. + + Args: + input_path (str): path to input folder + output_path (str): path to output folder + model_path (str): path to saved model + model_type (str): the model type + optimize (bool): optimize the model to half-floats on CUDA? + side (bool): RGB and depth side by side in output images? + height (int): inference encoder image height + square (bool): resize to a square resolution? + grayscale (bool): use a grayscale colormap? + """ + print("Initialize") + + # select device + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print("Device: %s" % device) + + model, transform, net_w, net_h = load_model(device, model_path, model_type, optimize, height, square) + + # get input + if input_path is not None: + image_names = glob.glob(os.path.join(input_path, "*")) + num_images = len(image_names) + else: + print("No input path specified. Grabbing images from camera.") + + # create output folder + if output_path is not None: + os.makedirs(output_path, exist_ok=True) + + print("Start processing") + + if input_path is not None: + if output_path is None: + print("Warning: No output path specified. 
Images will be processed but not shown or stored anywhere.") + for index, image_name in enumerate(image_names): + + print(" Processing {} ({}/{})".format(image_name, index + 1, num_images)) + + # input + original_image_rgb = utils.read_image(image_name) # in [0, 1] + image = transform({"image": original_image_rgb})["image"] + + # compute + with torch.no_grad(): + prediction = process(device, model, model_type, image, (net_w, net_h), original_image_rgb.shape[1::-1], + optimize, False) + + # output + if output_path is not None: + filename = os.path.join( + output_path, os.path.splitext(os.path.basename(image_name))[0] + '-' + model_type + ) + if not side: + utils.write_depth(filename, prediction, grayscale, bits=2) + else: + original_image_bgr = np.flip(original_image_rgb, 2) + content = create_side_by_side(original_image_bgr*255, prediction, grayscale) + cv2.imwrite(filename + ".png", content) + utils.write_pfm(filename + ".pfm", prediction.astype(np.float32)) + + else: + with torch.no_grad(): + fps = 1 + video = VideoStream(0).start() + time_start = time.time() + frame_index = 0 + while True: + frame = video.read() + if frame is not None: + original_image_rgb = np.flip(frame, 2) # in [0, 255] (flip required to get RGB) + image = transform({"image": original_image_rgb/255})["image"] + + prediction = process(device, model, model_type, image, (net_w, net_h), + original_image_rgb.shape[1::-1], optimize, True) + + original_image_bgr = np.flip(original_image_rgb, 2) if side else None + content = create_side_by_side(original_image_bgr, prediction, grayscale) + cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', content/255) + + if output_path is not None: + filename = os.path.join(output_path, 'Camera' + '-' + model_type + '_' + str(frame_index)) + cv2.imwrite(filename + ".png", content) + + alpha = 0.1 + if time.time()-time_start > 0: + fps = (1 - alpha) * fps + alpha * 1 / (time.time()-time_start) # exponential moving average + time_start = time.time() + print(f"\rFPS: {round(fps,2)}", end="") + + if cv2.waitKey(1) == 27: # Escape key + break + + frame_index += 1 + print() + + print("Finished") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument('-i', '--input_path', + default=None, + help='Folder with input images (if no input path is specified, images are tried to be grabbed ' + 'from camera)' + ) + + parser.add_argument('-o', '--output_path', + default=None, + help='Folder for output images' + ) + + parser.add_argument('-m', '--model_weights', + default=None, + help='Path to the trained weights of model' + ) + + parser.add_argument('-t', '--model_type', + default='dpt_beit_large_512', + help='Model type: ' + 'dpt_beit_large_512, dpt_beit_large_384, dpt_beit_base_384, dpt_swin2_large_384, ' + 'dpt_swin2_base_384, dpt_swin2_tiny_256, dpt_swin_large_384, dpt_next_vit_large_384, ' + 'dpt_levit_224, dpt_large_384, dpt_hybrid_384, midas_v21_384, midas_v21_small_256 or ' + 'openvino_midas_v21_small_256' + ) + + parser.add_argument('-s', '--side', + action='store_true', + help='Output images contain RGB and depth images side by side' + ) + + parser.add_argument('--optimize', dest='optimize', action='store_true', help='Use half-float optimization') + parser.set_defaults(optimize=False) + + parser.add_argument('--height', + type=int, default=None, + help='Preferred height of images feed into the encoder during inference. 
Note that the ' + 'preferred height may differ from the actual height, because an alignment to multiples of ' + '32 takes place. Many models support only the height chosen during training, which is ' + 'used automatically if this parameter is not set.' + ) + parser.add_argument('--square', + action='store_true', + help='Option to resize images to a square resolution by changing their widths when images are ' + 'fed into the encoder during inference. If this parameter is not set, the aspect ratio of ' + 'images is tried to be preserved if supported by the model.' + ) + parser.add_argument('--grayscale', + action='store_true', + help='Use a grayscale colormap instead of the inferno one. Although the inferno colormap, ' + 'which is used by default, is better for visibility, it does not allow storing 16-bit ' + 'depth values in PNGs but only 8-bit ones due to the precision limitation of this ' + 'colormap.' + ) + + args = parser.parse_args() + + + if args.model_weights is None: + args.model_weights = default_models[args.model_type] + + # set torch options + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = True + + # compute depth maps + run(args.input_path, args.output_path, args.model_weights, args.model_type, args.optimize, args.side, args.height, + args.square, args.grayscale) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/README.md b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5b5fe0e63668eab45a55b140826cb3762862b17c --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/README.md @@ -0,0 +1,147 @@ +## Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer + +### TensorFlow inference using `.pb` and `.onnx` models + +1. [Run inference on TensorFlow-model by using TensorFlow](#run-inference-on-tensorflow-model-by-using-tensorFlow) + +2. [Run inference on ONNX-model by using TensorFlow](#run-inference-on-onnx-model-by-using-tensorflow) + +3. [Make ONNX model from downloaded Pytorch model file](#make-onnx-model-from-downloaded-pytorch-model-file) + + +### Run inference on TensorFlow-model by using TensorFlow + +1) Download the model weights [model-f6b98070.pb](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-f6b98070.pb) +and [model-small.pb](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-small.pb) and place the +file in the `/tf/` folder. + +2) Set up dependencies: + +```shell +# install OpenCV +pip install --upgrade pip +pip install opencv-python + +# install TensorFlow +pip install -I grpcio tensorflow==2.3.0 tensorflow-addons==0.11.2 numpy==1.18.0 +``` + +#### Usage + +1) Place one or more input images in the folder `tf/input`. + +2) Run the model: + + ```shell + python tf/run_pb.py + ``` + + Or run the small model: + + ```shell + python tf/run_pb.py --model_weights model-small.pb --model_type small + ``` + +3) The resulting inverse depth maps are written to the `tf/output` folder. + + +### Run inference on ONNX-model by using ONNX-Runtime + +1) Download the model weights [model-f6b98070.onnx](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-f6b98070.onnx) +and [model-small.onnx](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-small.onnx) and place the +file in the `/tf/` folder. 
+ +2) Set up dependencies: + +```shell +# install OpenCV +pip install --upgrade pip +pip install opencv-python + +# install ONNX +pip install onnx==1.7.0 + +# install ONNX Runtime +pip install onnxruntime==1.5.2 +``` + +#### Usage + +1) Place one or more input images in the folder `tf/input`. + +2) Run the model: + + ```shell + python tf/run_onnx.py + ``` + + Or run the small model: + + ```shell + python tf/run_onnx.py --model_weights model-small.onnx --model_type small + ``` + +3) The resulting inverse depth maps are written to the `tf/output` folder. + + + +### Make ONNX model from downloaded Pytorch model file + +1) Download the model weights [model-f6b98070.pt](https://github.com/isl-org/MiDaS/releases/download/v2_1/model-f6b98070.pt) and place the +file in the root folder. + +2) Set up dependencies: + +```shell +# install OpenCV +pip install --upgrade pip +pip install opencv-python + +# install PyTorch TorchVision +pip install -I torch==1.7.0 torchvision==0.8.0 + +# install TensorFlow +pip install -I grpcio tensorflow==2.3.0 tensorflow-addons==0.11.2 numpy==1.18.0 + +# install ONNX +pip install onnx==1.7.0 + +# install ONNX-TensorFlow +git clone https://github.com/onnx/onnx-tensorflow.git +cd onnx-tensorflow +git checkout 095b51b88e35c4001d70f15f80f31014b592b81e +pip install -e . +``` + +#### Usage + +1) Run the converter: + + ```shell + python tf/make_onnx_model.py + ``` + +2) The resulting `model-f6b98070.onnx` file is written to the `/tf/` folder. + + +### Requirements + + The code was tested with Python 3.6.9, PyTorch 1.5.1, TensorFlow 2.2.0, TensorFlow-addons 0.8.3, ONNX 1.7.0, ONNX-TensorFlow (GitHub-master-17.07.2020) and OpenCV 4.3.0. + +### Citation + +Please cite our paper if you use this code or any of the models: +``` +@article{Ranftl2019, + author = {Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun}, + title = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer}, + journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)}, + year = {2020}, +} +``` + +### License + +MIT License + + diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/make_onnx_model.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/make_onnx_model.py new file mode 100644 index 0000000000000000000000000000000000000000..d14b0e4e1d2ea70fa315fd7ca7dfd72440a19376 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/make_onnx_model.py @@ -0,0 +1,112 @@ +"""Compute depth maps for images in the input folder. 
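+
+Unlike run.py, this script's real job is the ONNX export: it temporarily patches
+../midas/blocks.py (align_corners and the ResNeXt torch.hub call), wraps MidasNet so that
+the ImageNet mean/std normalization is baked into forward(), pushes a dummy 3x384x384 input
+through the network and writes <weights basename>.onnx via torch.onnx.export (opset 9).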
+""" +import os +import ntpath +import glob +import torch +import utils +import cv2 +import numpy as np +from torchvision.transforms import Compose, Normalize +from torchvision import transforms + +from shutil import copyfile +import fileinput +import sys +sys.path.append(os.getcwd() + '/..') + +def modify_file(): + modify_filename = '../midas/blocks.py' + copyfile(modify_filename, modify_filename+'.bak') + + with open(modify_filename, 'r') as file : + filedata = file.read() + + filedata = filedata.replace('align_corners=True', 'align_corners=False') + filedata = filedata.replace('import torch.nn as nn', 'import torch.nn as nn\nimport torchvision.models as models') + filedata = filedata.replace('torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")', 'models.resnext101_32x8d()') + + with open(modify_filename, 'w') as file: + file.write(filedata) + +def restore_file(): + modify_filename = '../midas/blocks.py' + copyfile(modify_filename+'.bak', modify_filename) + +modify_file() + +from midas.midas_net import MidasNet +from midas.transforms import Resize, NormalizeImage, PrepareForNet + +restore_file() + + +class MidasNet_preprocessing(MidasNet): + """Network for monocular depth estimation. + """ + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input data (image) + + Returns: + tensor: depth + """ + + mean = torch.tensor([0.485, 0.456, 0.406]) + std = torch.tensor([0.229, 0.224, 0.225]) + x.sub_(mean[None, :, None, None]).div_(std[None, :, None, None]) + + return MidasNet.forward(self, x) + + +def run(model_path): + """Run MonoDepthNN to compute depth maps. + + Args: + model_path (str): path to saved model + """ + print("initialize") + + # select device + + # load network + #model = MidasNet(model_path, non_negative=True) + model = MidasNet_preprocessing(model_path, non_negative=True) + + model.eval() + + print("start processing") + + # input + img_input = np.zeros((3, 384, 384), np.float32) + + # compute + with torch.no_grad(): + sample = torch.from_numpy(img_input).unsqueeze(0) + prediction = model.forward(sample) + prediction = ( + torch.nn.functional.interpolate( + prediction.unsqueeze(1), + size=img_input.shape[:2], + mode="bicubic", + align_corners=False, + ) + .squeeze() + .cpu() + .numpy() + ) + + torch.onnx.export(model, sample, ntpath.basename(model_path).rsplit('.', 1)[0]+'.onnx', opset_version=9) + + print("finished") + + +if __name__ == "__main__": + # set paths + # MODEL_PATH = "model.pt" + MODEL_PATH = "../model-f6b98070.pt" + + # compute depth maps + run(MODEL_PATH) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_onnx.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..7107b99969a127f951814f743d5c562a436b2430 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_onnx.py @@ -0,0 +1,119 @@ +"""Compute depth maps for images in the input folder. +""" +import os +import glob +import utils +import cv2 +import sys +import numpy as np +import argparse + +import onnx +import onnxruntime as rt + +from transforms import Resize, NormalizeImage, PrepareForNet + + +def run(input_path, output_path, model_path, model_type="large"): + """Run MonoDepthNN to compute depth maps. 
+ + Args: + input_path (str): path to input folder + output_path (str): path to output folder + model_path (str): path to saved model + """ + print("initialize") + + # select device + device = "CUDA:0" + #device = "CPU" + print("device: %s" % device) + + # network resolution + if model_type == "large": + net_w, net_h = 384, 384 + elif model_type == "small": + net_w, net_h = 256, 256 + else: + print(f"model_type '{model_type}' not implemented, use: --model_type large") + assert False + + # load network + print("loading model...") + model = rt.InferenceSession(model_path) + input_name = model.get_inputs()[0].name + output_name = model.get_outputs()[0].name + + resize_image = Resize( + net_w, + net_h, + resize_target=None, + keep_aspect_ratio=False, + ensure_multiple_of=32, + resize_method="upper_bound", + image_interpolation_method=cv2.INTER_CUBIC, + ) + + def compose2(f1, f2): + return lambda x: f2(f1(x)) + + transform = compose2(resize_image, PrepareForNet()) + + # get input + img_names = glob.glob(os.path.join(input_path, "*")) + num_images = len(img_names) + + # create output folder + os.makedirs(output_path, exist_ok=True) + + print("start processing") + + for ind, img_name in enumerate(img_names): + + print(" processing {} ({}/{})".format(img_name, ind + 1, num_images)) + + # input + img = utils.read_image(img_name) + img_input = transform({"image": img})["image"] + + # compute + output = model.run([output_name], {input_name: img_input.reshape(1, 3, net_h, net_w).astype(np.float32)})[0] + prediction = np.array(output).reshape(net_h, net_w) + prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) + + # output + filename = os.path.join( + output_path, os.path.splitext(os.path.basename(img_name))[0] + ) + utils.write_depth(filename, prediction, bits=2) + + print("finished") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument('-i', '--input_path', + default='input', + help='folder with input images' + ) + + parser.add_argument('-o', '--output_path', + default='output', + help='folder for output images' + ) + + parser.add_argument('-m', '--model_weights', + default='model-f6b98070.onnx', + help='path to the trained weights of model' + ) + + parser.add_argument('-t', '--model_type', + default='large', + help='model type: large or small' + ) + + args = parser.parse_args() + + # compute depth maps + run(args.input_path, args.output_path, args.model_weights, args.model_type) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_pb.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_pb.py new file mode 100644 index 0000000000000000000000000000000000000000..e46254f7b37f72e7d87672d70fd4b2f393ad7658 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/run_pb.py @@ -0,0 +1,135 @@ +"""Compute depth maps for images in the input folder. +""" +import os +import glob +import utils +import cv2 +import argparse + +import tensorflow as tf + +from transforms import Resize, NormalizeImage, PrepareForNet + +def run(input_path, output_path, model_path, model_type="large"): + """Run MonoDepthNN to compute depth maps. 
+ + Args: + input_path (str): path to input folder + output_path (str): path to output folder + model_path (str): path to saved model + """ + print("initialize") + + # the runtime initialization will not allocate all memory on the device to avoid out of GPU memory + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + try: + for gpu in gpus: + #tf.config.experimental.set_memory_growth(gpu, True) + tf.config.experimental.set_virtual_device_configuration(gpu, + [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)]) + except RuntimeError as e: + print(e) + + # network resolution + if model_type == "large": + net_w, net_h = 384, 384 + elif model_type == "small": + net_w, net_h = 256, 256 + else: + print(f"model_type '{model_type}' not implemented, use: --model_type large") + assert False + + # load network + graph_def = tf.compat.v1.GraphDef() + with tf.io.gfile.GFile(model_path, 'rb') as f: + graph_def.ParseFromString(f.read()) + tf.import_graph_def(graph_def, name='') + + + model_operations = tf.compat.v1.get_default_graph().get_operations() + input_node = '0:0' + output_layer = model_operations[len(model_operations) - 1].name + ':0' + print("Last layer name: ", output_layer) + + resize_image = Resize( + net_w, + net_h, + resize_target=None, + keep_aspect_ratio=False, + ensure_multiple_of=32, + resize_method="upper_bound", + image_interpolation_method=cv2.INTER_CUBIC, + ) + + def compose2(f1, f2): + return lambda x: f2(f1(x)) + + transform = compose2(resize_image, PrepareForNet()) + + # get input + img_names = glob.glob(os.path.join(input_path, "*")) + num_images = len(img_names) + + # create output folder + os.makedirs(output_path, exist_ok=True) + + print("start processing") + + with tf.compat.v1.Session() as sess: + try: + # load images + for ind, img_name in enumerate(img_names): + + print(" processing {} ({}/{})".format(img_name, ind + 1, num_images)) + + # input + img = utils.read_image(img_name) + img_input = transform({"image": img})["image"] + + # compute + prob_tensor = sess.graph.get_tensor_by_name(output_layer) + prediction, = sess.run(prob_tensor, {input_node: [img_input] }) + prediction = prediction.reshape(net_h, net_w) + prediction = cv2.resize(prediction, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_CUBIC) + + # output + filename = os.path.join( + output_path, os.path.splitext(os.path.basename(img_name))[0] + ) + utils.write_depth(filename, prediction, bits=2) + + except KeyError: + print ("Couldn't find input node: ' + input_node + ' or output layer: " + output_layer + ".") + exit(-1) + + print("finished") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument('-i', '--input_path', + default='input', + help='folder with input images' + ) + + parser.add_argument('-o', '--output_path', + default='output', + help='folder for output images' + ) + + parser.add_argument('-m', '--model_weights', + default='model-f6b98070.pb', + help='path to the trained weights of model' + ) + + parser.add_argument('-t', '--model_type', + default='large', + help='model type: large or small' + ) + + args = parser.parse_args() + + # compute depth maps + run(args.input_path, args.output_path, args.model_weights, args.model_type) diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/transforms.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..350cbc11662633ad7f8968eb10be2e7de6e384e9 --- /dev/null +++ 
b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/transforms.py @@ -0,0 +1,234 @@ +import numpy as np +import cv2 +import math + + +def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): + """Rezise the sample to ensure the given size. Keeps aspect ratio. + + Args: + sample (dict): sample + size (tuple): image size + + Returns: + tuple: new size + """ + shape = list(sample["disparity"].shape) + + if shape[0] >= size[0] and shape[1] >= size[1]: + return sample + + scale = [0, 0] + scale[0] = size[0] / shape[0] + scale[1] = size[1] / shape[1] + + scale = max(scale) + + shape[0] = math.ceil(scale * shape[0]) + shape[1] = math.ceil(scale * shape[1]) + + # resize + sample["image"] = cv2.resize( + sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method + ) + + sample["disparity"] = cv2.resize( + sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST + ) + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + tuple(shape[::-1]), + interpolation=cv2.INTER_NEAREST, + ) + sample["mask"] = sample["mask"].astype(bool) + + return tuple(shape) + + +class Resize(object): + """Resize sample to given size (width, height). + """ + + def __init__( + self, + width, + height, + resize_target=True, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resize_method="lower_bound", + image_interpolation_method=cv2.INTER_AREA, + ): + """Init. + + Args: + width (int): desired output width + height (int): desired output height + resize_target (bool, optional): + True: Resize the full sample (image, mask, target). + False: Resize image only. + Defaults to True. + keep_aspect_ratio (bool, optional): + True: Keep the aspect ratio of the input sample. + Output sample might not have the given width and height, and + resize behaviour depends on the parameter 'resize_method'. + Defaults to False. + ensure_multiple_of (int, optional): + Output width and height is constrained to be multiple of this parameter. + Defaults to 1. + resize_method (str, optional): + "lower_bound": Output will be at least as large as the given size. + "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) + "minimal": Scale as least as possible. (Output size might be smaller than given size.) + Defaults to "lower_bound". 
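+                For example, with width=height=384, keep_aspect_ratio=True,
+                ensure_multiple_of=32 and the default "lower_bound" method, a
+                1920x1080 input is mapped to (new_width, new_height) = (672, 384):
+                the smaller side is scaled up to the target size and both sides are
+                rounded to the nearest multiple of 32.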
+ """ + self.__width = width + self.__height = height + + self.__resize_target = resize_target + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + self.__image_interpolation_method = image_interpolation_method + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented" + ) + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, min_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, min_val=self.__width + ) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, max_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, max_val=self.__width + ) + elif self.__resize_method == "minimal": + new_height = self.constrain_to_multiple_of(scale_height * height) + new_width = self.constrain_to_multiple_of(scale_width * width) + else: + raise ValueError(f"resize_method {self.__resize_method} not implemented") + + return (new_width, new_height) + + def __call__(self, sample): + width, height = self.get_size( + sample["image"].shape[1], sample["image"].shape[0] + ) + + # resize sample + sample["image"] = cv2.resize( + sample["image"], + (width, height), + interpolation=self.__image_interpolation_method, + ) + + if self.__resize_target: + if "disparity" in sample: + sample["disparity"] = cv2.resize( + sample["disparity"], + (width, height), + interpolation=cv2.INTER_NEAREST, + ) + + if "depth" in sample: + sample["depth"] = cv2.resize( + sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST + ) + + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + (width, height), + interpolation=cv2.INTER_NEAREST, + ) + sample["mask"] = sample["mask"].astype(bool) + + return sample + + +class NormalizeImage(object): + """Normlize image by given mean and std. + """ + + def __init__(self, mean, std): + self.__mean = mean + self.__std = std + + def __call__(self, sample): + sample["image"] = (sample["image"] - self.__mean) / self.__std + + return sample + + +class PrepareForNet(object): + """Prepare sample for usage as network input. 
+ """ + + def __init__(self): + pass + + def __call__(self, sample): + image = np.transpose(sample["image"], (2, 0, 1)) + sample["image"] = np.ascontiguousarray(image).astype(np.float32) + + if "mask" in sample: + sample["mask"] = sample["mask"].astype(np.float32) + sample["mask"] = np.ascontiguousarray(sample["mask"]) + + if "disparity" in sample: + disparity = sample["disparity"].astype(np.float32) + sample["disparity"] = np.ascontiguousarray(disparity) + + if "depth" in sample: + depth = sample["depth"].astype(np.float32) + sample["depth"] = np.ascontiguousarray(depth) + + return sample diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/utils.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ff9a54bd55f5e31a90fad21242efbfda5a6cc1a7 --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/tf/utils.py @@ -0,0 +1,82 @@ +import numpy as np +import sys +import cv2 + + +def write_pfm(path, image, scale=1): + """Write pfm file. + Args: + path (str): pathto file + image (array): data + scale (int, optional): Scale. Defaults to 1. + """ + + with open(path, "wb") as file: + color = None + + if image.dtype.name != "float32": + raise Exception("Image dtype must be float32.") + + image = np.flipud(image) + + if len(image.shape) == 3 and image.shape[2] == 3: # color image + color = True + elif ( + len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 + ): # greyscale + color = False + else: + raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") + + file.write("PF\n" if color else "Pf\n".encode()) + file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) + + endian = image.dtype.byteorder + + if endian == "<" or endian == "=" and sys.byteorder == "little": + scale = -scale + + file.write("%f\n".encode() % scale) + + image.tofile(file) + +def read_image(path): + """Read image and output RGB image (0-1). + Args: + path (str): path to file + Returns: + array: RGB image (0-1) + """ + img = cv2.imread(path) + + if img.ndim == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 + + return img + +def write_depth(path, depth, bits=1): + """Write depth map to pfm and png file. + Args: + path (str): filepath without extension + depth (array): depth + """ + write_pfm(path + ".pfm", depth.astype(np.float32)) + + depth_min = depth.min() + depth_max = depth.max() + + max_val = (2**(8*bits))-1 + + if depth_max - depth_min > np.finfo("float").eps: + out = max_val * (depth - depth_min) / (depth_max - depth_min) + else: + out = 0 + + if bits == 1: + cv2.imwrite(path + ".png", out.astype("uint8")) + elif bits == 2: + cv2.imwrite(path + ".png", out.astype("uint16")) + + return \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/base_models/midas_repo/utils.py b/annotator/zoe/zoedepth/models/base_models/midas_repo/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7a3976fd97dfe6a9dc7d4fa144be8fcb0b18b2db --- /dev/null +++ b/annotator/zoe/zoedepth/models/base_models/midas_repo/utils.py @@ -0,0 +1,199 @@ +"""Utils for monoDepth. +""" +import sys +import re +import numpy as np +import cv2 +import torch + + +def read_pfm(path): + """Read pfm file. 
+ + Args: + path (str): path to file + + Returns: + tuple: (data, scale) + """ + with open(path, "rb") as file: + + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().rstrip() + if header.decode("ascii") == "PF": + color = True + elif header.decode("ascii") == "Pf": + color = False + else: + raise Exception("Not a PFM file: " + path) + + dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii")) + if dim_match: + width, height = list(map(int, dim_match.groups())) + else: + raise Exception("Malformed PFM header.") + + scale = float(file.readline().decode("ascii").rstrip()) + if scale < 0: + # little-endian + endian = "<" + scale = -scale + else: + # big-endian + endian = ">" + + data = np.fromfile(file, endian + "f") + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + + return data, scale + + +def write_pfm(path, image, scale=1): + """Write pfm file. + + Args: + path (str): pathto file + image (array): data + scale (int, optional): Scale. Defaults to 1. + """ + + with open(path, "wb") as file: + color = None + + if image.dtype.name != "float32": + raise Exception("Image dtype must be float32.") + + image = np.flipud(image) + + if len(image.shape) == 3 and image.shape[2] == 3: # color image + color = True + elif ( + len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 + ): # greyscale + color = False + else: + raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") + + file.write("PF\n" if color else "Pf\n".encode()) + file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) + + endian = image.dtype.byteorder + + if endian == "<" or endian == "=" and sys.byteorder == "little": + scale = -scale + + file.write("%f\n".encode() % scale) + + image.tofile(file) + + +def read_image(path): + """Read image and output RGB image (0-1). + + Args: + path (str): path to file + + Returns: + array: RGB image (0-1) + """ + img = cv2.imread(path) + + if img.ndim == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0 + + return img + + +def resize_image(img): + """Resize image and make it fit for network. + + Args: + img (array): image + + Returns: + tensor: data ready for network + """ + height_orig = img.shape[0] + width_orig = img.shape[1] + + if width_orig > height_orig: + scale = width_orig / 384 + else: + scale = height_orig / 384 + + height = (np.ceil(height_orig / scale / 32) * 32).astype(int) + width = (np.ceil(width_orig / scale / 32) * 32).astype(int) + + img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA) + + img_resized = ( + torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float() + ) + img_resized = img_resized.unsqueeze(0) + + return img_resized + + +def resize_depth(depth, width, height): + """Resize depth map and bring to CPU (numpy). + + Args: + depth (tensor): depth + width (int): image width + height (int): image height + + Returns: + array: processed depth + """ + depth = torch.squeeze(depth[0, :, :, :]).to("cpu") + + depth_resized = cv2.resize( + depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC + ) + + return depth_resized + +def write_depth(path, depth, grayscale, bits=1): + """Write depth map to png file. + + Args: + path (str): filepath without extension + depth (array): depth + grayscale (bool): use a grayscale colormap? 
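+        bits (int, optional): bytes per channel of the output PNG (1 -> uint8, 2 -> uint16);
+            forced to 1 whenever a colormap is applied. Defaults to 1.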
+ """ + if not grayscale: + bits = 1 + + if not np.isfinite(depth).all(): + depth=np.nan_to_num(depth, nan=0.0, posinf=0.0, neginf=0.0) + print("WARNING: Non-finite depth values present") + + depth_min = depth.min() + depth_max = depth.max() + + max_val = (2**(8*bits))-1 + + if depth_max - depth_min > np.finfo("float").eps: + out = max_val * (depth - depth_min) / (depth_max - depth_min) + else: + out = np.zeros(depth.shape, dtype=depth.dtype) + + if not grayscale: + out = cv2.applyColorMap(np.uint8(out), cv2.COLORMAP_INFERNO) + + if bits == 1: + cv2.imwrite(path + ".png", out.astype("uint8")) + elif bits == 2: + cv2.imwrite(path + ".png", out.astype("uint16")) + + return diff --git a/annotator/zoe/zoedepth/models/builder.py b/annotator/zoe/zoedepth/models/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..0818311b642561712a03a66655c638ce09a04cca --- /dev/null +++ b/annotator/zoe/zoedepth/models/builder.py @@ -0,0 +1,51 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +from importlib import import_module +from .depth_model import DepthModel + +def build_model(config) -> DepthModel: + """Builds a model from a config. The model is specified by the model name and version in the config. The model is then constructed using the build_from_config function of the model interface. + This function should be used to construct models for training and evaluation. + + Args: + config (dict): Config dict. Config is constructed in utils/config.py. Each model has its own config file(s) saved in its root model folder. + + Returns: + torch.nn.Module: Model corresponding to name and version as specified in config + """ + module_name = f"zoedepth.models.{config.model}" + try: + module = import_module(module_name) + except ModuleNotFoundError as e: + # print the original error message + print(e) + raise ValueError( + f"Model {config.model} not found. 
Refer above error for details.") from e + try: + get_version = getattr(module, "get_version") + except AttributeError as e: + raise ValueError( + f"Model {config.model} has no get_version function.") from e + return get_version(config.version_name).build_from_config(config) diff --git a/annotator/zoe/zoedepth/models/depth_model.py b/annotator/zoe/zoedepth/models/depth_model.py new file mode 100644 index 0000000000000000000000000000000000000000..fc421c108ea3928c9add62b4c190500d9bd4eda1 --- /dev/null +++ b/annotator/zoe/zoedepth/models/depth_model.py @@ -0,0 +1,152 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision import transforms +import PIL.Image +from PIL import Image +from typing import Union + + +class DepthModel(nn.Module): + def __init__(self): + super().__init__() + self.device = 'cpu' + + def to(self, device) -> nn.Module: + self.device = device + return super().to(device) + + def forward(self, x, *args, **kwargs): + raise NotImplementedError + + def _infer(self, x: torch.Tensor): + """ + Inference interface for the model + Args: + x (torch.Tensor): input tensor of shape (b, c, h, w) + Returns: + torch.Tensor: output tensor of shape (b, 1, h, w) + """ + return self(x)['metric_depth'] + + def _infer_with_pad_aug(self, x: torch.Tensor, pad_input: bool=True, fh: float=3, fw: float=3, upsampling_mode: str='bicubic', padding_mode="reflect", **kwargs) -> torch.Tensor: + """ + Inference interface for the model with padding augmentation + Padding augmentation fixes the boundary artifacts in the output depth map. + Boundary artifacts are sometimes caused by the fact that the model is trained on NYU raw dataset which has a black or white border around the image. + This augmentation pads the input image and crops the prediction back to the original size / view. + + Note: This augmentation is not required for the models trained with 'avoid_boundary'=True. + Args: + x (torch.Tensor): input tensor of shape (b, c, h, w) + pad_input (bool, optional): whether to pad the input or not. Defaults to True. + fh (float, optional): height padding factor. The padding is calculated as sqrt(h/2) * fh. Defaults to 3. + fw (float, optional): width padding factor. The padding is calculated as sqrt(w/2) * fw. Defaults to 3. + upsampling_mode (str, optional): upsampling mode. Defaults to 'bicubic'. 
+ padding_mode (str, optional): padding mode. Defaults to "reflect". + Returns: + torch.Tensor: output tensor of shape (b, 1, h, w) + """ + # assert x is nchw and c = 3 + assert x.dim() == 4, "x must be 4 dimensional, got {}".format(x.dim()) + assert x.shape[1] == 3, "x must have 3 channels, got {}".format(x.shape[1]) + + if pad_input: + assert fh > 0 or fw > 0, "atlease one of fh and fw must be greater than 0" + pad_h = int(np.sqrt(x.shape[2]/2) * fh) + pad_w = int(np.sqrt(x.shape[3]/2) * fw) + padding = [pad_w, pad_w] + if pad_h > 0: + padding += [pad_h, pad_h] + + x = F.pad(x, padding, mode=padding_mode, **kwargs) + out = self._infer(x) + if out.shape[-2:] != x.shape[-2:]: + out = F.interpolate(out, size=(x.shape[2], x.shape[3]), mode=upsampling_mode, align_corners=False) + if pad_input: + # crop to the original size, handling the case where pad_h and pad_w is 0 + if pad_h > 0: + out = out[:, :, pad_h:-pad_h,:] + if pad_w > 0: + out = out[:, :, :, pad_w:-pad_w] + return out + + def infer_with_flip_aug(self, x, pad_input: bool=True, **kwargs) -> torch.Tensor: + """ + Inference interface for the model with horizontal flip augmentation + Horizontal flip augmentation improves the accuracy of the model by averaging the output of the model with and without horizontal flip. + Args: + x (torch.Tensor): input tensor of shape (b, c, h, w) + pad_input (bool, optional): whether to use padding augmentation. Defaults to True. + Returns: + torch.Tensor: output tensor of shape (b, 1, h, w) + """ + # infer with horizontal flip and average + out = self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs) + out_flip = self._infer_with_pad_aug(torch.flip(x, dims=[3]), pad_input=pad_input, **kwargs) + out = (out + torch.flip(out_flip, dims=[3])) / 2 + return out + + def infer(self, x, pad_input: bool=True, with_flip_aug: bool=True, **kwargs) -> torch.Tensor: + """ + Inference interface for the model + Args: + x (torch.Tensor): input tensor of shape (b, c, h, w) + pad_input (bool, optional): whether to use padding augmentation. Defaults to True. + with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True. + Returns: + torch.Tensor: output tensor of shape (b, 1, h, w) + """ + if with_flip_aug: + return self.infer_with_flip_aug(x, pad_input=pad_input, **kwargs) + else: + return self._infer_with_pad_aug(x, pad_input=pad_input, **kwargs) + + @torch.no_grad() + def infer_pil(self, pil_img, pad_input: bool=True, with_flip_aug: bool=True, output_type: str="numpy", **kwargs) -> Union[np.ndarray, PIL.Image.Image, torch.Tensor]: + """ + Inference interface for the model for PIL image + Args: + pil_img (PIL.Image.Image): input PIL image + pad_input (bool, optional): whether to use padding augmentation. Defaults to True. + with_flip_aug (bool, optional): whether to use horizontal flip augmentation. Defaults to True. + output_type (str, optional): output type. Supported values are 'numpy', 'pil' and 'tensor'. Defaults to "numpy". + """ + x = transforms.ToTensor()(pil_img).unsqueeze(0).to(self.device) + out_tensor = self.infer(x, pad_input=pad_input, with_flip_aug=with_flip_aug, **kwargs) + if output_type == "numpy": + return out_tensor.squeeze().cpu().numpy() + elif output_type == "pil": + # uint16 is required for depth pil image + out_16bit_numpy = (out_tensor.squeeze().cpu().numpy()*256).astype(np.uint16) + return Image.fromarray(out_16bit_numpy) + elif output_type == "tensor": + return out_tensor.squeeze().cpu() + else: + raise ValueError(f"output_type {output_type} not supported. 
Supported values are 'numpy', 'pil' and 'tensor'") + \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/layers/__pycache__/attractor.cpython-310.pyc b/annotator/zoe/zoedepth/models/layers/__pycache__/attractor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..222ba05828b1f3a40e350e86aa1c7307ab1feac0 Binary files /dev/null and b/annotator/zoe/zoedepth/models/layers/__pycache__/attractor.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/layers/__pycache__/attractor.cpython-38.pyc b/annotator/zoe/zoedepth/models/layers/__pycache__/attractor.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cb5b5e7e6040d9bd451493e877d9ec065c31b7c Binary files /dev/null and b/annotator/zoe/zoedepth/models/layers/__pycache__/attractor.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/layers/__pycache__/attractor.cpython-39.pyc b/annotator/zoe/zoedepth/models/layers/__pycache__/attractor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19455ce1be7f7d2a4d2372dc9170a72c3a6d980e Binary files /dev/null and b/annotator/zoe/zoedepth/models/layers/__pycache__/attractor.cpython-39.pyc differ diff --git a/annotator/zoe/zoedepth/models/layers/__pycache__/dist_layers.cpython-310.pyc b/annotator/zoe/zoedepth/models/layers/__pycache__/dist_layers.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..caa9b73eaa442abdaa0f322a6ffd98efe664dcfc Binary files /dev/null and b/annotator/zoe/zoedepth/models/layers/__pycache__/dist_layers.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/layers/__pycache__/dist_layers.cpython-38.pyc b/annotator/zoe/zoedepth/models/layers/__pycache__/dist_layers.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b4c82f65cb953163cb7c6003b4b6020d5aa1d2c Binary files /dev/null and b/annotator/zoe/zoedepth/models/layers/__pycache__/dist_layers.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/layers/__pycache__/dist_layers.cpython-39.pyc b/annotator/zoe/zoedepth/models/layers/__pycache__/dist_layers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..776da2c9d96e861a3f5d5ae6bddd60a259988014 Binary files /dev/null and b/annotator/zoe/zoedepth/models/layers/__pycache__/dist_layers.cpython-39.pyc differ diff --git a/annotator/zoe/zoedepth/models/layers/__pycache__/localbins_layers.cpython-310.pyc b/annotator/zoe/zoedepth/models/layers/__pycache__/localbins_layers.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f39eb7261a1a219e78771d4dc5b377428e029b02 Binary files /dev/null and b/annotator/zoe/zoedepth/models/layers/__pycache__/localbins_layers.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/layers/__pycache__/localbins_layers.cpython-38.pyc b/annotator/zoe/zoedepth/models/layers/__pycache__/localbins_layers.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..71574330d53e622d27e937eeca0df3fc65f524e8 Binary files /dev/null and b/annotator/zoe/zoedepth/models/layers/__pycache__/localbins_layers.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/layers/__pycache__/localbins_layers.cpython-39.pyc b/annotator/zoe/zoedepth/models/layers/__pycache__/localbins_layers.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c36af37dd53e12a0e63a438b3078c08c1eebfe17 Binary files /dev/null and 
b/annotator/zoe/zoedepth/models/layers/__pycache__/localbins_layers.cpython-39.pyc differ diff --git a/annotator/zoe/zoedepth/models/layers/attractor.py b/annotator/zoe/zoedepth/models/layers/attractor.py new file mode 100644 index 0000000000000000000000000000000000000000..2a8efe645adea1d88a12e2ac5cc6bb2a251eef9d --- /dev/null +++ b/annotator/zoe/zoedepth/models/layers/attractor.py @@ -0,0 +1,208 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import torch +import torch.nn as nn + + +@torch.jit.script +def exp_attractor(dx, alpha: float = 300, gamma: int = 2): + """Exponential attractor: dc = exp(-alpha*|dx|^gamma) * dx , where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center + + Args: + dx (torch.Tensor): The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center. + alpha (float, optional): Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. Defaults to 300. + gamma (int, optional): Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. Lower gamma = farther reach. Defaults to 2. + + Returns: + torch.Tensor : Delta shifts - dc; New bin centers = Old bin centers + dc + """ + return torch.exp(-alpha*(torch.abs(dx)**gamma)) * (dx) + + +@torch.jit.script +def inv_attractor(dx, alpha: float = 300, gamma: int = 2): + """Inverse attractor: dc = dx / (1 + alpha*dx^gamma), where dx = a - c, a = attractor point, c = bin center, dc = shift in bin center + This is the default one according to the accompanying paper. + + Args: + dx (torch.Tensor): The difference tensor dx = Ai - Cj, where Ai is the attractor point and Cj is the bin center. + alpha (float, optional): Proportional Attractor strength. Determines the absolute strength. Lower alpha = greater attraction. Defaults to 300. + gamma (int, optional): Exponential Attractor strength. Determines the "region of influence" and indirectly number of bin centers affected. Lower gamma = farther reach. Defaults to 2.
+ + Returns: + torch.Tensor: Delta shifts - dc; New bin centers = Old bin centers + dc + """ + return dx.div(1+alpha*dx.pow(gamma)) + + +class AttractorLayer(nn.Module): + def __init__(self, in_features, n_bins, n_attractors=16, mlp_dim=128, min_depth=1e-3, max_depth=10, + alpha=300, gamma=2, kind='sum', attractor_type='exp', memory_efficient=False): + """ + Attractor layer for bin centers. Bin centers are bounded on the interval (min_depth, max_depth) + """ + super().__init__() + + self.n_attractors = n_attractors + self.n_bins = n_bins + self.min_depth = min_depth + self.max_depth = max_depth + self.alpha = alpha + self.gamma = gamma + self.kind = kind + self.attractor_type = attractor_type + self.memory_efficient = memory_efficient + + self._net = nn.Sequential( + nn.Conv2d(in_features, mlp_dim, 1, 1, 0), + nn.ReLU(inplace=True), + nn.Conv2d(mlp_dim, n_attractors*2, 1, 1, 0), # x2 for linear norm + nn.ReLU(inplace=True) + ) + + def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False): + """ + Args: + x (torch.Tensor) : feature block; shape - n, c, h, w + b_prev (torch.Tensor) : previous bin centers normed; shape - n, prev_nbins, h, w + + Returns: + tuple(torch.Tensor,torch.Tensor) : new bin centers normed and scaled; shape - n, nbins, h, w + """ + if prev_b_embedding is not None: + if interpolate: + prev_b_embedding = nn.functional.interpolate( + prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True) + x = x + prev_b_embedding + + A = self._net(x) + eps = 1e-3 + A = A + eps + n, c, h, w = A.shape + A = A.view(n, self.n_attractors, 2, h, w) + A_normed = A / A.sum(dim=2, keepdim=True) # n, a, 2, h, w + A_normed = A[:, :, 0, ...] # n, na, h, w + + b_prev = nn.functional.interpolate( + b_prev, (h, w), mode='bilinear', align_corners=True) + b_centers = b_prev + + if self.attractor_type == 'exp': + dist = exp_attractor + else: + dist = inv_attractor + + if not self.memory_efficient: + func = {'mean': torch.mean, 'sum': torch.sum}[self.kind] + # .shape N, nbins, h, w + delta_c = func(dist(A_normed.unsqueeze( + 2) - b_centers.unsqueeze(1)), dim=1) + else: + delta_c = torch.zeros_like(b_centers, device=b_centers.device) + for i in range(self.n_attractors): + # .shape N, nbins, h, w + delta_c += dist(A_normed[:, i, ...].unsqueeze(1) - b_centers) + + if self.kind == 'mean': + delta_c = delta_c / self.n_attractors + + b_new_centers = b_centers + delta_c + B_centers = (self.max_depth - self.min_depth) * \ + b_new_centers + self.min_depth + B_centers, _ = torch.sort(B_centers, dim=1) + B_centers = torch.clip(B_centers, self.min_depth, self.max_depth) + return b_new_centers, B_centers + + +class AttractorLayerUnnormed(nn.Module): + def __init__(self, in_features, n_bins, n_attractors=16, mlp_dim=128, min_depth=1e-3, max_depth=10, + alpha=300, gamma=2, kind='sum', attractor_type='exp', memory_efficient=False): + """ + Attractor layer for bin centers. 
Bin centers are unbounded + """ + super().__init__() + + self.n_attractors = n_attractors + self.n_bins = n_bins + self.min_depth = min_depth + self.max_depth = max_depth + self.alpha = alpha + self.gamma = gamma + self.kind = kind + self.attractor_type = attractor_type + self.memory_efficient = memory_efficient + + self._net = nn.Sequential( + nn.Conv2d(in_features, mlp_dim, 1, 1, 0), + nn.ReLU(inplace=True), + nn.Conv2d(mlp_dim, n_attractors, 1, 1, 0), + nn.Softplus() + ) + + def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False): + """ + Args: + x (torch.Tensor) : feature block; shape - n, c, h, w + b_prev (torch.Tensor) : previous bin centers normed; shape - n, prev_nbins, h, w + + Returns: + tuple(torch.Tensor,torch.Tensor) : new bin centers unbounded; shape - n, nbins, h, w. Two outputs just to keep the API consistent with the normed version + """ + if prev_b_embedding is not None: + if interpolate: + prev_b_embedding = nn.functional.interpolate( + prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True) + x = x + prev_b_embedding + + A = self._net(x) + n, c, h, w = A.shape + + b_prev = nn.functional.interpolate( + b_prev, (h, w), mode='bilinear', align_corners=True) + b_centers = b_prev + + if self.attractor_type == 'exp': + dist = exp_attractor + else: + dist = inv_attractor + + if not self.memory_efficient: + func = {'mean': torch.mean, 'sum': torch.sum}[self.kind] + # .shape N, nbins, h, w + delta_c = func( + dist(A.unsqueeze(2) - b_centers.unsqueeze(1)), dim=1) + else: + delta_c = torch.zeros_like(b_centers, device=b_centers.device) + for i in range(self.n_attractors): + delta_c += dist(A[:, i, ...].unsqueeze(1) - + b_centers) # .shape N, nbins, h, w + + if self.kind == 'mean': + delta_c = delta_c / self.n_attractors + + b_new_centers = b_centers + delta_c + B_centers = b_new_centers + + return b_new_centers, B_centers diff --git a/annotator/zoe/zoedepth/models/layers/dist_layers.py b/annotator/zoe/zoedepth/models/layers/dist_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..3208405dfb78fdfc28d5765e5a6d5dbe31967a23 --- /dev/null +++ b/annotator/zoe/zoedepth/models/layers/dist_layers.py @@ -0,0 +1,121 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
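+# --- Hedged illustration (sketch only, not part of the upstream ZoeDepth sources) ---
+# The attractor functions added in attractor.py above move each bin center c toward an
+# attractor point a by a shift dc computed from dx = a - c: exp_attractor uses
+# exp(-alpha*|dx|**gamma) * dx, inv_attractor uses dx / (1 + alpha*dx**gamma) (the
+# paper's default). The self-contained snippet below only re-evaluates those two
+# formulas with plain torch to make the size and sign of the shift concrete; alpha and
+# gamma mirror the function defaults and the dx values are arbitrary.
+if __name__ == "__main__":
+    import torch
+    dx = torch.linspace(-0.5, 0.5, 5)                     # attractor_point - bin_center
+    alpha, gamma = 300.0, 2
+    dc_exp = torch.exp(-alpha * dx.abs() ** gamma) * dx   # exp_attractor shift
+    dc_inv = dx / (1 + alpha * dx.pow(gamma))             # inv_attractor shift
+    print("dx     :", [round(v, 4) for v in dx.tolist()])
+    print("exp dc :", [round(v, 4) for v in dc_exp.tolist()])
+    print("inv dc :", [round(v, 4) for v in dc_inv.tolist()])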
+ +# File author: Shariq Farooq Bhat + +import torch +import torch.nn as nn + + +def log_binom(n, k, eps=1e-7): + """ log(nCk) using stirling approximation """ + n = n + eps + k = k + eps + return n * torch.log(n) - k * torch.log(k) - (n-k) * torch.log(n-k+eps) + + +class LogBinomial(nn.Module): + def __init__(self, n_classes=256, act=torch.softmax): + """Compute log binomial distribution for n_classes + + Args: + n_classes (int, optional): number of output classes. Defaults to 256. + """ + super().__init__() + self.K = n_classes + self.act = act + self.register_buffer('k_idx', torch.arange( + 0, n_classes).view(1, -1, 1, 1)) + self.register_buffer('K_minus_1', torch.Tensor( + [self.K-1]).view(1, -1, 1, 1)) + + def forward(self, x, t=1., eps=1e-4): + """Compute log binomial distribution for x + + Args: + x (torch.Tensor - NCHW): probabilities + t (float, torch.Tensor - NCHW, optional): Temperature of distribution. Defaults to 1.. + eps (float, optional): Small number for numerical stability. Defaults to 1e-4. + + Returns: + torch.Tensor -NCHW: log binomial distribution logbinomial(p;t) + """ + if x.ndim == 3: + x = x.unsqueeze(1) # make it nchw + + one_minus_x = torch.clamp(1 - x, eps, 1) + x = torch.clamp(x, eps, 1) + y = log_binom(self.K_minus_1, self.k_idx) + self.k_idx * \ + torch.log(x) + (self.K - 1 - self.k_idx) * torch.log(one_minus_x) + return self.act(y/t, dim=1) + + +class ConditionalLogBinomial(nn.Module): + def __init__(self, in_features, condition_dim, n_classes=256, bottleneck_factor=2, p_eps=1e-4, max_temp=50, min_temp=1e-7, act=torch.softmax): + """Conditional Log Binomial distribution + + Args: + in_features (int): number of input channels in main feature + condition_dim (int): number of input channels in condition feature + n_classes (int, optional): Number of classes. Defaults to 256. + bottleneck_factor (int, optional): Hidden dim factor. Defaults to 2. + p_eps (float, optional): small eps value. Defaults to 1e-4. + max_temp (float, optional): Maximum temperature of output distribution. Defaults to 50. + min_temp (float, optional): Minimum temperature of output distribution. Defaults to 1e-7. + """ + super().__init__() + self.p_eps = p_eps + self.max_temp = max_temp + self.min_temp = min_temp + self.log_binomial_transform = LogBinomial(n_classes, act=act) + bottleneck = (in_features + condition_dim) // bottleneck_factor + self.mlp = nn.Sequential( + nn.Conv2d(in_features + condition_dim, bottleneck, + kernel_size=1, stride=1, padding=0), + nn.GELU(), + # 2 for p linear norm, 2 for t linear norm + nn.Conv2d(bottleneck, 2+2, kernel_size=1, stride=1, padding=0), + nn.Softplus() + ) + + def forward(self, x, cond): + """Forward pass + + Args: + x (torch.Tensor - NCHW): Main feature + cond (torch.Tensor - NCHW): condition feature + + Returns: + torch.Tensor: Output log binomial distribution + """ + pt = self.mlp(torch.concat((x, cond), dim=1)) + p, t = pt[:, :2, ...], pt[:, 2:, ...] + + p = p + self.p_eps + p = p[:, 0, ...] / (p[:, 0, ...] + p[:, 1, ...]) + + t = t + self.p_eps + t = t[:, 0, ...] / (t[:, 0, ...] 
+ t[:, 1, ...]) + t = t.unsqueeze(1) + t = (self.max_temp - self.min_temp) * t + self.min_temp + + return self.log_binomial_transform(p, t) diff --git a/annotator/zoe/zoedepth/models/layers/localbins_layers.py b/annotator/zoe/zoedepth/models/layers/localbins_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..f94481605c3e6958ce50e73b2eb31d9f0c07dc67 --- /dev/null +++ b/annotator/zoe/zoedepth/models/layers/localbins_layers.py @@ -0,0 +1,169 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import torch +import torch.nn as nn + + +class SeedBinRegressor(nn.Module): + def __init__(self, in_features, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10): + """Bin center regressor network. Bin centers are bounded on (min_depth, max_depth) interval. + + Args: + in_features (int): input channels + n_bins (int, optional): Number of bin centers. Defaults to 16. + mlp_dim (int, optional): Hidden dimension. Defaults to 256. + min_depth (float, optional): Min depth value. Defaults to 1e-3. + max_depth (float, optional): Max depth value. Defaults to 10. + """ + super().__init__() + self.version = "1_1" + self.min_depth = min_depth + self.max_depth = max_depth + + self._net = nn.Sequential( + nn.Conv2d(in_features, mlp_dim, 1, 1, 0), + nn.ReLU(inplace=True), + nn.Conv2d(mlp_dim, n_bins, 1, 1, 0), + nn.ReLU(inplace=True) + ) + + def forward(self, x): + """ + Returns tensor of bin_width vectors (centers). One vector b for every pixel + """ + B = self._net(x) + eps = 1e-3 + B = B + eps + B_widths_normed = B / B.sum(dim=1, keepdim=True) + B_widths = (self.max_depth - self.min_depth) * \ + B_widths_normed # .shape NCHW + # pad has the form (left, right, top, bottom, front, back) + B_widths = nn.functional.pad( + B_widths, (0, 0, 0, 0, 1, 0), mode='constant', value=self.min_depth) + B_edges = torch.cumsum(B_widths, dim=1) # .shape NCHW + + B_centers = 0.5 * (B_edges[:, :-1, ...] + B_edges[:, 1:, ...]) + return B_widths_normed, B_centers + + +class SeedBinRegressorUnnormed(nn.Module): + def __init__(self, in_features, n_bins=16, mlp_dim=256, min_depth=1e-3, max_depth=10): + """Bin center regressor network. Bin centers are unbounded + + Args: + in_features (int): input channels + n_bins (int, optional): Number of bin centers. Defaults to 16. + mlp_dim (int, optional): Hidden dimension. Defaults to 256. + min_depth (float, optional): Not used. 
(for compatibility with SeedBinRegressor) + max_depth (float, optional): Not used. (for compatibility with SeedBinRegressor) + """ + super().__init__() + self.version = "1_1" + self._net = nn.Sequential( + nn.Conv2d(in_features, mlp_dim, 1, 1, 0), + nn.ReLU(inplace=True), + nn.Conv2d(mlp_dim, n_bins, 1, 1, 0), + nn.Softplus() + ) + + def forward(self, x): + """ + Returns tensor of bin_width vectors (centers). One vector b for every pixel + """ + B_centers = self._net(x) + return B_centers, B_centers + + +class Projector(nn.Module): + def __init__(self, in_features, out_features, mlp_dim=128): + """Projector MLP + + Args: + in_features (int): input channels + out_features (int): output channels + mlp_dim (int, optional): hidden dimension. Defaults to 128. + """ + super().__init__() + + self._net = nn.Sequential( + nn.Conv2d(in_features, mlp_dim, 1, 1, 0), + nn.ReLU(inplace=True), + nn.Conv2d(mlp_dim, out_features, 1, 1, 0), + ) + + def forward(self, x): + return self._net(x) + + + +class LinearSplitter(nn.Module): + def __init__(self, in_features, prev_nbins, split_factor=2, mlp_dim=128, min_depth=1e-3, max_depth=10): + super().__init__() + + self.prev_nbins = prev_nbins + self.split_factor = split_factor + self.min_depth = min_depth + self.max_depth = max_depth + + self._net = nn.Sequential( + nn.Conv2d(in_features, mlp_dim, 1, 1, 0), + nn.GELU(), + nn.Conv2d(mlp_dim, prev_nbins * split_factor, 1, 1, 0), + nn.ReLU() + ) + + def forward(self, x, b_prev, prev_b_embedding=None, interpolate=True, is_for_query=False): + """ + x : feature block; shape - n, c, h, w + b_prev : previous bin widths normed; shape - n, prev_nbins, h, w + """ + if prev_b_embedding is not None: + if interpolate: + prev_b_embedding = nn.functional.interpolate(prev_b_embedding, x.shape[-2:], mode='bilinear', align_corners=True) + x = x + prev_b_embedding + S = self._net(x) + eps = 1e-3 + S = S + eps + n, c, h, w = S.shape + S = S.view(n, self.prev_nbins, self.split_factor, h, w) + S_normed = S / S.sum(dim=2, keepdim=True) # fractional splits + + b_prev = nn.functional.interpolate(b_prev, (h,w), mode='bilinear', align_corners=True) + + + b_prev = b_prev / b_prev.sum(dim=1, keepdim=True) # renormalize for gurantees + # print(b_prev.shape, S_normed.shape) + # if is_for_query:(1).expand(-1, b_prev.size(0)//n, -1, -1, -1, -1).flatten(0,1) # TODO ? can replace all this with a single torch.repeat? + b = b_prev.unsqueeze(2) * S_normed + b = b.flatten(1,2) # .shape n, prev_nbins * split_factor, h, w + + # calculate bin centers for loss calculation + B_widths = (self.max_depth - self.min_depth) * b # .shape N, nprev * splitfactor, H, W + # pad has the form (left, right, top, bottom, front, back) + B_widths = nn.functional.pad(B_widths, (0,0,0,0,1,0), mode='constant', value=self.min_depth) + B_edges = torch.cumsum(B_widths, dim=1) # .shape NCHW + + B_centers = 0.5 * (B_edges[:, :-1, ...] 
+ B_edges[:,1:,...]) + return b, B_centers \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/layers/patch_transformer.py b/annotator/zoe/zoedepth/models/layers/patch_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..99d9e51a06b981bae45ce7dd64eaef19a4121991 --- /dev/null +++ b/annotator/zoe/zoedepth/models/layers/patch_transformer.py @@ -0,0 +1,91 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import torch +import torch.nn as nn + + +class PatchTransformerEncoder(nn.Module): + def __init__(self, in_channels, patch_size=10, embedding_dim=128, num_heads=4, use_class_token=False): + """ViT-like transformer block + + Args: + in_channels (int): Input channels + patch_size (int, optional): patch size. Defaults to 10. + embedding_dim (int, optional): Embedding dimension in transformer model. Defaults to 128. + num_heads (int, optional): number of attention heads. Defaults to 4. + use_class_token (bool, optional): Whether to use extra token at the start for global accumulation (called as "class token"). Defaults to False. 
+ """ + super(PatchTransformerEncoder, self).__init__() + self.use_class_token = use_class_token + encoder_layers = nn.TransformerEncoderLayer( + embedding_dim, num_heads, dim_feedforward=1024) + self.transformer_encoder = nn.TransformerEncoder( + encoder_layers, num_layers=4) # takes shape S,N,E + + self.embedding_convPxP = nn.Conv2d(in_channels, embedding_dim, + kernel_size=patch_size, stride=patch_size, padding=0) + + def positional_encoding_1d(self, sequence_length, batch_size, embedding_dim, device='cpu'): + """Generate positional encodings + + Args: + sequence_length (int): Sequence length + embedding_dim (int): Embedding dimension + + Returns: + torch.Tensor SBE: Positional encodings + """ + position = torch.arange( + 0, sequence_length, dtype=torch.float32, device=device).unsqueeze(1) + index = torch.arange( + 0, embedding_dim, 2, dtype=torch.float32, device=device).unsqueeze(0) + div_term = torch.exp(index * (-torch.log(torch.tensor(10000.0, device=device)) / embedding_dim)) + pos_encoding = position * div_term + pos_encoding = torch.cat([torch.sin(pos_encoding), torch.cos(pos_encoding)], dim=1) + pos_encoding = pos_encoding.unsqueeze(1).repeat(1, batch_size, 1) + return pos_encoding + + + def forward(self, x): + """Forward pass + + Args: + x (torch.Tensor - NCHW): Input feature tensor + + Returns: + torch.Tensor - SNE: Transformer output embeddings. S - sequence length (=HW/patch_size^2), N - batch size, E - embedding dim + """ + embeddings = self.embedding_convPxP(x).flatten( + 2) # .shape = n,c,s = n, embedding_dim, s + if self.use_class_token: + # extra special token at start ? + embeddings = nn.functional.pad(embeddings, (1, 0)) + + # change to S,N,E format required by transformer + embeddings = embeddings.permute(2, 0, 1) + S, N, E = embeddings.shape + embeddings = embeddings + self.positional_encoding_1d(S, N, E, device=embeddings.device) + x = self.transformer_encoder(embeddings) # .shape = S, N, E + return x diff --git a/annotator/zoe/zoedepth/models/model_io.py b/annotator/zoe/zoedepth/models/model_io.py new file mode 100644 index 0000000000000000000000000000000000000000..78b6579631dd847ac76651238cb5a948b5a66286 --- /dev/null +++ b/annotator/zoe/zoedepth/models/model_io.py @@ -0,0 +1,92 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# File author: Shariq Farooq Bhat + +import torch + +def load_state_dict(model, state_dict): + """Load state_dict into model, handling DataParallel and DistributedDataParallel. Also checks for "model" key in state_dict. + + DataParallel prefixes state_dict keys with 'module.' when saving. + If the model is not a DataParallel model but the state_dict is, then prefixes are removed. + If the model is a DataParallel model but the state_dict is not, then prefixes are added. + """ + state_dict = state_dict.get('model', state_dict) + # if model is a DataParallel model, then state_dict keys are prefixed with 'module.' + + do_prefix = isinstance( + model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)) + state = {} + for k, v in state_dict.items(): + if k.startswith('module.') and not do_prefix: + k = k[7:] + + if not k.startswith('module.') and do_prefix: + k = 'module.' + k + + state[k] = v + + model.load_state_dict(state) + print("Loaded successfully") + return model + + +def load_wts(model, checkpoint_path): + ckpt = torch.load(checkpoint_path, map_location='cpu') + return load_state_dict(model, ckpt) + + +def load_state_dict_from_url(model, url, **kwargs): + state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu', **kwargs) + return load_state_dict(model, state_dict) + + +def load_state_from_resource(model, resource: str): + """Loads weights to the model from a given resource. A resource can be of following types: + 1. URL. Prefixed with "url::" + e.g. url::http(s)://url.resource.com/ckpt.pt + + 2. Local path. Prefixed with "local::" + e.g. local::/path/to/ckpt.pt + + + Args: + model (torch.nn.Module): Model + resource (str): resource string + + Returns: + torch.nn.Module: Model with loaded weights + """ + print(f"Using pretrained resource {resource}") + + if resource.startswith('url::'): + url = resource.split('url::')[1] + return load_state_dict_from_url(model, url, progress=True) + + elif resource.startswith('local::'): + path = resource.split('local::')[1] + return load_wts(model, path) + + else: + raise ValueError("Invalid resource type, only url:: and local:: are supported") + \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/zoedepth/__init__.py b/annotator/zoe/zoedepth/models/zoedepth/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cc33f737d238766559f0e3a8def3c0b568f23b7f --- /dev/null +++ b/annotator/zoe/zoedepth/models/zoedepth/__init__.py @@ -0,0 +1,31 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +from .zoedepth_v1 import ZoeDepth + +all_versions = { + "v1": ZoeDepth, +} + +get_version = lambda v : all_versions[v] \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/zoedepth/__pycache__/__init__.cpython-310.pyc b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff587448c5fa25f6424000c523787e4bd30eeed8 Binary files /dev/null and b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/__init__.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/zoedepth/__pycache__/__init__.cpython-38.pyc b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..555d0fcca3af40dc0e6fb31077282b0d40bacc5a Binary files /dev/null and b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/__init__.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/zoedepth/__pycache__/__init__.cpython-39.pyc b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c818eaea09ad2086b5d4340a895d1aadb3d90f57 Binary files /dev/null and b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/__init__.cpython-39.pyc differ diff --git a/annotator/zoe/zoedepth/models/zoedepth/__pycache__/zoedepth_v1.cpython-310.pyc b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/zoedepth_v1.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9a1a8cb0217e029a4affce1fd2c9a5a3167fee8 Binary files /dev/null and b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/zoedepth_v1.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/models/zoedepth/__pycache__/zoedepth_v1.cpython-38.pyc b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/zoedepth_v1.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1132d5a059c82e85f2d97188c0149cef8eb02240 Binary files /dev/null and b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/zoedepth_v1.cpython-38.pyc differ diff --git a/annotator/zoe/zoedepth/models/zoedepth/__pycache__/zoedepth_v1.cpython-39.pyc b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/zoedepth_v1.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed2c45676334a9a56e9768dfcf54e1510f9f1c09 Binary files /dev/null and b/annotator/zoe/zoedepth/models/zoedepth/__pycache__/zoedepth_v1.cpython-39.pyc differ diff --git a/annotator/zoe/zoedepth/models/zoedepth/config_zoedepth.json b/annotator/zoe/zoedepth/models/zoedepth/config_zoedepth.json new file mode 100644 index 0000000000000000000000000000000000000000..3112ed78c89f00e1d13f5d6e5be87cd3216b6dc7 --- /dev/null +++ b/annotator/zoe/zoedepth/models/zoedepth/config_zoedepth.json @@ -0,0 +1,58 @@ +{ + "model": { + "name": "ZoeDepth", + "version_name": "v1", + "n_bins": 64, + "bin_embedding_dim": 128, + "bin_centers_type": "softplus", + "n_attractors":[16, 8, 4, 1], + "attractor_alpha": 1000, + "attractor_gamma": 2, + "attractor_kind" : "mean", + "attractor_type" : "inv", + "midas_model_type" : "DPT_BEiT_L_384", + "min_temp": 0.0212, + "max_temp": 50.0, + "output_distribution": "logbinomial", + 
"memory_efficient": true, + "inverse_midas": false, + "img_size": [384, 512] + }, + + "train": { + "train_midas": true, + "use_pretrained_midas": true, + "trainer": "zoedepth", + "epochs": 5, + "bs": 16, + "optim_kwargs": {"lr": 0.000161, "wd": 0.01}, + "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, + "same_lr": false, + "w_si": 1, + "w_domain": 0.2, + "w_reg": 0, + "w_grad": 0, + "avoid_boundary": false, + "random_crop": false, + "input_width": 640, + "input_height": 480, + "midas_lr_factor": 1, + "encoder_lr_factor":10, + "pos_enc_lr_factor":10, + "freeze_midas_bn": true + + }, + + "infer":{ + "train_midas": false, + "use_pretrained_midas": false, + "pretrained_resource" : null, + "force_keep_ar": true + }, + + "eval":{ + "train_midas": false, + "use_pretrained_midas": false, + "pretrained_resource" : null + } +} \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/zoedepth/config_zoedepth_kitti.json b/annotator/zoe/zoedepth/models/zoedepth/config_zoedepth_kitti.json new file mode 100644 index 0000000000000000000000000000000000000000..b51802aa44b91c39e15aacaac4b5ab6bec884414 --- /dev/null +++ b/annotator/zoe/zoedepth/models/zoedepth/config_zoedepth_kitti.json @@ -0,0 +1,22 @@ +{ + "model": { + "bin_centers_type": "normed", + "img_size": [384, 768] + }, + + "train": { + }, + + "infer":{ + "train_midas": false, + "use_pretrained_midas": false, + "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt", + "force_keep_ar": true + }, + + "eval":{ + "train_midas": false, + "use_pretrained_midas": false, + "pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt" + } +} \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/zoedepth/zoedepth_v1.py b/annotator/zoe/zoedepth/models/zoedepth/zoedepth_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..bc931b059d6165c84e8ff4f09d5c62d19930cee9 --- /dev/null +++ b/annotator/zoe/zoedepth/models/zoedepth/zoedepth_v1.py @@ -0,0 +1,250 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# File author: Shariq Farooq Bhat + +import itertools + +import torch +import torch.nn as nn +from ..depth_model import DepthModel +from ..base_models.midas import MidasCore +from ..layers.attractor import AttractorLayer, AttractorLayerUnnormed +from ..layers.dist_layers import ConditionalLogBinomial +from ..layers.localbins_layers import (Projector, SeedBinRegressor, + SeedBinRegressorUnnormed) +from ..model_io import load_state_from_resource + + +class ZoeDepth(DepthModel): + def __init__(self, core, n_bins=64, bin_centers_type="softplus", bin_embedding_dim=128, min_depth=1e-3, max_depth=10, + n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', min_temp=5, max_temp=50, train_midas=True, + midas_lr_factor=10, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs): + """ZoeDepth model. This is the version of ZoeDepth that has a single metric head + + Args: + core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features + n_bins (int, optional): Number of bin centers. Defaults to 64. + bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers. + For "softplus", softplus activation is used and thus are unbounded. Defaults to "softplus". + bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128. + min_depth (float, optional): Lower bound for normed bin centers. Defaults to 1e-3. + max_depth (float, optional): Upper bound for normed bin centers. Defaults to 10. + n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1]. + attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300. + attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2. + attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'. + attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'. + min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5. + max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50. + train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True. + midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10. + encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10. + pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10. 
+ """ + super().__init__() + + self.core = core + self.max_depth = max_depth + self.min_depth = min_depth + self.min_temp = min_temp + self.bin_centers_type = bin_centers_type + + self.midas_lr_factor = midas_lr_factor + self.encoder_lr_factor = encoder_lr_factor + self.pos_enc_lr_factor = pos_enc_lr_factor + self.train_midas = train_midas + self.inverse_midas = inverse_midas + + if self.encoder_lr_factor <= 0: + self.core.freeze_encoder( + freeze_rel_pos=self.pos_enc_lr_factor <= 0) + + N_MIDAS_OUT = 32 + btlnck_features = self.core.output_channels[0] + num_out_features = self.core.output_channels[1:] + + self.conv2 = nn.Conv2d(btlnck_features, btlnck_features, + kernel_size=1, stride=1, padding=0) # btlnck conv + + if bin_centers_type == "normed": + SeedBinRegressorLayer = SeedBinRegressor + Attractor = AttractorLayer + elif bin_centers_type == "softplus": + SeedBinRegressorLayer = SeedBinRegressorUnnormed + Attractor = AttractorLayerUnnormed + elif bin_centers_type == "hybrid1": + SeedBinRegressorLayer = SeedBinRegressor + Attractor = AttractorLayerUnnormed + elif bin_centers_type == "hybrid2": + SeedBinRegressorLayer = SeedBinRegressorUnnormed + Attractor = AttractorLayer + else: + raise ValueError( + "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'") + + self.seed_bin_regressor = SeedBinRegressorLayer( + btlnck_features, n_bins=n_bins, min_depth=min_depth, max_depth=max_depth) + self.seed_projector = Projector(btlnck_features, bin_embedding_dim) + self.projectors = nn.ModuleList([ + Projector(num_out, bin_embedding_dim) + for num_out in num_out_features + ]) + self.attractors = nn.ModuleList([ + Attractor(bin_embedding_dim, n_bins, n_attractors=n_attractors[i], min_depth=min_depth, max_depth=max_depth, + alpha=attractor_alpha, gamma=attractor_gamma, kind=attractor_kind, attractor_type=attractor_type) + for i in range(len(num_out_features)) + ]) + + last_in = N_MIDAS_OUT + 1 # +1 for relative depth + + # use log binomial instead of softmax + self.conditional_log_binomial = ConditionalLogBinomial( + last_in, bin_embedding_dim, n_classes=n_bins, min_temp=min_temp, max_temp=max_temp) + + def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs): + """ + Args: + x (torch.Tensor): Input image tensor of shape (B, C, H, W) + return_final_centers (bool, optional): Whether to return the final bin centers. Defaults to False. + denorm (bool, optional): Whether to denormalize the input image. This reverses ImageNet normalization as midas normalization is different. Defaults to False. + return_probs (bool, optional): Whether to return the output probability distribution. Defaults to False. + + Returns: + dict: Dictionary containing the following keys: + - rel_depth (torch.Tensor): Relative depth map of shape (B, H, W) + - metric_depth (torch.Tensor): Metric depth map of shape (B, 1, H, W) + - bin_centers (torch.Tensor): Bin centers of shape (B, n_bins). Present only if return_final_centers is True + - probs (torch.Tensor): Output probability distribution of shape (B, n_bins, H, W). 
Present only if return_probs is True + + """ + b, c, h, w = x.shape + # print("input shape ", x.shape) + self.orig_input_width = w + self.orig_input_height = h + rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True) + # print("output shapes", rel_depth.shape, out.shape) + + outconv_activation = out[0] + btlnck = out[1] + x_blocks = out[2:] + + x_d0 = self.conv2(btlnck) + x = x_d0 + _, seed_b_centers = self.seed_bin_regressor(x) + + if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2': + b_prev = (seed_b_centers - self.min_depth) / \ + (self.max_depth - self.min_depth) + else: + b_prev = seed_b_centers + + prev_b_embedding = self.seed_projector(x) + + # unroll this loop for better performance + for projector, attractor, x in zip(self.projectors, self.attractors, x_blocks): + b_embedding = projector(x) + b, b_centers = attractor( + b_embedding, b_prev, prev_b_embedding, interpolate=True) + b_prev = b.clone() + prev_b_embedding = b_embedding.clone() + + last = outconv_activation + + if self.inverse_midas: + # invert depth followed by normalization + rel_depth = 1.0 / (rel_depth + 1e-6) + rel_depth = (rel_depth - rel_depth.min()) / \ + (rel_depth.max() - rel_depth.min()) + # concat rel depth with last. First interpolate rel depth to last size + rel_cond = rel_depth.unsqueeze(1) + rel_cond = nn.functional.interpolate( + rel_cond, size=last.shape[2:], mode='bilinear', align_corners=True) + last = torch.cat([last, rel_cond], dim=1) + + b_embedding = nn.functional.interpolate( + b_embedding, last.shape[-2:], mode='bilinear', align_corners=True) + x = self.conditional_log_binomial(last, b_embedding) + + # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor + # print(x.shape, b_centers.shape) + b_centers = nn.functional.interpolate( + b_centers, x.shape[-2:], mode='bilinear', align_corners=True) + out = torch.sum(x * b_centers, dim=1, keepdim=True) + + # Structure output dict + output = dict(metric_depth=out) + if return_final_centers or return_probs: + output['bin_centers'] = b_centers + + if return_probs: + output['probs'] = x + + return output + + def get_lr_params(self, lr): + """ + Learning rate configuration for different layers of the model + Args: + lr (float) : Base learning rate + Returns: + list : list of parameters to optimize and their learning rates, in the format required by torch optimizers. 
+ """ + param_conf = [] + if self.train_midas: + if self.encoder_lr_factor > 0: + param_conf.append({'params': self.core.get_enc_params_except_rel_pos( + ), 'lr': lr / self.encoder_lr_factor}) + + if self.pos_enc_lr_factor > 0: + param_conf.append( + {'params': self.core.get_rel_pos_params(), 'lr': lr / self.pos_enc_lr_factor}) + + midas_params = self.core.core.scratch.parameters() + midas_lr_factor = self.midas_lr_factor + param_conf.append( + {'params': midas_params, 'lr': lr / midas_lr_factor}) + + remaining_modules = [] + for name, child in self.named_children(): + if name != 'core': + remaining_modules.append(child) + remaining_params = itertools.chain( + *[child.parameters() for child in remaining_modules]) + + param_conf.append({'params': remaining_params, 'lr': lr}) + + return param_conf + + @staticmethod + def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs): + core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas, + train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs) + model = ZoeDepth(core, **kwargs) + if pretrained_resource: + assert isinstance(pretrained_resource, str), "pretrained_resource must be a string" + model = load_state_from_resource(model, pretrained_resource) + return model + + @staticmethod + def build_from_config(config): + return ZoeDepth.build(**config) diff --git a/annotator/zoe/zoedepth/models/zoedepth_nk/__init__.py b/annotator/zoe/zoedepth/models/zoedepth_nk/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..513a278b939c10c010e3c0250ec73544d5663886 --- /dev/null +++ b/annotator/zoe/zoedepth/models/zoedepth_nk/__init__.py @@ -0,0 +1,31 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +# File author: Shariq Farooq Bhat + +from .zoedepth_nk_v1 import ZoeDepthNK + +all_versions = { + "v1": ZoeDepthNK, +} + +get_version = lambda v : all_versions[v] \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json b/annotator/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json new file mode 100644 index 0000000000000000000000000000000000000000..42bab2a3ad159a09599a5aba270c491021a3cf1a --- /dev/null +++ b/annotator/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json @@ -0,0 +1,67 @@ +{ + "model": { + "name": "ZoeDepthNK", + "version_name": "v1", + "bin_conf" : [ + { + "name": "nyu", + "n_bins": 64, + "min_depth": 1e-3, + "max_depth": 10.0 + }, + { + "name": "kitti", + "n_bins": 64, + "min_depth": 1e-3, + "max_depth": 80.0 + } + ], + "bin_embedding_dim": 128, + "bin_centers_type": "softplus", + "n_attractors":[16, 8, 4, 1], + "attractor_alpha": 1000, + "attractor_gamma": 2, + "attractor_kind" : "mean", + "attractor_type" : "inv", + "min_temp": 0.0212, + "max_temp": 50.0, + "memory_efficient": true, + "midas_model_type" : "DPT_BEiT_L_384", + "img_size": [384, 512] + }, + + "train": { + "train_midas": true, + "use_pretrained_midas": true, + "trainer": "zoedepth_nk", + "epochs": 5, + "bs": 16, + "optim_kwargs": {"lr": 0.0002512, "wd": 0.01}, + "sched_kwargs": {"div_factor": 1, "final_div_factor": 10000, "pct_start": 0.7, "three_phase":false, "cycle_momentum": true}, + "same_lr": false, + "w_si": 1, + "w_domain": 100, + "avoid_boundary": false, + "random_crop": false, + "input_width": 640, + "input_height": 480, + "w_grad": 0, + "w_reg": 0, + "midas_lr_factor": 10, + "encoder_lr_factor":10, + "pos_enc_lr_factor":10 + }, + + "infer": { + "train_midas": false, + "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt", + "use_pretrained_midas": false, + "force_keep_ar": true + }, + + "eval": { + "train_midas": false, + "pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt", + "use_pretrained_midas": false + } +} \ No newline at end of file diff --git a/annotator/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py b/annotator/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..7368ae8031188a9f946d9d3f29633c96e791e68e --- /dev/null +++ b/annotator/zoe/zoedepth/models/zoedepth_nk/zoedepth_nk_v1.py @@ -0,0 +1,333 @@ +# MIT License + +# Copyright (c) 2022 Intelligent Systems Lab Org + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
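The `__init__.py` above registers the model under a version key, and `config_zoedepth_nk.json` carries its hyperparameters. A sketch of how the two could be stitched together; in ZoeDepth the flattening of the JSON sections into build kwargs is handled by `zoedepth.utils.config`, so the manual merge below is an assumption for illustration only:

```python
import json

# Assumed import path for the registry defined in the __init__.py above.
from zoedepth.models.zoedepth_nk import get_version

with open("annotator/zoe/zoedepth/models/zoedepth_nk/config_zoedepth_nk.json") as f:
    conf = json.load(f)

model_conf = dict(conf["model"])        # name, version_name, bin_conf, attractor settings, ...
model_conf.update(conf["infer"])        # pretrained_resource, train_midas=False, ...
model_cls = get_version(model_conf.pop("version_name"))   # -> ZoeDepthNK
model = model_cls.build_from_config(model_conf)           # calls ZoeDepthNK.build(**model_conf)
```

Note that `forward` in the file below reads `bin_conf` entries via attribute access (`c.name`), so the repo's actual config loader wraps them in attribute-style dicts (the `easydict` utility touched elsewhere in this diff) rather than the plain dicts `json.load` returns.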
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# File author: Shariq Farooq Bhat + +import itertools + +import torch +import torch.nn as nn + +from zoedepth.models.depth_model import DepthModel +from zoedepth.models.base_models.midas import MidasCore +from zoedepth.models.layers.attractor import AttractorLayer, AttractorLayerUnnormed +from zoedepth.models.layers.dist_layers import ConditionalLogBinomial +from zoedepth.models.layers.localbins_layers import (Projector, SeedBinRegressor, + SeedBinRegressorUnnormed) +from zoedepth.models.layers.patch_transformer import PatchTransformerEncoder +from zoedepth.models.model_io import load_state_from_resource + + +class ZoeDepthNK(DepthModel): + def __init__(self, core, bin_conf, bin_centers_type="softplus", bin_embedding_dim=128, + n_attractors=[16, 8, 4, 1], attractor_alpha=300, attractor_gamma=2, attractor_kind='sum', attractor_type='exp', + min_temp=5, max_temp=50, + memory_efficient=False, train_midas=True, + is_midas_pretrained=True, midas_lr_factor=1, encoder_lr_factor=10, pos_enc_lr_factor=10, inverse_midas=False, **kwargs): + """ZoeDepthNK model. This is the version of ZoeDepth that has two metric heads and uses a learned router to route to experts. + + Args: + core (models.base_models.midas.MidasCore): The base midas model that is used for extraction of "relative" features + + bin_conf (List[dict]): A list of dictionaries that contain the bin configuration for each metric head. Each dictionary should contain the following keys: + "name" (str, typically same as the dataset name), "n_bins" (int), "min_depth" (float), "max_depth" (float) + + The length of this list determines the number of metric heads. + bin_centers_type (str, optional): "normed" or "softplus". Activation type used for bin centers. For "normed" bin centers, linear normalization trick is applied. This results in bounded bin centers. + For "softplus", softplus activation is used and thus are unbounded. Defaults to "normed". + bin_embedding_dim (int, optional): bin embedding dimension. Defaults to 128. + + n_attractors (List[int], optional): Number of bin attractors at decoder layers. Defaults to [16, 8, 4, 1]. + attractor_alpha (int, optional): Proportional attractor strength. Refer to models.layers.attractor for more details. Defaults to 300. + attractor_gamma (int, optional): Exponential attractor strength. Refer to models.layers.attractor for more details. Defaults to 2. + attractor_kind (str, optional): Attraction aggregation "sum" or "mean". Defaults to 'sum'. + attractor_type (str, optional): Type of attractor to use; "inv" (Inverse attractor) or "exp" (Exponential attractor). Defaults to 'exp'. + + min_temp (int, optional): Lower bound for temperature of output probability distribution. Defaults to 5. + max_temp (int, optional): Upper bound for temperature of output probability distribution. Defaults to 50. + + memory_efficient (bool, optional): Whether to use memory efficient version of attractor layers. Memory efficient version is slower but is recommended incase of multiple metric heads in order save GPU memory. Defaults to False. + + train_midas (bool, optional): Whether to train "core", the base midas model. Defaults to True. + is_midas_pretrained (bool, optional): Is "core" pretrained? Defaults to True. 
+ midas_lr_factor (int, optional): Learning rate reduction factor for base midas model except its encoder and positional encodings. Defaults to 10. + encoder_lr_factor (int, optional): Learning rate reduction factor for the encoder in midas model. Defaults to 10. + pos_enc_lr_factor (int, optional): Learning rate reduction factor for positional encodings in the base midas model. Defaults to 10. + + """ + + super().__init__() + + self.core = core + self.bin_conf = bin_conf + self.min_temp = min_temp + self.max_temp = max_temp + self.memory_efficient = memory_efficient + self.train_midas = train_midas + self.is_midas_pretrained = is_midas_pretrained + self.midas_lr_factor = midas_lr_factor + self.encoder_lr_factor = encoder_lr_factor + self.pos_enc_lr_factor = pos_enc_lr_factor + self.inverse_midas = inverse_midas + + N_MIDAS_OUT = 32 + btlnck_features = self.core.output_channels[0] + num_out_features = self.core.output_channels[1:] + # self.scales = [16, 8, 4, 2] # spatial scale factors + + self.conv2 = nn.Conv2d( + btlnck_features, btlnck_features, kernel_size=1, stride=1, padding=0) + + # Transformer classifier on the bottleneck + self.patch_transformer = PatchTransformerEncoder( + btlnck_features, 1, 128, use_class_token=True) + self.mlp_classifier = nn.Sequential( + nn.Linear(128, 128), + nn.ReLU(), + nn.Linear(128, 2) + ) + + if bin_centers_type == "normed": + SeedBinRegressorLayer = SeedBinRegressor + Attractor = AttractorLayer + elif bin_centers_type == "softplus": + SeedBinRegressorLayer = SeedBinRegressorUnnormed + Attractor = AttractorLayerUnnormed + elif bin_centers_type == "hybrid1": + SeedBinRegressorLayer = SeedBinRegressor + Attractor = AttractorLayerUnnormed + elif bin_centers_type == "hybrid2": + SeedBinRegressorLayer = SeedBinRegressorUnnormed + Attractor = AttractorLayer + else: + raise ValueError( + "bin_centers_type should be one of 'normed', 'softplus', 'hybrid1', 'hybrid2'") + self.bin_centers_type = bin_centers_type + # We have bins for each bin conf. + # Create a map (ModuleDict) of 'name' -> seed_bin_regressor + self.seed_bin_regressors = nn.ModuleDict( + {conf['name']: SeedBinRegressorLayer(btlnck_features, conf["n_bins"], mlp_dim=bin_embedding_dim//2, min_depth=conf["min_depth"], max_depth=conf["max_depth"]) + for conf in bin_conf} + ) + + self.seed_projector = Projector( + btlnck_features, bin_embedding_dim, mlp_dim=bin_embedding_dim//2) + self.projectors = nn.ModuleList([ + Projector(num_out, bin_embedding_dim, mlp_dim=bin_embedding_dim//2) + for num_out in num_out_features + ]) + + # Create a map (ModuleDict) of 'name' -> attractors (ModuleList) + self.attractors = nn.ModuleDict( + {conf['name']: nn.ModuleList([ + Attractor(bin_embedding_dim, n_attractors[i], + mlp_dim=bin_embedding_dim, alpha=attractor_alpha, + gamma=attractor_gamma, kind=attractor_kind, + attractor_type=attractor_type, memory_efficient=memory_efficient, + min_depth=conf["min_depth"], max_depth=conf["max_depth"]) + for i in range(len(n_attractors)) + ]) + for conf in bin_conf} + ) + + last_in = N_MIDAS_OUT + # conditional log binomial for each bin conf + self.conditional_log_binomial = nn.ModuleDict( + {conf['name']: ConditionalLogBinomial(last_in, bin_embedding_dim, conf['n_bins'], bottleneck_factor=4, min_temp=self.min_temp, max_temp=self.max_temp) + for conf in bin_conf} + ) + + def forward(self, x, return_final_centers=False, denorm=False, return_probs=False, **kwargs): + """ + Args: + x (torch.Tensor): Input image tensor of shape (B, C, H, W). Assumes all images are from the same domain. 
+ return_final_centers (bool, optional): Whether to return the final centers of the attractors. Defaults to False. + denorm (bool, optional): Whether to denormalize the input image. Defaults to False. + return_probs (bool, optional): Whether to return the probabilities of the bins. Defaults to False. + + Returns: + dict: Dictionary of outputs with keys: + - "rel_depth": Relative depth map of shape (B, 1, H, W) + - "metric_depth": Metric depth map of shape (B, 1, H, W) + - "domain_logits": Domain logits of shape (B, 2) + - "bin_centers": Bin centers of shape (B, N, H, W). Present only if return_final_centers is True + - "probs": Bin probabilities of shape (B, N, H, W). Present only if return_probs is True + """ + b, c, h, w = x.shape + self.orig_input_width = w + self.orig_input_height = h + rel_depth, out = self.core(x, denorm=denorm, return_rel_depth=True) + + outconv_activation = out[0] + btlnck = out[1] + x_blocks = out[2:] + + x_d0 = self.conv2(btlnck) + x = x_d0 + + # Predict which path to take + embedding = self.patch_transformer(x)[0] # N, E + domain_logits = self.mlp_classifier(embedding) # N, 2 + domain_vote = torch.softmax(domain_logits.sum( + dim=0, keepdim=True), dim=-1) # 1, 2 + + # Get the path + bin_conf_name = ["nyu", "kitti"][torch.argmax( + domain_vote, dim=-1).squeeze().item()] + + try: + conf = [c for c in self.bin_conf if c.name == bin_conf_name][0] + except IndexError: + raise ValueError( + f"bin_conf_name {bin_conf_name} not found in bin_confs") + + min_depth = conf['min_depth'] + max_depth = conf['max_depth'] + + seed_bin_regressor = self.seed_bin_regressors[bin_conf_name] + _, seed_b_centers = seed_bin_regressor(x) + if self.bin_centers_type == 'normed' or self.bin_centers_type == 'hybrid2': + b_prev = (seed_b_centers - min_depth)/(max_depth - min_depth) + else: + b_prev = seed_b_centers + prev_b_embedding = self.seed_projector(x) + + attractors = self.attractors[bin_conf_name] + for projector, attractor, x in zip(self.projectors, attractors, x_blocks): + b_embedding = projector(x) + b, b_centers = attractor( + b_embedding, b_prev, prev_b_embedding, interpolate=True) + b_prev = b + prev_b_embedding = b_embedding + + last = outconv_activation + + b_centers = nn.functional.interpolate( + b_centers, last.shape[-2:], mode='bilinear', align_corners=True) + b_embedding = nn.functional.interpolate( + b_embedding, last.shape[-2:], mode='bilinear', align_corners=True) + + clb = self.conditional_log_binomial[bin_conf_name] + x = clb(last, b_embedding) + + # Now depth value is Sum px * cx , where cx are bin_centers from the last bin tensor + # print(x.shape, b_centers.shape) + # b_centers = nn.functional.interpolate(b_centers, x.shape[-2:], mode='bilinear', align_corners=True) + out = torch.sum(x * b_centers, dim=1, keepdim=True) + + output = dict(domain_logits=domain_logits, metric_depth=out) + if return_final_centers or return_probs: + output['bin_centers'] = b_centers + + if return_probs: + output['probs'] = x + return output + + def get_lr_params(self, lr): + """ + Learning rate configuration for different layers of the model + + Args: + lr (float) : Base learning rate + Returns: + list : list of parameters to optimize and their learning rates, in the format required by torch optimizers. 
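The routing step near the top of `forward` above decides, per batch, whether the NYU head (indoor, 10 m range) or the KITTI head (outdoor, 80 m range) is used: a single class token from the patch transformer is classified into two domains, and the logits are summed over the batch so every frame in the batch takes the same path. The same logic in isolation, with random tensors standing in for the bottleneck embedding:

```python
import torch
import torch.nn as nn

B, E = 4, 128
embedding = torch.randn(B, E)            # stand-in for self.patch_transformer(x)[0]
mlp_classifier = nn.Sequential(          # same shape as self.mlp_classifier above
    nn.Linear(E, E), nn.ReLU(), nn.Linear(E, 2))

domain_logits = mlp_classifier(embedding)                                    # (B, 2)
domain_vote = torch.softmax(domain_logits.sum(dim=0, keepdim=True), dim=-1)  # (1, 2): one vote per batch
bin_conf_name = ["nyu", "kitti"][torch.argmax(domain_vote, dim=-1).squeeze().item()]
print(bin_conf_name)  # selects which seed_bin_regressor / attractors / log-binomial head is used
```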
+ """ + param_conf = [] + if self.train_midas: + def get_rel_pos_params(): + for name, p in self.core.core.pretrained.named_parameters(): + if "relative_position" in name: + yield p + + def get_enc_params_except_rel_pos(): + for name, p in self.core.core.pretrained.named_parameters(): + if "relative_position" not in name: + yield p + + encoder_params = get_enc_params_except_rel_pos() + rel_pos_params = get_rel_pos_params() + midas_params = self.core.core.scratch.parameters() + midas_lr_factor = self.midas_lr_factor if self.is_midas_pretrained else 1.0 + param_conf.extend([ + {'params': encoder_params, 'lr': lr / self.encoder_lr_factor}, + {'params': rel_pos_params, 'lr': lr / self.pos_enc_lr_factor}, + {'params': midas_params, 'lr': lr / midas_lr_factor} + ]) + + remaining_modules = [] + for name, child in self.named_children(): + if name != 'core': + remaining_modules.append(child) + remaining_params = itertools.chain( + *[child.parameters() for child in remaining_modules]) + param_conf.append({'params': remaining_params, 'lr': lr}) + return param_conf + + def get_conf_parameters(self, conf_name): + """ + Returns parameters of all the ModuleDicts children that are exclusively used for the given bin configuration + """ + params = [] + for name, child in self.named_children(): + if isinstance(child, nn.ModuleDict): + for bin_conf_name, module in child.items(): + if bin_conf_name == conf_name: + params += list(module.parameters()) + return params + + def freeze_conf(self, conf_name): + """ + Freezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration + """ + for p in self.get_conf_parameters(conf_name): + p.requires_grad = False + + def unfreeze_conf(self, conf_name): + """ + Unfreezes all the parameters of all the ModuleDicts children that are exclusively used for the given bin configuration + """ + for p in self.get_conf_parameters(conf_name): + p.requires_grad = True + + def freeze_all_confs(self): + """ + Freezes all the parameters of all the ModuleDicts children + """ + for name, child in self.named_children(): + if isinstance(child, nn.ModuleDict): + for bin_conf_name, module in child.items(): + for p in module.parameters(): + p.requires_grad = False + + @staticmethod + def build(midas_model_type="DPT_BEiT_L_384", pretrained_resource=None, use_pretrained_midas=False, train_midas=False, freeze_midas_bn=True, **kwargs): + core = MidasCore.build(midas_model_type=midas_model_type, use_pretrained_midas=use_pretrained_midas, + train_midas=train_midas, fetch_features=True, freeze_bn=freeze_midas_bn, **kwargs) + model = ZoeDepthNK(core, **kwargs) + if pretrained_resource: + assert isinstance(pretrained_resource, str), "pretrained_resource must be a string" + model = load_state_from_resource(model, pretrained_resource) + return model + + @staticmethod + def build_from_config(config): + return ZoeDepthNK.build(**config) diff --git a/annotator/zoe/zoedepth/utils/__pycache__/__init__.cpython-310.pyc b/annotator/zoe/zoedepth/utils/__pycache__/__init__.cpython-310.pyc index e213c1559b6db13234e834170dac3c1cdbf09768..e376e3fda0457dd5a8b6b3da859e8c4f6f4ab788 100644 Binary files a/annotator/zoe/zoedepth/utils/__pycache__/__init__.cpython-310.pyc and b/annotator/zoe/zoedepth/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/utils/__pycache__/arg_utils.cpython-310.pyc b/annotator/zoe/zoedepth/utils/__pycache__/arg_utils.cpython-310.pyc index 
fd3dbc6159d6f06090d27b32f9fe5590656c45b6..417e57101efc8035a31fdb2dd5f144889e893e70 100644 Binary files a/annotator/zoe/zoedepth/utils/__pycache__/arg_utils.cpython-310.pyc and b/annotator/zoe/zoedepth/utils/__pycache__/arg_utils.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/utils/__pycache__/config.cpython-310.pyc b/annotator/zoe/zoedepth/utils/__pycache__/config.cpython-310.pyc index 3c5c28e3dd2b018ff0d0715e9468d8f318d2caed..beddc3413efaa34139489e912e85b0c9d609a7d9 100644 Binary files a/annotator/zoe/zoedepth/utils/__pycache__/config.cpython-310.pyc and b/annotator/zoe/zoedepth/utils/__pycache__/config.cpython-310.pyc differ diff --git a/annotator/zoe/zoedepth/utils/easydict/__pycache__/__init__.cpython-310.pyc b/annotator/zoe/zoedepth/utils/easydict/__pycache__/__init__.cpython-310.pyc index caeb1d02f6838dc7100623f7667ff6af87427c4f..159eba8528d09f850541da80cd532c6df200d81b 100644 Binary files a/annotator/zoe/zoedepth/utils/easydict/__pycache__/__init__.cpython-310.pyc and b/annotator/zoe/zoedepth/utils/easydict/__pycache__/__init__.cpython-310.pyc differ diff --git a/config/class_level/car/posche.yaml b/config/class_level/car/posche.yaml new file mode 100644 index 0000000000000000000000000000000000000000..477ed66cb0455a46806c07da5e8e914bdbd8d3e9 --- /dev/null +++ b/config/class_level/car/posche.yaml @@ -0,0 +1,52 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/class_level/car/car_to_posche + +dataset_config: + path: "data/car/car" #"data/general_object/car_turn/car_turn_start_20" + prompt: A car + start_sample_frame: 0 + n_sample_frame: 12 + sampling_rate: 1 + layout_mask_dir: "data/car/layout_masks" + layout_mask_order: ['car','bg'] + negative_promot: "jittery" + + +control_config: + control_type: "depth_zoe" + pretrained_controlnet_path: "./ckpt/sd-controlnet-depth" + controlnet_conditioning_scale: 1.0 + + +editing_config: + # ddim_inversion_steps: 100 + use_invertion_latents: true + inject_step: 10 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['A red porsche car driving before the autumn view lawn','A red porsche car','autumn view lawn'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + +model_config: + # lora: 160 + # temporal_downsample_time: 4 + # SparseCausalAttention_index: ['first'] + # least_sc_channel: 640 + # least_sc_channel: 100000 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/class_level/running_two_man/running_man2spider.yaml b/config/class_level/running_two_man/running_man2spider.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fdd8cec023d10070766361135c9aec67fb1e6ed2 --- /dev/null +++ b/config/class_level/running_two_man/running_man2spider.yaml @@ -0,0 +1,45 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/run_two_man/class_level/man2spiderman + +dataset_config: + path: "data/run_two_man/run_two_man_fr2" + prompt: Man in red hoddie and man in gray shirt are jogging in forest + n_sample_frame: 16 + start_sample_frame: 0 + sampling_rate: 2 + layout_mask_dir: "./data/run_two_man/layout_masks_fr2" + layout_mask_order: ['man_class','ground','full_trees'] + negative_promot: "ugly, blurry, low res, 
unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: True + face: False + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: false + use_freeu: false + editing_prompts: [ + ['Two Spiderman are jogging on grass meadow before cherry trees','Two Spiderman','grass meadow','cherry trees'], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/class_level/tennis/1cls_man2iron_man.yaml b/config/class_level/tennis/1cls_man2iron_man.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86fb930c2f507c2aeac643141d03544681577cad --- /dev/null +++ b/config/class_level/tennis/1cls_man2iron_man.yaml @@ -0,0 +1,52 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/class_level/tennis/man_to_ironman + +dataset_config: + path: "data/tennis/tennis" + prompt: a man + n_sample_frame: 24 + sampling_rate: 2 + layout_mask_dir: "./data/tennis/layout_masks" + layout_mask_order: ["man"] + negative_promot: "jittery, ugly, blurry, low res, unrealistic, unaesthetic" + +# use_invertion_latents: True +control_config: + control_type: "openpose" + pretrained_controlnet_path: ./ckpt/sd-controlnet-openpose + #"/home/xianyang/Data/code/FateZero/ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: false + face: false + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: false + use_freeu: false + editing_prompts: [ + ['An Iron Man'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0.25 + + +model_config: + # lora: 160 + # temporal_downsample_time: 4 + # SparseCausalAttention_index: ['first'] + # least_sc_channel: 640 + # least_sc_channel: 100000 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/class_level/tennis/3cls_batman_snow-court_iced-wall.yaml b/config/class_level/tennis/3cls_batman_snow-court_iced-wall.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29a8af6b094d2d00fe79446755ae6b05e8b6fa28 --- /dev/null +++ b/config/class_level/tennis/3cls_batman_snow-court_iced-wall.yaml @@ -0,0 +1,45 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/class_level/tennis/3cls_man_to_batman_snow-court_iced-wall + +dataset_config: + path: "data/tennis/tennis" + prompt: a man is on a clay court before a stone wall. 
+ n_sample_frame: 24 + sampling_rate: 2 + layout_mask_dir: "./data/tennis/layout_masks" + layout_mask_order: ["man","court",'stone_wall'] + negative_promot: "jittery, ugly, blurry, low res, unrealistic, unaesthetic" + + +# use_invertion_latents: True +control_config: + control_type: "openpose" + pretrained_controlnet_path: ./ckpt/sd-controlnet-openpose + controlnet_conditioning_scale: 1.0 + hand: False + face: False + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: false + use_freeu: false + editing_prompts: [ + ['A Batman is on snow covered court before an iced wall','A Batman','snow covered court','an iced wall'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/class_level/wolf/wolf.yaml b/config/class_level/wolf/wolf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c07af5bde801561edbf55ec8c0c4c35e210c71cc --- /dev/null +++ b/config/class_level/wolf/wolf.yaml @@ -0,0 +1,47 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/class_level/wolf2pig_preserve_bg + +dataset_config: + path: "data/wolf/wolf" + prompt: A wolf + start_sample_frame: 0 + n_sample_frame: 16 + sampling_rate: 2 + layout_mask_dir: "data/wolf/layout_masks/" + layout_mask_order: ['wolf','bg'] + negative_promot: "a wolf, deformed" + +control_config: + control_type: "depth_zoe" + pretrained_controlnet_path: "./ckpt/sd-controlnet-depth" + controlnet_conditioning_scale: 0.7 + + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1,2,4] + guidance_scale: 7.5 + use_pnp: false + use_freeu: false + editing_prompts: [ + ['A cute pig in the autum forest','A cute pig','autum forest'], + ['A husky in the green forest','A husky','green forest'], + ['A tiger in the autum forest','A tiger','autum forest'], + ['A bear in the autum forest','A bear','autum forest'], + + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/instance_level/2_cars/2cars_left_firetruck_right_school_bus.yaml b/config/instance_level/2_cars/2cars_left_firetruck_right_school_bus.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7adbf5807faf605d45661b8d286a19fbea4144c7 --- /dev/null +++ b/config/instance_level/2_cars/2cars_left_firetruck_right_school_bus.yaml @@ -0,0 +1,44 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/instance_level/2_cars/left_firetruck_right_school-bus + +dataset_config: + path: "data/2_cars/2_cars" #"data/general_object/car_turn/car_turn_start_20" + prompt: two cars are on the highway + start_sample_frame: 0 + n_sample_frame: 14 + sampling_rate: 1 + layout_mask_dir: "data/2_cars/layout_masks" + layout_mask_order: ['left_car','right_car','road'] + negative_promot: 'jittery' + +control_config: + control_type: 
"depth_zoe" + pretrained_controlnet_path: ./ckpt/control_v11f1p_sd15_depth + controlnet_conditioning_scale: 0.7 + + +editing_config: + use_invertion_latents: true + inject_step: 30 + old_qk: 1 + flatten_res: [1,2,4] + guidance_scale: 7.5 + use_pnp: true + use_freeu: true + editing_prompts: [ + ['A firetruck and a school bus are on the road', 'A firetruck','a school bus','road'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/instance_level/2_cars/2cars_left_firetruck_right_school_bus_preserve_bg.yaml b/config/instance_level/2_cars/2cars_left_firetruck_right_school_bus_preserve_bg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..878f7035cda2d5e5d53f514703e052c5a0c221d8 --- /dev/null +++ b/config/instance_level/2_cars/2cars_left_firetruck_right_school_bus_preserve_bg.yaml @@ -0,0 +1,44 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/instance_level/2_cars/left_firetruck_right_school-bus_preserve_bg + +dataset_config: + path: "data/2_cars/2_cars" #"data/general_object/car_turn/car_turn_start_20" + prompt: two cars are on the highway + start_sample_frame: 0 + n_sample_frame: 14 + sampling_rate: 1 + layout_mask_dir: "data/2_cars/layout_masks" + layout_mask_order: ['left_box','right_box'] + negative_promot: 'jittery' + +control_config: + control_type: "depth_zoe" + pretrained_controlnet_path: ./ckpt/control_v11f1p_sd15_depth + controlnet_conditioning_scale: 0.7 + + +editing_config: + use_invertion_latents: true + inject_step: 30 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['A firetruck and a school bus are on the road', 'A firetruck','a school bus'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/instance_level/2_cats/2cats_3cls_samoyed_vs_tiger_sunrise.yaml b/config/instance_level/2_cats/2cats_3cls_samoyed_vs_tiger_sunrise.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f1927ea33539df9342631750ff8f2ae5bf267d8e --- /dev/null +++ b/config/instance_level/2_cats/2cats_3cls_samoyed_vs_tiger_sunrise.yaml @@ -0,0 +1,43 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/instance_level/2_cats/3cls_samoyed_vs_tiger_under_sunrise + +dataset_config: + path: "data/2_cats/2_cats" + prompt: "Cats are playing with toys" + start_sample_frame: 0 + n_sample_frame: 24 + sampling_rate: 1 + layout_mask_dir: "data/2_cats/layout_masks" + layout_mask_order: ['left','right','bg'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "depth" + pretrained_controlnet_path: "./ckpt/sd-controlnet-depth" + controlnet_conditioning_scale: 0.7 + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: false + 
use_freeu: false + editing_prompts: [ + ['A small Samoyed and a tiger are under sunrise','A small Samoyed','a tiger','under sunrise'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/instance_level/2_cats/2cats_4cls_panda_vs_poddle_bg_meadow_night.yaml b/config/instance_level/2_cats/2cats_4cls_panda_vs_poddle_bg_meadow_night.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9101d01a93ae808ef2b24c1eaca60d5fc441ab12 --- /dev/null +++ b/config/instance_level/2_cats/2cats_4cls_panda_vs_poddle_bg_meadow_night.yaml @@ -0,0 +1,43 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/instance_level/2_cats/4cls_panda_vs_toy_poddle_on_meadow_in_starry_night + +dataset_config: + path: "data/2_cats/2_cats" + prompt: "" + start_sample_frame: 0 + n_sample_frame: 24 + sampling_rate: 1 + layout_mask_dir: "data/2_cats/layout_masks" + layout_mask_order: ['left','right','bg','ground'] + negative_promot: "a cat with big ear are playing with another cat, deformed" + +control_config: + control_type: "depth" + pretrained_controlnet_path: "./ckpt/sd-controlnet-depth" + controlnet_conditioning_scale: 0.53 + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1,2,4] + guidance_scale: 7.5 + use_pnp: false + use_freeu: false + editing_prompts: [ + ['A Giant Panda and a curly apricot fur toy poodle are in starry night on grassy meadow','A Giant Panda','a curly apricot fur toy poodle','starry night','grassy meadow'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/instance_level/2_monkeys/monkeys_2cls_teddy_bear_golden_retriever.yaml b/config/instance_level/2_monkeys/monkeys_2cls_teddy_bear_golden_retriever.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03d7a0612fae460f96ee2c3e2e356d7a2b20fb2e --- /dev/null +++ b/config/instance_level/2_monkeys/monkeys_2cls_teddy_bear_golden_retriever.yaml @@ -0,0 +1,44 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/instance_level/2_monkeys/left_teddy_right_golden_retriever + +dataset_config: + path: "data/2_monkeys/2_monkeys" + prompt: two monkeys are eating food + start_sample_frame: 14 + n_sample_frame: 16 + sampling_rate: 1 + layout_mask_dir: "data/2_monkeys/layout_masks" + layout_mask_order: ['left','right','bg'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + + +control_config: + control_type: "depth_zoe" + pretrained_controlnet_path: ./ckpt/control_v11f1p_sd15_depth + controlnet_conditioning_scale: 0.8 + + +editing_config: + use_invertion_latents: true + inject_step: 10 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['A teddy bear and a Golden Retriever are on the grassy area with dry leaves','A teddy bear','a Golden 
Retriever','grassy area with dry leaves'], ##note: test on frames 14-30 + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/instance_level/2_monkeys/monkeys_2cls_teddy_bear_koala.yaml b/config/instance_level/2_monkeys/monkeys_2cls_teddy_bear_koala.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b3f1231d3a7e09b8e07c14aed9eadab33d86225a --- /dev/null +++ b/config/instance_level/2_monkeys/monkeys_2cls_teddy_bear_koala.yaml @@ -0,0 +1,44 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/instance_level/2_monkeys/left_teddy_right_koala + +dataset_config: + path: "data/2_monkeys/2_monkeys" #"data/general_object/car_turn/car_turn_start_20" + prompt: two monkeys are eating food + start_sample_frame: 11 + n_sample_frame: 16 + sampling_rate: 1 + layout_mask_dir: "data/2_monkeys/layout_masks" + layout_mask_order: ['left','right','bg'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + + +control_config: + control_type: "depth_zoe" + pretrained_controlnet_path: ./ckpt/control_v11f1p_sd15_depth + controlnet_conditioning_scale: 0.9 + + +editing_config: + use_invertion_latents: true + inject_step: 5 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['A teddy bear and a koala are on the grassy area with dry leaves','A teddy bear','a koala','grassy area with dry leaves'], ##note: test on frames 11-27 + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/instance_level/badminton/badminton_2cls_wonder_woman_spiderman.yaml b/config/instance_level/badminton/badminton_2cls_wonder_woman_spiderman.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd251d4d0c3860aa320b31cb2f61569d35ae6029 --- /dev/null +++ b/config/instance_level/badminton/badminton_2cls_wonder_woman_spiderman.yaml @@ -0,0 +1,48 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/instance_level/badminton/left_wonder_woman_right_spiderman + +dataset_config: + path: "data/badminton/badminton" + prompt: a man and a woman are playing badminton + start_sample_frame: 7 + n_sample_frame: 24 + sampling_rate: 1 + layout_mask_dir: "./data/badminton/layout_masks" + layout_mask_order: ['man','woman','bg'] + negative_promot: "jittery, ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: True + face: True + + +editing_config: + use_inversion_attention: true + use_invertion_latents: true + inject_step: 12 + old_qk: 1 + flatten_res: [1,2] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['A Spider Man and a Wonder Woman are before charcoal grey wall','A Spider Man','Wonder Woman','charcoal 
grey wall'], + + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/run_two_man.yaml b/config/instance_level/running_two_man/running_3cls_iron_spider.yaml similarity index 78% rename from config/run_two_man.yaml rename to config/instance_level/running_two_man/running_3cls_iron_spider.yaml index e0a8296ea0277e9d372372a2bfeefb9bd8975a58..9417c351dfbeb7b3e02a2056d3c13f89653ddbdd 100644 --- a/config/run_two_man.yaml +++ b/config/instance_level/running_two_man/running_3cls_iron_spider.yaml @@ -1,14 +1,14 @@ pretrained_model_path: "./ckpt/stable-diffusion-v1-5" -logdir: ./result/run_two_man/ +logdir: ./result/instance_level/run_two_man/3cls_left_iron_right_spider_cherry dataset_config: - path: "data/run_two_man/run_two_man" - prompt: Man in red hoddie and man in gray shirt are jogging in forest + path: "data/run_two_man/run_two_man_fr2" + prompt: '' n_sample_frame: 16 start_sample_frame: 0 - sampling_rate: 1 - layout_mask_dir: "./data/run_two_man/layout_masks" - layout_mask_order: ['left_man','right_man','trees'] + sampling_rate: 2 + layout_mask_dir: "./data/run_two_man/layout_masks_fr2" + layout_mask_order: ['left_man','right_man','full_trees'] negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" control_config: @@ -34,7 +34,6 @@ editing_config: sample_seeds: [0] num_inference_steps: 50 blending_percentage: 0 - test_pipeline_config: target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline diff --git a/config/instance_level/running_two_man/running_3cls_vis_cross_attn.yaml b/config/instance_level/running_two_man/running_3cls_vis_cross_attn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7f141646373f78acad51dfa425bce6c387cbd18 --- /dev/null +++ b/config/instance_level/running_two_man/running_3cls_vis_cross_attn.yaml @@ -0,0 +1,48 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/run_two_man/instance_level/3cls_vis_cross_attn_flag_test + +dataset_config: + path: "data/run_two_man/run_two_man_fr2" + prompt: 'Man in red hoddie and man in gray shirt are jogging in forest' + n_sample_frame: 16 + start_sample_frame: 0 + sampling_rate: 2 + layout_mask_dir: "./data/run_two_man/layout_masks_fr2" + layout_mask_order: ['left_man_plus','right_man_plus','trees','trunk'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: True + face: False + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: false + use_freeu: false + editing_prompts: [ + ['Spiderman and Polar Bear are jogging under cherry trees','man','Polar Bear','cherry trees',''], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + vis_cross_attn: True + #cluster_inversion_feature: True + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git 
a/config/instance_level/running_two_man/running_4cls_spider_polar.yaml b/config/instance_level/running_two_man/running_4cls_spider_polar.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3a7403f434a7a74e623c018adf0ef8f0ff7e78f --- /dev/null +++ b/config/instance_level/running_two_man/running_4cls_spider_polar.yaml @@ -0,0 +1,45 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/run_two_man/instance_level/left2spider_right2polar_pnp5 + +dataset_config: + path: "data/run_two_man/run_two_man_fr2" + prompt: '' + n_sample_frame: 16 + start_sample_frame: 0 + sampling_rate: 2 + layout_mask_dir: "./data/run_two_man/layout_masks_fr2" + layout_mask_order: ['left_man','right_man','ground','full_trees'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: True + face: False + +editing_config: + use_invertion_latents: true + inject_step: 5 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['Spiderman and Polar Bear on grass meadow before cherry trees','Spiderman','Polar Bear','grass meadow','cherry trees'], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/instance_level/soap-box/soap-box.yaml b/config/instance_level/soap-box/soap-box.yaml new file mode 100644 index 0000000000000000000000000000000000000000..271c01bfbb6f783a125a66cfa5250bd2316c4a7b --- /dev/null +++ b/config/instance_level/soap-box/soap-box.yaml @@ -0,0 +1,45 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/instance_level/soap-box/5cls_ironman_push_stormtrooper + +dataset_config: + path: "data/soap-box/soap-box" + prompt: "" + start_sample_frame: 12 + n_sample_frame: 16 + sampling_rate: 2 + layout_mask_dir: "./data/soap-box/layout_masks" + layout_mask_order: ['man2','man1','bridge','ground-full','bg'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "depth_zoe" + pretrained_controlnet_path: "./ckpt/control_v11f1p_sd15_depth" + controlnet_conditioning_scale: 1.0 + hand: False + face: False + +editing_config: + use_invertion_latents: true + inject_step: 5 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['An Iron Man and a Stormtrooper on moss covered stone bridge over lake in the forest','An Iron Man','a Stormtrooper','moss covered stone bridge','lake','forest'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/instance_level/soely_edit/joint_edit.yaml b/config/instance_level/soely_edit/joint_edit.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d8d648fc796ff5a07d5013f33ece1d09133d6ed6 --- /dev/null +++ b/config/instance_level/soely_edit/joint_edit.yaml @@ -0,0 +1,46 @@ 
+pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/soely_edit/run_two_man/joint_edit + + +dataset_config: + path: "data/run_two_man/run_two_man_fr2" + prompt: Man in red hoddie and man in gray shirt are jogging in forest + n_sample_frame: 16 + start_sample_frame: 0 + sampling_rate: 2 + layout_mask_dir: "./data/run_two_man/layout_masks_fr2" + layout_mask_order: ['left_v2','right_v2','left_right_trunk'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: True + face: False + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: false + use_freeu: false + editing_prompts: [ + ['Iron Man and Spiderman are jogging under green trees','Iron Man','Spiderman'], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/instance_level/soely_edit/only_left.yaml b/config/instance_level/soely_edit/only_left.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8584162eae0f9d0af295793a57ae20f157b935f3 --- /dev/null +++ b/config/instance_level/soely_edit/only_left.yaml @@ -0,0 +1,46 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/soely_edit/run_two_man/only_left_to_ironman + +dataset_config: + path: "data/run_two_man/run_two_man_fr2" + prompt: Man in red hoddie and man in gray shirt are jogging in forest + n_sample_frame: 16 + start_sample_frame: 0 + sampling_rate: 2 + layout_mask_dir: "./data/run_two_man/layout_masks_fr2" + layout_mask_order: ['left_v2','left_trunk'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: True + face: False + +editing_config: + use_invertion_latents: true + inject_step: 5 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['Iron Man and gray man are jogging under green trees','Iron Man',''], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/instance_level/soely_edit/only_right.yaml b/config/instance_level/soely_edit/only_right.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2e40165756df4dba79760655802cedddf8f0c3a5 --- /dev/null +++ b/config/instance_level/soely_edit/only_right.yaml @@ -0,0 +1,46 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/soely_edit/run_two_man/only_right_to_spiderman + +dataset_config: + path: "data/run_two_man/run_two_man_fr2" + prompt: Man in red hoddie and man in gray shirt are jogging in forest + n_sample_frame: 16 + start_sample_frame: 0 + sampling_rate: 2 + layout_mask_dir: "./data/run_two_man/layout_masks_fr2" + 
layout_mask_order: ['right_v2','trunk'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: True + face: False + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['red man and Spiderman are jogging under green trees','Spiderman',''], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/part_level/adding_new_object/boxer-punching/thor_in_sunglasses.yaml b/config/part_level/adding_new_object/boxer-punching/thor_in_sunglasses.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a54ca1dab8237f272c5a688b3e8e259e49dfd97a --- /dev/null +++ b/config/part_level/adding_new_object/boxer-punching/thor_in_sunglasses.yaml @@ -0,0 +1,43 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/part_level/adding_new_object/boxer-punching/4cls_thor_add_sunglasses_red_gloves_night + +dataset_config: + path: "data/boxer-punching/boxer-punching" + prompt: A boxer wearing black boxing gloves punches towards the camera + start_sample_frame: 4 + n_sample_frame: 28 + sampling_rate: 1 + layout_mask_dir: "data/boxer-punching/layout_masks" + layout_mask_order: ['man_wo_eyes','eyes','gloves','bg'] + negative_promot: "jittery, ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "depth" + pretrained_controlnet_path: "./ckpt/control_v11f1p_sd15_depth" + controlnet_conditioning_scale: 1.0 + + +editing_config: + use_invertion_latents: true + inject_step: 10 + old_qk: 1 + flatten_res: [1,2,4] + guidance_scale: 7.5 + use_pnp: true + use_freeu: true + + editing_prompts: [ + ['Thor in sunglasses, punching red boxing gloves in starry night sky','Thor','sunglasses','red boxing gloves','starry night sky'], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + +seed: 42 + diff --git a/config/part_level/adding_new_object/man_text_message/superman+cap.yaml b/config/part_level/adding_new_object/man_text_message/superman+cap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..79e676f690822b445001e3a266743a8c78575612 --- /dev/null +++ b/config/part_level/adding_new_object/man_text_message/superman+cap.yaml @@ -0,0 +1,45 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/part_level/adding_new_object/man_text_message/superman+cap_blend_with_suit_bg + +dataset_config: + path: "data/man_text_message/man_text_message" + prompt: A man is texting message on the street. 
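Values such as `clip_length: "${..dataset_config.n_sample_frame}"` in these configs are OmegaConf relative interpolations: the leading `..` steps from `editing_config` up to the config root and then into `dataset_config`, so the clip length always tracks the number of sampled frames. A minimal sketch of how that resolves (again assuming OmegaConf is the loader):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("config/instance_level/soely_edit/only_right.yaml")
print(cfg.dataset_config.n_sample_frame)   # 16
print(cfg.editing_config.clip_length)      # 16 -- resolved lazily against dataset_config

cfg.dataset_config.n_sample_frame = 24     # interpolations track later changes too
print(cfg.editing_config.clip_length)      # 24
```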
+ start_sample_frame: 0 + n_sample_frame: 16 + sampling_rate: 1 + layout_mask_dir: "data/man_text_message/layout_masks" + layout_mask_order: ['man_wo_hat','hat','bg'] + negative_promot: "jittery, ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: true + face: false + +editing_config: + use_invertion_latents: true + inject_step: 10 + old_qk: 1 + flatten_res: [1,2,4] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['Superman in a flat cap before store','Superman','flat cap'], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/part_level/adding_new_object/man_text_message/superman.yaml b/config/part_level/adding_new_object/man_text_message/superman.yaml new file mode 100644 index 0000000000000000000000000000000000000000..daa2878b94ae792c5a10cf6022098452018a3d1b --- /dev/null +++ b/config/part_level/adding_new_object/man_text_message/superman.yaml @@ -0,0 +1,45 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/part_level/adding_new_object/man_text_message/superman_blend_with_suit_bg + +dataset_config: + path: "data/man_text_message/man_text_message" + prompt: A man is texting message on the street. + start_sample_frame: 0 + n_sample_frame: 16 + sampling_rate: 1 + layout_mask_dir: "data/man_text_message/layout_masks" + layout_mask_order: ['man','suit_bg'] + negative_promot: "jittery, ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: true + face: false + +editing_config: + use_invertion_latents: true + inject_step: 10 + old_qk: 1 + flatten_res: [1,2,4] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['Superman before store','Superman'], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/part_level/adding_new_object/run_two_man/running_spider_polar_sunglass.yaml b/config/part_level/adding_new_object/run_two_man/running_spider_polar_sunglass.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f6daff1519c2be55437e68568895162dfb5fb2d4 --- /dev/null +++ b/config/part_level/adding_new_object/run_two_man/running_spider_polar_sunglass.yaml @@ -0,0 +1,47 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/part_level/run_two_man/left2spider_right2polar-sunglasses + +dataset_config: + path: "data/run_two_man/run_two_man_fr2" + prompt: '' + n_sample_frame: 16 + start_sample_frame: 0 + sampling_rate: 2 + layout_mask_dir: "./data/run_two_man/layout_masks_fr2" + layout_mask_order: ['left_man','right_man_wo_eyes','eyes_right_man','ground','full_trees'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: 
"dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: True + face: False + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: false + use_freeu: false + editing_prompts: [ + ['Spiderman and Polar Bear in sunglasses on grass meadow before cherry trees','Spiderman','Polar Bear','sunglasses','grass meadow','cherry trees'], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + cluster_inversion_feature: True + # vis_cross_attn: false + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/part_level/adding_new_object/spin-ball/superman+sunglasses.yaml b/config/part_level/adding_new_object/spin-ball/superman+sunglasses.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e85b9a6cc2a0e12737db3a06e0bab8ee9529f9b --- /dev/null +++ b/config/part_level/adding_new_object/spin-ball/superman+sunglasses.yaml @@ -0,0 +1,45 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" + +logdir: "./result/part_level/adding_new_object/spin_ball/superman+sunglasses" + +dataset_config: + path: "data/spin-ball/spin-ball" + prompt: "a man is spining a basketball" + n_sample_frame: 12 + sampling_rate: 1 + layout_mask_dir: "data/spin-ball/layout_masks" + layout_mask_order: ['man_wo_eyes','eyes','ball','trees'] + negative_promot: "jittery, ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "depth_zoe" + pretrained_controlnet_path: "./ckpt/control_v11f1p_sd15_depth" + controlnet_conditioning_scale: 1.0 + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: false + use_freeu: false + + editing_prompts: [ + ['Superman in sunglasses spins moon under cherry blossoms','Superman','sunglasses','moon','cherry blossoms'], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/part_level/adding_new_object/spin-ball/superman_spin_moon.yaml b/config/part_level/adding_new_object/spin-ball/superman_spin_moon.yaml new file mode 100644 index 0000000000000000000000000000000000000000..65fbaa8a149b699af9b0cac6ee5cf505a781de17 --- /dev/null +++ b/config/part_level/adding_new_object/spin-ball/superman_spin_moon.yaml @@ -0,0 +1,44 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/part_level/adding_new_object/spin_ball/superman_spin_moon_under_cherry + +dataset_config: + path: "data/spin-ball/spin-ball" + prompt: "a man is spining a basketball" + n_sample_frame: 12 + sampling_rate: 1 + layout_mask_dir: "data/spin-ball/layout_masks" + layout_mask_order: ['man','ball','trees'] + negative_promot: "jittery, ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "depth_zoe" + pretrained_controlnet_path: "./ckpt/control_v11f1p_sd15_depth" + controlnet_conditioning_scale: 1.0 + +editing_config: + use_invertion_latents: true + inject_step: 0 + old_qk: 1 + 
flatten_res: [1] + guidance_scale: 7.5 + use_pnp: false + use_freeu: false + + editing_prompts: [ + ['Superman spins moon under cherry blossoms','Superman','moon','cherry blossoms'], + ] + + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/part_level/part_level_modification/cat_flower/ginger_body.yaml b/config/part_level/part_level_modification/cat_flower/ginger_body.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b7e627309987bb0a02a557e1e7f22a7224228b3 --- /dev/null +++ b/config/part_level/part_level_modification/cat_flower/ginger_body.yaml @@ -0,0 +1,44 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/part_level/part_level_modification/cat_flower/ginger_body + +dataset_config: + path: "data/cat_flower/cat_flower" + prompt: A cat is roaring + n_sample_frame: 8 + sampling_rate: 1 + layout_mask_dir: "data/cat_flower/layout_masks" + layout_mask_order: ["body_wo_belt",'body_bg'] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + + +control_config: + control_type: "depth_zoe" + pretrained_controlnet_path: "./ckpt/sd-controlnet-depth" + controlnet_conditioning_scale: 1.0 + + +editing_config: + use_invertion_latents: true + inject_step: 40 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['A ginger cat body is roaring','ginger cat body'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/part_level/part_level_modification/cat_flower/ginger_head.yaml b/config/part_level/part_level_modification/cat_flower/ginger_head.yaml new file mode 100644 index 0000000000000000000000000000000000000000..94850a5f241009a5a1fdf7d8536d5de856bfc436 --- /dev/null +++ b/config/part_level/part_level_modification/cat_flower/ginger_head.yaml @@ -0,0 +1,44 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/part_level/part_level_modification/cat_flower/ginger_head + +dataset_config: + path: "data/cat_flower/cat_flower" + prompt: A cat is roaring + n_sample_frame: 8 + sampling_rate: 1 + layout_mask_dir: "data/cat_flower/layout_masks" + layout_mask_order: ["head"] + negative_promot: "ugly, blurry, low res, unrealistic, unaesthetic" + + +control_config: + control_type: "depth_zoe" + pretrained_controlnet_path: "./ckpt/sd-controlnet-depth" + controlnet_conditioning_scale: 1.0 + + +editing_config: + use_invertion_latents: true + inject_step: 40 + old_qk: 1 + flatten_res: [1] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['A ginger and white cat head is roaring','ginger and white cat head'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + modulated_percentage: 0.3 + + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline 
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 \ No newline at end of file diff --git a/config/part_level/part_level_modification/man_text_message/black_suit.yaml b/config/part_level/part_level_modification/man_text_message/black_suit.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c911eddb79c890bb0e67c6c40c4105c07ade0232 --- /dev/null +++ b/config/part_level/part_level_modification/man_text_message/black_suit.yaml @@ -0,0 +1,44 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/part_level/part_level_modification/man_text_message/black_suit + +dataset_config: + path: "data/man_text_message/man_text_message" + prompt: A man in casual shirt is texting message + start_sample_frame: 0 + n_sample_frame: 16 + sampling_rate: 1 + layout_mask_dir: "data/man_text_message/layout_masks" + layout_mask_order: ['suit','suit_bg'] + negative_promot: "jittery, ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: true + face: false + +editing_config: + use_invertion_latents: true + inject_step: 10 + old_qk: 1 + flatten_res: [1,2,4] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['A man in black suit','black suit'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/config/part_level/part_level_modification/man_text_message/blue_shirt.yaml b/config/part_level/part_level_modification/man_text_message/blue_shirt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c4363467afc45dfd117af194aee8f3c61925161 --- /dev/null +++ b/config/part_level/part_level_modification/man_text_message/blue_shirt.yaml @@ -0,0 +1,44 @@ +pretrained_model_path: "./ckpt/stable-diffusion-v1-5" +logdir: ./result/part_level/part_level_modification/man_text_message/blue_shirt + +dataset_config: + path: "data/man_text_message/man_text_message" + prompt: A man in casual shirt is texting message + start_sample_frame: 0 + n_sample_frame: 16 + sampling_rate: 1 + layout_mask_dir: "data/man_text_message/layout_masks" + layout_mask_order: ['shirt'] + negative_promot: "jittery, ugly, blurry, low res, unrealistic, unaesthetic" + +control_config: + control_type: "dwpose" + pretrained_controlnet_path: "./ckpt/control_v11p_sd15_openpose" + controlnet_conditioning_scale: 1.0 + hand: true + face: false + +editing_config: + use_invertion_latents: true + inject_step: 10 + old_qk: 1 + flatten_res: [1,2,4] + guidance_scale: 7.5 + use_pnp: true + use_freeu: false + editing_prompts: [ + ['A man in blue shirt','blue shirt'], + ] + clip_length: "${..dataset_config.n_sample_frame}" + sample_seeds: [0] + num_inference_steps: 50 + blending_percentage: 0 + +test_pipeline_config: + target: video_diffusion.pipelines.ddim_spatial_temporal.DDIMSpatioTemporalStableDiffusionPipeline + num_inference_steps: "${..validation_sample_logger.num_inference_steps}" + + + +seed: 42 + diff --git a/kill.sh b/kill.sh new file mode 100644 index 0000000000000000000000000000000000000000..bd6d74f3e60a049dfa3cda925cd1cdc27ba47c87 --- /dev/null +++ b/kill.sh @@ -0,0 +1,3 @@ +ps -ef | grep 
test.py | grep -v grep | awk '{print $2}' | xargs kill -9 +ps -ef | grep test.sh | grep -v grep | awk '{print $2}' | xargs kill -9 +ps -ef | grep test_on_1.sh | grep -v grep | awk '{print $2}' | xargs kill -9 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 112a9856efe8ac301aca645db3df7f7a21e4e779..bec1b6d8a394ecbfaf8fdaf017a3e6b73b73100f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -53,11 +53,15 @@ stack-data==0.6.3 tensorboard==2.17.0 tensorboard-data-server==0.7.2 tokenizers==0.14.1 -torch==2.3.1+cu121 tqdm==4.66.4 traitlets==5.14.3 transformers==4.35.0 triton==2.3.1 wcwidth==0.2.13 werkzeug==3.0.3 -zipp==3.19.2 \ No newline at end of file +zipp==3.19.2 +scipy==1.14.0 +scikit-learn==1.2.2 +nltk==3.8.1 +timm==0.6.7 +scikit-image==0.24.0 \ No newline at end of file diff --git a/test.py b/test.py index 60b78c906de80c2795dcff5a7fa084011a454298..1c20870ba5357aa41e5c621364ff1aa876a33091 100644 --- a/test.py +++ b/test.py @@ -67,7 +67,7 @@ def test( mixed_precision: Optional[str] = "fp16", batch_size: int = 1, model_config: dict={}, - verbose: bool=True, + cluster_inversion_feature: bool=False, **kwargs ): @@ -322,6 +322,7 @@ def test( do_classifier_free_guidance=True, control=batch['control'], controlnet_conditioning_scale=control_config['controlnet_conditioning_scale'], use_pnp=editing_config['use_pnp'], + cluster_inversion_feature=editing_config.get('cluster_inversion_feature', False), trajs=trajectories, old_qk=editing_config["old_qk"], flatten_res=editing_config['flatten_res'] @@ -373,6 +374,8 @@ def test( inject_step=editing_config["inject_step"], old_qk=editing_config["old_qk"], use_pnp = editing_config['use_pnp'], + cluster_inversion_feature = editing_config.get('cluster_inversion_feature', False), + vis_cross_attn = editing_config.get('vis_cross_attn', False), attn_inversion_dict = attn_inversion_dict, ) diff --git a/test.sh b/test.sh index 1c40f2ca618bb70cbaafc8e30a7da7421411ea09..b93a61cfdd62cee2f71a5e3bbee9313968d99456 100644 --- a/test.sh +++ b/test.sh @@ -1,2 +1,47 @@ export CUDA_VISIBLE_DEVICES=0 -accelerate launch test.py --config config/run_two_man.yaml \ No newline at end of file +## instance level + +# accelerate launch test.py --config config/class_level/man2spider.yaml +# accelerate launch test.py --config config/instance_level/running_3cls_iron_spider.yaml +# accelerate launch test.py --config config/part_level/run_spider_polar_sunglass.yaml + +# accelerate launch test.py --config config/instance_level/2_monkeys/monkeys_2cls_teddy_bear_golden_retriever.yaml +# accelerate launch test.py --config config/instance_level/2_monkeys/monkeys_2cls_teddy_bear_koala.yaml + +# accelerate launch test.py --config config/instance_level/badminton/badminton_2cls_wonder_woman_spiderman.yaml + +# accelerate launch test.py --config config/instance_level/soap-box/soap-box.yaml + +# accelerate launch test.py --config config/instance_level/2_cats/2cats_3cls_samoyed_vs_tiger_sunrise.yaml +# accelerate launch test.py --config config/instance_level/2_cats/2cats_4cls_panda_vs_poddle_bg_meadow_night.yaml + +# accelerate launch test.py --config config/instance_level/2_cars/2cars_left_firetruck_right_school_bus_preserve_bg.yaml +# accelerate launch test.py --config config/instance_level/2_cars/2cars_left_firetruck_right_school_bus.yaml + + +## part level +# accelerate launch test.py --config config/part_level/adding_new_object/boxer-punching/thor_in_sunglasses.yaml + +accelerate launch test.py --config 
config/part_level/adding_new_object/man_text_message/superman.yaml +accelerate launch test.py --config config/part_level/adding_new_object/man_text_message/superman+cap.yaml + +# accelerate launch test.py --config config/part_level/adding_new_object/spin-ball/superman_spin_moon.yaml +# accelerate launch test.py --config config/part_level/adding_new_object/spin-ball/superman+sunglasses.yaml + +# accelerate launch test.py --config config/part_level/part_level_modification/cat_flower/ginger_body.yaml +# accelerate launch test.py --config config/part_level/part_level_modification/cat_flower/ginger_head.yaml + +# accelerate launch test.py --config config/part_level/part_level_modification/man_text_message/black_suit.yaml +# accelerate launch test.py --config config/part_level/part_level_modification/man_text_message/blue_shirt.yaml + +# accelerate launch test.py --config config/instance_level/soely_edit/only_left.yaml +# accelerate launch test.py --config config/instance_level/soely_edit/only_right.yaml +# accelerate launch test.py --config config/instance_level/soely_edit/joint_edit.yaml + +# ## class level +# accelerate launch test.py --config config/class_level/car/posche.yaml + +# accelerate launch test.py --config config/class_level/tennis/1cls_man2iron_man.yaml +# accelerate launch test.py --config config/class_level/tennis/3cls_batman_snow-court_iced-wall.yaml + +# accelerate launch test.py --config config/class_level/wolf/wolf_to_pig.yaml \ No newline at end of file diff --git a/video_diffusion/common/__pycache__/image_util.cpython-310.pyc b/video_diffusion/common/__pycache__/image_util.cpython-310.pyc index c47c260963bce0688eeefa8375df49077c49c960..83e32a94bf26ad62221ba1effa221f5d5f025fcd 100644 Binary files a/video_diffusion/common/__pycache__/image_util.cpython-310.pyc and b/video_diffusion/common/__pycache__/image_util.cpython-310.pyc differ diff --git a/video_diffusion/common/__pycache__/instantiate_from_config.cpython-310.pyc b/video_diffusion/common/__pycache__/instantiate_from_config.cpython-310.pyc index 986925c16a3dc71bd10f98c768d9baf76823a879..f46fb07eb465f115b7ff84447a853ee4154d428b 100644 Binary files a/video_diffusion/common/__pycache__/instantiate_from_config.cpython-310.pyc and b/video_diffusion/common/__pycache__/instantiate_from_config.cpython-310.pyc differ diff --git a/video_diffusion/common/__pycache__/logger.cpython-310.pyc b/video_diffusion/common/__pycache__/logger.cpython-310.pyc index 6a06dca25815e870302256340966d7725345ee9c..7c58646b6d0334a544e2c0ae3e8008011c4510a8 100644 Binary files a/video_diffusion/common/__pycache__/logger.cpython-310.pyc and b/video_diffusion/common/__pycache__/logger.cpython-310.pyc differ diff --git a/video_diffusion/common/__pycache__/util.cpython-310.pyc b/video_diffusion/common/__pycache__/util.cpython-310.pyc index b80d994d810640c72c7319076cbf529851c11d13..b98f6c185c9e117de682d0456a9f755a39bd1306 100644 Binary files a/video_diffusion/common/__pycache__/util.cpython-310.pyc and b/video_diffusion/common/__pycache__/util.cpython-310.pyc differ diff --git a/video_diffusion/data/__pycache__/dataset.cpython-310.pyc b/video_diffusion/data/__pycache__/dataset.cpython-310.pyc index 10d612bfc6e5092ac5468840724a47cc87647f1b..ac894dca229324968168e65d4b334fd35fdd3a8f 100644 Binary files a/video_diffusion/data/__pycache__/dataset.cpython-310.pyc and b/video_diffusion/data/__pycache__/dataset.cpython-310.pyc differ diff --git a/video_diffusion/data/__pycache__/transform.cpython-310.pyc b/video_diffusion/data/__pycache__/transform.cpython-310.pyc 
index 8efaa2b720abe6b780bbfd6f44fd9c04e25c3d39..8714f8bbb926cce525c08a8b6316d83b3eb3595f 100644 Binary files a/video_diffusion/data/__pycache__/transform.cpython-310.pyc and b/video_diffusion/data/__pycache__/transform.cpython-310.pyc differ diff --git a/video_diffusion/models/__pycache__/attention.cpython-310.pyc b/video_diffusion/models/__pycache__/attention.cpython-310.pyc index ca1f0f94143637be1804c7048b0f64b9368f7a04..54ac68b4bf5e40967b829d931942b12aa7430d95 100644 Binary files a/video_diffusion/models/__pycache__/attention.cpython-310.pyc and b/video_diffusion/models/__pycache__/attention.cpython-310.pyc differ diff --git a/video_diffusion/models/__pycache__/controlnet3d.cpython-310.pyc b/video_diffusion/models/__pycache__/controlnet3d.cpython-310.pyc index 07f28411307bb51bbe973164085b20f7836e6ba8..b20769e3f7846bb716fcb8430e6af7b2974ae9c0 100644 Binary files a/video_diffusion/models/__pycache__/controlnet3d.cpython-310.pyc and b/video_diffusion/models/__pycache__/controlnet3d.cpython-310.pyc differ diff --git a/video_diffusion/models/__pycache__/controlnet_attention.cpython-310.pyc b/video_diffusion/models/__pycache__/controlnet_attention.cpython-310.pyc index 689d5dbb1b0ea1e34b4ae1d30a34a3c5c162520c..1c7d836169a61409286091586a69695522f3d463 100644 Binary files a/video_diffusion/models/__pycache__/controlnet_attention.cpython-310.pyc and b/video_diffusion/models/__pycache__/controlnet_attention.cpython-310.pyc differ diff --git a/video_diffusion/models/__pycache__/controlnet_unet_blocks.cpython-310.pyc b/video_diffusion/models/__pycache__/controlnet_unet_blocks.cpython-310.pyc index f770b8155ba86eb5cf6b550637c412d40bb77670..ac5a0f1a76fb93a15d47a29acf10e9b86ebe6d34 100644 Binary files a/video_diffusion/models/__pycache__/controlnet_unet_blocks.cpython-310.pyc and b/video_diffusion/models/__pycache__/controlnet_unet_blocks.cpython-310.pyc differ diff --git a/video_diffusion/models/__pycache__/lora.cpython-310.pyc b/video_diffusion/models/__pycache__/lora.cpython-310.pyc index 952d16c0aaea60f081669401ca1145f8fbc9fbbc..ec7ad3bde16b26aa9ea11f519c9c170d5d2de39f 100644 Binary files a/video_diffusion/models/__pycache__/lora.cpython-310.pyc and b/video_diffusion/models/__pycache__/lora.cpython-310.pyc differ diff --git a/video_diffusion/models/__pycache__/resnet.cpython-310.pyc b/video_diffusion/models/__pycache__/resnet.cpython-310.pyc index 3c96b923799d78f9ab084bd8d75ef5e7cb102763..f87892c9f0609db0e2fc518ed042e11b95ecc057 100644 Binary files a/video_diffusion/models/__pycache__/resnet.cpython-310.pyc and b/video_diffusion/models/__pycache__/resnet.cpython-310.pyc differ diff --git a/video_diffusion/models/__pycache__/unet_3d_blocks.cpython-310.pyc b/video_diffusion/models/__pycache__/unet_3d_blocks.cpython-310.pyc index 4699f2d699da41992da9b7487b93054969f90b28..a37df0d41200a8dc7c9ac4fc6a367171520daf7f 100644 Binary files a/video_diffusion/models/__pycache__/unet_3d_blocks.cpython-310.pyc and b/video_diffusion/models/__pycache__/unet_3d_blocks.cpython-310.pyc differ diff --git a/video_diffusion/models/__pycache__/unet_3d_condition.cpython-310.pyc b/video_diffusion/models/__pycache__/unet_3d_condition.cpython-310.pyc index 7efbb4612d0ce40c23e10be6427a1897ab57a952..e743490d2b66416e292674e3d34d3ddbef9f5524 100644 Binary files a/video_diffusion/models/__pycache__/unet_3d_condition.cpython-310.pyc and b/video_diffusion/models/__pycache__/unet_3d_condition.cpython-310.pyc differ diff --git a/video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc 
b/video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc index 5e8f17a60f626de179c873870bcdebdb08cb4135..d0b2402f83e885da49f16dedabbebd6881ed3773 100644 Binary files a/video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc and b/video_diffusion/pipelines/__pycache__/ddim_spatial_temporal.cpython-310.pyc differ diff --git a/video_diffusion/pipelines/__pycache__/stable_diffusion.cpython-310.pyc b/video_diffusion/pipelines/__pycache__/stable_diffusion.cpython-310.pyc index c450ef2ebfee34482441abeb5470a522e313da11..b8dca0188234db16e1a6ae412cea247d0597e970 100644 Binary files a/video_diffusion/pipelines/__pycache__/stable_diffusion.cpython-310.pyc and b/video_diffusion/pipelines/__pycache__/stable_diffusion.cpython-310.pyc differ diff --git a/video_diffusion/pipelines/__pycache__/validation_loop.cpython-310.pyc b/video_diffusion/pipelines/__pycache__/validation_loop.cpython-310.pyc index 23cde8ce8b90bbabecfabe7c813000374a7082c6..b4ea3890d85a264cbad0cf70e95f956d79c036ef 100644 Binary files a/video_diffusion/pipelines/__pycache__/validation_loop.cpython-310.pyc and b/video_diffusion/pipelines/__pycache__/validation_loop.cpython-310.pyc differ diff --git a/video_diffusion/pipelines/ddim_spatial_temporal.py b/video_diffusion/pipelines/ddim_spatial_temporal.py index 4469c5e5d565df30565371ebc8d6e964518eb073..f985a5c0fc1b51ee691cf68ba7b4f6b90cd8b316 100755 --- a/video_diffusion/pipelines/ddim_spatial_temporal.py +++ b/video_diffusion/pipelines/ddim_spatial_temporal.py @@ -18,7 +18,7 @@ from transformers import CLIPTextModel, CLIPTokenizer import torch.nn.functional as F from omegaconf import OmegaConf from video_diffusion.prompt_attention.attention_register import register_attention_control -from video_diffusion.prompt_attention.attention_util import ModulatedAttentionControl,ModulatedAttention_ControlEdit,Attention_Record_Processor +from video_diffusion.prompt_attention.attention_util import ST_Layout_Attn_Control,ST_Layout_Attn_ControlEdit,Attention_Record_Processor from video_diffusion.prompt_attention import attention_util from video_diffusion.prompt_attention.sd_study_utils import * from video_diffusion.prompt_attention.attention_store import AttentionStore @@ -142,6 +142,7 @@ class DDIMSpatioTemporalStableDiffusionPipeline(SpatioTemporalStableDiffusionPip control = None, controlnet_conditioning_scale=None, use_pnp=None, + cluster_inversion_feature = None, **kwargs, ): weight_dtype = image.dtype @@ -262,88 +263,90 @@ class DDIMSpatioTemporalStableDiffusionPipeline(SpatioTemporalStableDiffusionPip } else: attn_inversion_dict = None - ''' - inv_self_avg_dict={} - inv_cross_avg_dict={} - element_name = 'attn' - attn_size = 32 - for element_name in ['attn']: - inv_self_avg_dict[element_name]={} - inv_cross_avg_dict[element_name]={} - - self_attn_avg = editor.aggregate_attention(from_where=("up", "down", "mid"), - res=attn_size,is_cross=False) - - cross_attn_avg = editor.aggregate_attention(from_where=("up", "down", "mid"), - res=attn_size,is_cross=True) - - print('self_attn_avg',self_attn_avg.shape) - print('cross_attn_avg', cross_attn_avg.shape) - inv_self_avg_dict[element_name][attn_size]=self_attn_avg - inv_cross_avg_dict[element_name][attn_size]=cross_attn_avg - - os.makedirs(os.path.join(self.logdir, "attn_inv"), exist_ok=True) - os.makedirs(os.path.join(self.logdir, "sd_study"), exist_ok=True) - with open(os.path.join(self.logdir, - "attn_inv/inv_self_avg_dict.pkl"), - 'wb') as f: - pkl.dump(inv_self_avg_dict, f) - - with open(os.path.join(self.logdir, 
- "attn_inv/inv_cross_avg_dict.pkl"), - 'wb') as f: - pkl.dump(inv_cross_avg_dict, f) - - num_segments=3 - draw_pca(inv_self_avg_dict, resolution=32, dict_key='attn', - save_path=os.path.join(self.logdir, 'sd_study'), - special_name='inv_self') - - run_clusters(inv_self_avg_dict, resolution=32, dict_key='attn', - save_path=os.path.join(self.logdir, 'sd_study'), - special_name='inv_self',num_segments=num_segments) - cross_attn_visualization = attention_util.show_cross_attention_plus_org_img(self.tokenizer, source_prompt, - image, editor, 32, ["up", "down", "mid"], save_path= os.path.join(self.logdir,'sd_study'),attention_maps=cross_attn_avg) + if cluster_inversion_feature: + logger.info('cluster ddim inversion feature') + inv_self_avg_dict={} + inv_cross_avg_dict={} + element_name = 'attn' + attn_size = 32 + for element_name in ['attn']: + inv_self_avg_dict[element_name]={} + inv_cross_avg_dict[element_name]={} + + self_attn_avg = editor.aggregate_attention(from_where=("up", "down", "mid"), + res=attn_size,is_cross=False) + + cross_attn_avg = editor.aggregate_attention(from_where=("up", "down", "mid"), + res=attn_size,is_cross=True) + + print('self_attn_avg',self_attn_avg.shape) + print('cross_attn_avg', cross_attn_avg.shape) + inv_self_avg_dict[element_name][attn_size]=self_attn_avg + inv_cross_avg_dict[element_name][attn_size]=cross_attn_avg + + os.makedirs(os.path.join(self.logdir, "attn_inv"), exist_ok=True) + os.makedirs(os.path.join(self.logdir, "sd_study"), exist_ok=True) + with open(os.path.join(self.logdir, + "attn_inv/inv_self_avg_dict.pkl"), + 'wb') as f: + pkl.dump(inv_self_avg_dict, f) + + with open(os.path.join(self.logdir, + "attn_inv/inv_cross_avg_dict.pkl"), + 'wb') as f: + pkl.dump(inv_cross_avg_dict, f) + + num_segments=3 + draw_pca(inv_self_avg_dict, resolution=32, dict_key='attn', + save_path=os.path.join(self.logdir, 'sd_study'), + special_name='inv_self') + + run_clusters(inv_self_avg_dict, resolution=32, dict_key='attn', + save_path=os.path.join(self.logdir, 'sd_study'), + special_name='inv_self',num_segments=num_segments) + cross_attn_visualization = attention_util.show_cross_attention_plus_org_img(self.tokenizer, source_prompt, + image, editor, 32, ["up", "down", "mid"], save_path= os.path.join(self.logdir,'sd_study'),attention_maps=cross_attn_avg) - dict_key='attn' - special_name='inv_self' - resolution = 32 - threshold=0.1 - - tokenized_prompt = nltk.word_tokenize(source_prompt) - nouns = [(i, word) for (i, (word, pos)) in enumerate(nltk.pos_tag(tokenized_prompt)) if pos[:2] == 'NN'] - print(nouns) - - npy_name=f'cluster_{dict_key}_{resolution}_{special_name}.npy' - save_path=os.path.join(self.logdir, 'sd_study') - - abs_filename=os.path.join(self.logdir, "attn_inv", f"inv_cross_avg_dict.pkl") - inv_cross_avg_dict=read_pkl(abs_filename) - - video_cross_attention = inv_cross_avg_dict['attn'][32] - video_clusters=np.load(os.path.join(save_path, npy_name)) - - t = video_clusters.shape[0] - for i in range(t): - clusters = video_clusters[i] - cross_attention = video_cross_attention[i] - c2noun, c2mask = cluster2noun_(clusters, threshold, num_segments, nouns,cross_attention) - print('c2noun',c2noun) - merged_mask={} - for index in range(len(c2noun)): - # mask_ = merged_mask[class_name] - item=c2noun[index] - mask_ = c2mask[index] - mask_ = torch.from_numpy(mask_) - mask_ = F.interpolate(mask_.float().unsqueeze(0).unsqueeze(0), size=512, mode='nearest').round().bool().squeeze(0).squeeze(0) - - output_name = os.path.join(f"{save_path}", - f"frame_{i}_{item}_{index}.png") 
- save_mask(mask_, output_name) - ''' + dict_key='attn' + special_name='inv_self' + resolution = 32 + threshold=0.1 + + tokenized_prompt = nltk.word_tokenize(source_prompt) + nouns = [(i, word) for (i, (word, pos)) in enumerate(nltk.pos_tag(tokenized_prompt)) if pos[:2] == 'NN'] + print(nouns) + + npy_name=f'cluster_{dict_key}_{resolution}_{special_name}.npy' + save_path=os.path.join(self.logdir, 'sd_study') + + abs_filename=os.path.join(self.logdir, "attn_inv", f"inv_cross_avg_dict.pkl") + inv_cross_avg_dict=read_pkl(abs_filename) + + video_cross_attention = inv_cross_avg_dict['attn'][32] + video_clusters=np.load(os.path.join(save_path, npy_name)) + + t = video_clusters.shape[0] + for i in range(t): + clusters = video_clusters[i] + cross_attention = video_cross_attention[i] + c2noun, c2mask = cluster2noun_(clusters, threshold, num_segments, nouns,cross_attention) + print('c2noun',c2noun) + merged_mask={} + for index in range(len(c2noun)): + # mask_ = merged_mask[class_name] + item=c2noun[index] + mask_ = c2mask[index] + mask_ = torch.from_numpy(mask_) + mask_ = F.interpolate(mask_.float().unsqueeze(0).unsqueeze(0), size=512, mode='nearest').round().bool().squeeze(0).squeeze(0) + + output_name = os.path.join(f"{save_path}", + f"frame_{i}_{item}_{index}.png") + + save_mask(mask_, output_name) + return latents, attn_inversion_dict @@ -666,6 +669,8 @@ class DDIMSpatioTemporalStableDiffusionPipeline(SpatioTemporalStableDiffusionPip logdir: str=None, controlnet_conditioning_scale: float = 1.0, use_pnp: bool = False, + cluster_inversion_feature: bool = False, + vis_cross_attn: bool = False, attn_inversion_dict: dict=None, **kwargs, ): @@ -689,12 +694,11 @@ class DDIMSpatioTemporalStableDiffusionPipeline(SpatioTemporalStableDiffusionPip self.scheduler.set_timesteps(num_inference_steps, device=device) - if latents is None: latents, attn_inversion_dict = self.prepare_latents_ddim_inverted( image, batch_size, source_prompt, do_classifier_free_guidance, generator, - control, controlnet_conditioning_scale, use_pnp + control, controlnet_conditioning_scale, use_pnp, cluster_inversion_feature ) print("use inversion latents") @@ -709,7 +713,7 @@ class DDIMSpatioTemporalStableDiffusionPipeline(SpatioTemporalStableDiffusionPip #============do visualization for st-layout attn===============# self.store_controller = attention_util.AttentionStore() - editor = ModulatedAttention_ControlEdit(text_cond=text_cond,sreg_maps=sreg_maps,creg_maps=creg_maps,reg_sizes=reg_sizes,reg_sizes_c=reg_sizes_c, + editor = ST_Layout_Attn_ControlEdit(text_cond=text_cond,sreg_maps=sreg_maps,creg_maps=creg_maps,reg_sizes=reg_sizes,reg_sizes_c=reg_sizes_c, time_steps=time_steps,clip_length=clip_length,attention_type=attention_type, additional_attention_store=self.store_controller, save_self_attention = True, @@ -719,7 +723,7 @@ class DDIMSpatioTemporalStableDiffusionPipeline(SpatioTemporalStableDiffusionPip attention_util.register_attention_control(self, editor, text_cond, clip_length, downsample_height,downsample_width,ddim_inversion=False) #============do visualization for st-layout attn===============# - # editor = ModulatedAttentionControl(text_cond=text_cond,sreg_maps=sreg_maps,creg_maps=creg_maps,reg_sizes=reg_sizes,reg_sizes_c=reg_sizes_c, + # editor = ST_Layout_Attn_Control(text_cond=text_cond,sreg_maps=sreg_maps,creg_maps=creg_maps,reg_sizes=reg_sizes,reg_sizes_c=reg_sizes_c, # time_steps=time_steps,clip_length=clip_length,attention_type=attention_type) # register_attention_control(self, editor, text_cond, 
clip_length,downsample_height,downsample_width,ddim_inversion=False) @@ -813,9 +817,10 @@ class DDIMSpatioTemporalStableDiffusionPipeline(SpatioTemporalStableDiffusionPip ### vis cross attn # image shape fchw - # save_path = os.path.join(logdir,'visualization_denoise') - # os.makedirs(save_path, exist_ok=True) - # attention_output = attention_util.show_cross_attention_plus_org_img(self.tokenizer,prompt, image, editor, 32, ["up","down"],save_path=save_path) + if vis_cross_attn: + save_path = os.path.join(logdir,'visualization_denoise') + os.makedirs(save_path, exist_ok=True) + attention_output = attention_util.show_cross_attention_plus_org_img(self.tokenizer,prompt, image, editor, 32, ["up","down"],save_path=save_path) # 8. Post-processing image = self.decode_latents(latents) diff --git a/video_diffusion/pipelines/validation_loop.py b/video_diffusion/pipelines/validation_loop.py index 6cd0e3c6a0fb86c9adf0942fb4eb3c8eeecdb22b..07dabbeddb11676a441a785016caaab3cd9f2e70 100644 --- a/video_diffusion/pipelines/validation_loop.py +++ b/video_diffusion/pipelines/validation_loop.py @@ -130,6 +130,8 @@ class SampleLogger: inject_step = None, old_qk = None, use_pnp = None, + cluster_inversion_feature = None, + vis_cross_attn = None, attn_inversion_dict = None, ): torch.cuda.empty_cache() @@ -173,8 +175,9 @@ class SampleLogger: inject_step=inject_step, old_qk=old_qk, use_pnp=use_pnp, + cluster_inversion_feature= cluster_inversion_feature, + vis_cross_attn = vis_cross_attn, attn_inversion_dict=attn_inversion_dict, - # Put the source prompt at the first one, when using p2p ) sequence = sequence_return.images[0] diff --git a/video_diffusion/prompt_attention/__pycache__/attention_register.cpython-310.pyc b/video_diffusion/prompt_attention/__pycache__/attention_register.cpython-310.pyc index 35a9faa2ce1929b11f6a5a5e2fe7c0dbec55f471..e59501e0e6953744547fafc9556c1a58e416e18c 100644 Binary files a/video_diffusion/prompt_attention/__pycache__/attention_register.cpython-310.pyc and b/video_diffusion/prompt_attention/__pycache__/attention_register.cpython-310.pyc differ diff --git a/video_diffusion/prompt_attention/__pycache__/attention_store.cpython-310.pyc b/video_diffusion/prompt_attention/__pycache__/attention_store.cpython-310.pyc index 82a16f46138bdbf4a9657668e8501c44851512da..aa646452b3aaf05a967cf573002ed2d7dca64234 100644 Binary files a/video_diffusion/prompt_attention/__pycache__/attention_store.cpython-310.pyc and b/video_diffusion/prompt_attention/__pycache__/attention_store.cpython-310.pyc differ diff --git a/video_diffusion/prompt_attention/__pycache__/attention_util.cpython-310.pyc b/video_diffusion/prompt_attention/__pycache__/attention_util.cpython-310.pyc index b51e86bfd76636691f774af4243150b8053e7ff2..c4b8f7469addb47338bb54e1f2d3c1fc1d769f84 100644 Binary files a/video_diffusion/prompt_attention/__pycache__/attention_util.cpython-310.pyc and b/video_diffusion/prompt_attention/__pycache__/attention_util.cpython-310.pyc differ diff --git a/video_diffusion/prompt_attention/__pycache__/free_lunch_utils.cpython-310.pyc b/video_diffusion/prompt_attention/__pycache__/free_lunch_utils.cpython-310.pyc index 05d59dea9d9f17c7ea062d083482dc81b5e94003..d18f43fa9f6cefae147504631e43245f51c840df 100644 Binary files a/video_diffusion/prompt_attention/__pycache__/free_lunch_utils.cpython-310.pyc and b/video_diffusion/prompt_attention/__pycache__/free_lunch_utils.cpython-310.pyc differ diff --git a/video_diffusion/prompt_attention/__pycache__/ptp_utils.cpython-310.pyc 
b/video_diffusion/prompt_attention/__pycache__/ptp_utils.cpython-310.pyc index 1c599a8d4c2469fccc5e1f151fe9f49d4362b6de..5740517fafdcb25b8aec120e350934033bbe0e22 100644 Binary files a/video_diffusion/prompt_attention/__pycache__/ptp_utils.cpython-310.pyc and b/video_diffusion/prompt_attention/__pycache__/ptp_utils.cpython-310.pyc differ diff --git a/video_diffusion/prompt_attention/__pycache__/sd_study_utils.cpython-310.pyc b/video_diffusion/prompt_attention/__pycache__/sd_study_utils.cpython-310.pyc index 93cd5a7376204949f1656a3897ec108239eeee87..228d06ffc3cb9cc452d165c3c9ec1b7d89d31430 100644 Binary files a/video_diffusion/prompt_attention/__pycache__/sd_study_utils.cpython-310.pyc and b/video_diffusion/prompt_attention/__pycache__/sd_study_utils.cpython-310.pyc differ diff --git a/video_diffusion/prompt_attention/__pycache__/visualization.cpython-310.pyc b/video_diffusion/prompt_attention/__pycache__/visualization.cpython-310.pyc index a9daf70513afa81ef0f353499098c4e5057d9e59..2778897969e251fe263adcc88bcb3fe7ae3aca90 100644 Binary files a/video_diffusion/prompt_attention/__pycache__/visualization.cpython-310.pyc and b/video_diffusion/prompt_attention/__pycache__/visualization.cpython-310.pyc differ diff --git a/video_diffusion/prompt_attention/attention_store.py b/video_diffusion/prompt_attention/attention_store.py index edb0611830f732105b9a1f757c5a5295e5b54b5a..bb8bf3e3a7b912eb117d353517302cd9da63d2ac 100644 --- a/video_diffusion/prompt_attention/attention_store.py +++ b/video_diffusion/prompt_attention/attention_store.py @@ -108,6 +108,31 @@ class AttentionStore(AttentionControl): average_attention = {key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store} return average_attention + def aggregate_attention(self, from_where: List[str], res: int, is_cross: bool, element_name='attn') -> torch.Tensor: + """Aggregates the attention across the different layers and heads at the specified resolution.""" + out = [] + num_pixels = res ** 2 + attention_maps = self.get_average_attention() + for location in from_where: + for item in attention_maps[f"{location}_{'cross' if is_cross else 'self'}"]: + print('is cross',is_cross) + print('item',item.shape) + #cross (t,head,res^2,77) + #self (head,t, res^2,res^2) + if is_cross: + t, h, res_sq, token = item.shape + if item.shape[2] == num_pixels: + cross_maps = item.reshape(t, -1, res, res, item.shape[-1]) + out.append(cross_maps) + else: + h, t, res_sq, res_sq = item.shape + if item.shape[2] == num_pixels: + self_item = item.permute(1, 0, 2, 3) #(t,head,res^2,res^2) + self_maps = self_item.reshape(t, h, res, res, self_item.shape[-1]) + out.append(self_maps) + out = torch.cat(out, dim=-4) #average head attention + out = out.sum(-4) / out.shape[-4] + return out def reset(self): super(AttentionStore, self).reset() @@ -129,4 +154,4 @@ class AttentionStore(AttentionControl): self.attention_store = {} self.save_self_attention = save_self_attention self.latents_store = [] - self.attention_store_all_step = [] + self.attention_store_all_step = [] \ No newline at end of file diff --git a/video_diffusion/prompt_attention/attention_util.py b/video_diffusion/prompt_attention/attention_util.py index 38ea7ccbe94b26bb3e9d84d202dd3a455ded101d..b05958bb99223dc4c046be38dfa89fea88235151 100644 --- a/video_diffusion/prompt_attention/attention_util.py +++ b/video_diffusion/prompt_attention/attention_util.py @@ -13,7 +13,7 @@ import torch import torch.nn.functional as F import video_diffusion.prompt_attention.ptp_utils as ptp_utils 
-from video_diffusion.prompt_attention.visualization import show_cross_attention,show_cross_attention_plus_org_img,show_self_attention_comp +from video_diffusion.prompt_attention.visualization import show_cross_attention,show_cross_attention_plus_org_img,show_self_attention_comp,aggregate_attention from video_diffusion.prompt_attention.attention_store import AttentionStore, AttentionControl from video_diffusion.prompt_attention.attention_register import register_attention_control device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') @@ -154,18 +154,22 @@ def identify_self_attention_max_min(sim, video, h_index:int, w_index:int, res: i -class ModulatedAttentionControl(AttentionControl, abc.ABC): +class ST_Layout_Attn_Control(AttentionControl, abc.ABC): def __init__(self, end_step=15, total_steps=50, step_idx=None, text_cond=None, sreg_maps=None, creg_maps=None, reg_sizes=None,reg_sizes_c=None, time_steps=None,clip_length=None,attention_type=None): """ - Mutual self-attention control for Stable-Diffusion model + Spatial-Temporal Layout-guided Attention (ST-Layout Attn) for Stable-Diffusion model + note: without the cross-attention weight visualization function. Args: - start_step: the step to start mutual self-attention control - start_layer: the layer to start mutual self-attention control - layer_idx: list of the layers to apply mutual self-attention control - step_idx: list the steps to apply mutual self-attention control + end_step: the step to end ST-Layout Attn control total_steps: the total number of steps - model_type: the model type, SD or SDXL + step_idx: list of the steps to apply ST-Layout Attn control + text_cond: discrete text embedding for each region. + sreg_maps: spatial-temporal self-attention qk condition maps. + creg_maps: cross-attention qk condition maps. + reg_sizes/reg_sizes_c: size regularization maps for each instance in self_attn/cross_attention + clip_length: number of frames in the video + attention_type: FullyFrameAttention_sliced_attn/FullyFrameAttention/SparseCausalAttention """ super().__init__() self.total_steps = total_steps @@ -294,14 +298,7 @@ class Attention_Record_Processor(AttentionStore, abc.ABC): -class ModulatedAttention_ControlEdit(AttentionStore, abc.ABC): - """Decide self or cross-attention. Call the reweighting cross attention module - - Args: - AttentionStore (_type_): ([1, 4, 8, 64, 64]) - abc (_type_): [8, 8, 1024, 77] - """ - +class ST_Layout_Attn_ControlEdit(AttentionStore, abc.ABC): def __init__(self, end_step=15, total_steps=50, step_idx=None, text_cond=None, sreg_maps=None, creg_maps=None, reg_sizes=None,reg_sizes_c=None, time_steps=None, clip_length=None,attention_type=None, @@ -311,16 +308,20 @@ class ModulatedAttention_ControlEdit(AttentionStore, abc.ABC): video = None, ): """ - Mutual self-attention control for Stable-Diffusion model + Spatial-Temporal Layout-guided Attention (ST-Layout Attn) for Stable-Diffusion model + note: with the cross-attention weight visualization function. Args: - start_step: the step to start mutual self-attention control - start_layer: the layer to start mutual self-attention control - layer_idx: list of the layers to apply mutual self-attention control - step_idx: list the steps to apply mutual self-attention control + end_step: the step to end ST-Layout Attn control total_steps: the total number of steps - model_type: the model type, SD or SDXL + step_idx: list of the steps to apply ST-Layout Attn control + text_cond: discrete text embedding for each region.
+ sreg_maps: spatial-temporal self-attention qk condition maps. + creg_maps: cross-attention qk condition maps. + reg_sizes/reg_sizes_c: size regularization maps for each instance in self_attn/cross_attention + clip_length: number of frames in the video + attention_type: FullyFrameAttention_sliced_attn/FullyFrameAttention/SparseCausalAttention """ - super(ModulatedAttention_ControlEdit, self).__init__( + super(ST_Layout_Attn_ControlEdit, self).__init__( save_self_attention=save_self_attention, disk_store=disk_store) self.total_steps = total_steps @@ -354,7 +355,7 @@ class ST_Layout_Attn_ControlEdit(AttentionStore, abc.ABC): def forward(self, sim, is_cross: bool, place_in_unet: str,**kwargs): - super(ModulatedAttention_ControlEdit, self).forward(sim, is_cross, place_in_unet,**kwargs) + super(ST_Layout_Attn_ControlEdit, self).forward(sim, is_cross, place_in_unet,**kwargs) # print("self.cur_step",self.cur_step) key = f"{place_in_unet}_{'cross' if is_cross else 'self'}" diff --git a/video_diffusion/prompt_attention/sd_study_utils.py b/video_diffusion/prompt_attention/sd_study_utils.py index 1a454a6273ceb5fab39b5d020e812b44dc69cce1..e546849271c3262c5781f232cdb6887f1d59137e 100644 --- a/video_diffusion/prompt_attention/sd_study_utils.py +++ b/video_diffusion/prompt_attention/sd_study_utils.py @@ -60,24 +60,66 @@ def cluster2noun_mod(clusters, background_segment_threshold, num_segments, nouns result_mask[c]=cluster_mask return result, result_mask -def cluster2noun_(clusters, background_segment_threshold, num_segments, nouns, cross_attention): - REPEAT=clusters.shape[0]/cross_attention.shape[0] +def cluster2noun_(clusters, background_segment_threshold, num_segments, nouns, cross_attention, attention_threshold=0.2): + REPEAT = clusters.shape[0] // cross_attention.shape[0] result = {} - result_mask={} + result_mask = {} + print('cross_attention',cross_attention.shape) + + # Extract the noun token indices and their corresponding attention maps nouns_indices = [index for (index, word) in nouns] nouns_maps = cross_attention.cpu().numpy()[:, :, [i + 1 for i in nouns_indices]] - normalized_nouns_maps = np.zeros_like(nouns_maps).repeat(REPEAT, axis=0).repeat(REPEAT, axis=1) - for i in range(nouns_maps.shape[-1]): - curr_noun_map = nouns_maps[:, :, i].repeat(REPEAT, axis=0).repeat(REPEAT, axis=1) - normalized_nouns_maps[:, :, i] = (curr_noun_map - np.abs(curr_noun_map.min())) / curr_noun_map.max() + print('nouns_maps', nouns_maps.shape) + normalized_nouns_maps = nouns_maps + #normalized_nouns_maps = np.zeros_like(nouns_maps).repeat(REPEAT, axis=0).repeat(REPEAT, axis=1) + + # Normalize the attention maps and apply the threshold + # for i in range(nouns_maps.shape[-1]): + # curr_noun_map = nouns_maps[:, :, i].repeat(REPEAT, axis=0).repeat(REPEAT, axis=1) + # normalized_map = (curr_noun_map - np.abs(curr_noun_map.min())) / curr_noun_map.max() + + # # Apply the threshold: set values below it to 0 + # #normalized_map[normalized_map < attention_threshold] = 0 + + # normalized_nouns_maps[:, :, i] = normalized_map + + print('normalized_nouns_maps', normalized_nouns_maps.shape) + + #show_normalized_nouns_maps(normalized_nouns_maps, nouns, logdir) + + # Track nouns that have already been assigned + assigned_nouns = set() + for c in range(num_segments): cluster_mask = np.zeros_like(clusters) cluster_mask[clusters == c] = 1 + score_maps = [cluster_mask * normalized_nouns_maps[:, :, i] for i in range(len(nouns_indices))] scores = [score_map.sum() / cluster_mask.sum() for score_map in score_maps] - result[c] = nouns[np.argmax(np.array(scores))] if max(scores) > background_segment_threshold else "BG" - result_mask[c]=cluster_mask + + # Pick the highest-scoring noun that has not been assigned yet + sorted_scores_indices =
np.argsort(scores)[::-1] + assigned_word = None + + for idx in sorted_scores_indices: + if scores[idx] > background_segment_threshold and nouns[idx] not in assigned_nouns: + assigned_word = nouns[idx] + assigned_nouns.add(nouns[idx]) # mark this noun as assigned + break + + # If no suitable noun was found, force-assign the highest-scoring unassigned noun + if assigned_word is None and len(sorted_scores_indices) > 0: + for idx in sorted_scores_indices: + if nouns[idx] not in assigned_nouns: + assigned_word = nouns[idx] + assigned_nouns.add(nouns[idx]) # mark this noun as assigned + break + + if assigned_word: + result[c] = assigned_word + result_mask[c] = cluster_mask + return result, result_mask
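Note on the sd_study_utils.py hunk above: the rewritten `cluster2noun_` replaces the old per-cluster argmax, which could hand the same noun to several clusters, with a greedy assignment. Each cluster is scored against every noun's cross-attention map, and the best-scoring noun that is still unclaimed wins, with a forced fallback when every noun above the threshold is already taken. The snippet below is a minimal, standalone sketch of that logic; the function name `assign_nouns_to_clusters`, the array shapes, and the toy inputs are illustrative assumptions, not the repository's actual API.

```python
import numpy as np

def assign_nouns_to_clusters(clusters, noun_maps, nouns, threshold=0.1):
    """Greedily assign each cluster id to the noun whose attention map scores highest on it.

    clusters:  (H, W) integer array of segment ids in [0, K).
    noun_maps: (H, W, N) per-noun attention maps on the same spatial grid.
    nouns:     list of N (token_index, word) tuples, e.g. from nltk.pos_tag filtering.
    """
    num_segments = int(clusters.max()) + 1
    assigned = set()
    result, result_mask = {}, {}
    for c in range(num_segments):
        mask = (clusters == c).astype(noun_maps.dtype)
        # Mean attention of every noun inside this cluster.
        scores = [(mask * noun_maps[:, :, i]).sum() / mask.sum() for i in range(len(nouns))]
        order = np.argsort(scores)[::-1]  # highest score first
        word = None
        # Prefer an unassigned noun whose score clears the background threshold.
        for idx in order:
            if scores[idx] > threshold and nouns[idx] not in assigned:
                word = nouns[idx]
                break
        # Otherwise force the best-scoring noun that is still free.
        if word is None:
            for idx in order:
                if nouns[idx] not in assigned:
                    word = nouns[idx]
                    break
        if word is not None:
            assigned.add(word)
            result[c] = word
            result_mask[c] = mask
    return result, result_mask

# Toy usage: two 4x4 clusters and two nouns whose attention peaks on opposite halves.
clusters = np.zeros((4, 4), dtype=int)
clusters[:, 2:] = 1
noun_maps = np.zeros((4, 4, 2))
noun_maps[:, :2, 0] = 1.0   # 'man' attends to the left half
noun_maps[:, 2:, 1] = 1.0   # 'tree' attends to the right half
print(assign_nouns_to_clusters(clusters, noun_maps, [(0, 'man'), (2, 'tree')]))
```

In the repository itself the noun maps come from the averaged cross-attention recorded during DDIM inversion and the cluster map from `run_clusters`; the toy arrays here only stand in for those so the sketch runs on its own.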