diff --git a/EdgeCape/VERSION b/EdgeCape/VERSION new file mode 100644 index 0000000000000000000000000000000000000000..0ea3a944b399d25f7e1b8fe684d754eb8da9fe7f --- /dev/null +++ b/EdgeCape/VERSION @@ -0,0 +1 @@ +0.2.0 diff --git a/EdgeCape/__init__.py b/EdgeCape/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d12119c91b4a54a136b3a13a5a695bfa90d27ea8 --- /dev/null +++ b/EdgeCape/__init__.py @@ -0,0 +1,3 @@ +from .core import * # noqa +from .datasets import * # noqa +from .models import * # noqa diff --git a/EdgeCape/__pycache__/__init__.cpython-39.pyc b/EdgeCape/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbaf1a61d0188e43097b7cd4adf0d9542a9f3178 Binary files /dev/null and b/EdgeCape/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/apis/__init__.py b/EdgeCape/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..500c844f99bf7725c185e94c289ccf5613d09da5 --- /dev/null +++ b/EdgeCape/apis/__init__.py @@ -0,0 +1,5 @@ +from .train import train_model + +__all__ = [ + 'train_model' +] diff --git a/EdgeCape/apis/__pycache__/__init__.cpython-39.pyc b/EdgeCape/apis/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a92d25ced6e51a27dc7c78a1609980cff0b45372 Binary files /dev/null and b/EdgeCape/apis/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/apis/__pycache__/test.cpython-39.pyc b/EdgeCape/apis/__pycache__/test.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce1b2e7a94659701418f4c76e640a0f79f64b183 Binary files /dev/null and b/EdgeCape/apis/__pycache__/test.cpython-39.pyc differ diff --git a/EdgeCape/apis/__pycache__/train.cpython-39.pyc b/EdgeCape/apis/__pycache__/train.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24b3ed6df4a3eb358d92637846f83dc9787493ca Binary files /dev/null and b/EdgeCape/apis/__pycache__/train.cpython-39.pyc differ diff --git a/EdgeCape/apis/test.py b/EdgeCape/apis/test.py new file mode 100644 index 0000000000000000000000000000000000000000..a28324ffe4c154bf5af4a5d1226d01c13d527124 --- /dev/null +++ b/EdgeCape/apis/test.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import pickle +import shutil +import tempfile + +import mmcv +import numpy as np +import torch +import torch.distributed as dist +from mmcv.runner import get_dist_info + + +def single_gpu_test(model, data_loader): + """Test model with a single gpu. + + This method tests model with a single gpu and displays test progress bar. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + + + Returns: + list: The prediction results. 
+ """ + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for data in data_loader: + with torch.no_grad(): + result = model(return_loss=False, **data) + batch_size = len(next(iter(data.values()))[0]) + # results.append(result) + if 'preds' in result: + for i in range(batch_size): + results.append({ + 'preds': result['preds'][i][None], + 'boxes': result['boxes'][i][None], + 'bbox_ids': [result['bbox_ids'][i]], + 'image_paths': [result['image_paths'][i]], + }) + # use the first key as main key to calculate the batch size + # for _ in range(batch_size): + prog_bar.update(batch_size) + return results + + +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + for data in data_loader: + with torch.no_grad(): + result = model(return_loss=False, **data) + results.append(result) + + if rank == 0: + # use the first key as main key to calculate the batch size + batch_size = len(next(iter(data.values()))) + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + return results + + +def collect_results_cpu(result_part, size, tmpdir=None): + """Collect results in cpu mode. + + It saves the results on different gpus to 'tmpdir' and collects + them by the rank 0 worker. + + Args: + result_part (list): Results to be collected + size (int): Result size. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. Default: None + + Returns: + list: Ordered results. 
+ """ + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # synchronizes all processes to make sure tmpdir exist + dist.barrier() + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + # synchronizes all processes for loading pickle file + dist.barrier() + # collect all parts + if rank != 0: + return None + + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + """Collect results in gpu mode. + + It encodes results to gpu tensors and use gpu communication for results + collection. + + Args: + result_part (list): Results to be collected + size (int): Result size. + + Returns: + list: Ordered results. + """ + + rank, world_size = get_dist_info() + # dump result part to tensor with pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device='cuda') + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') + part_send[:shape_tensor[0]] = part_tensor + part_recv_list = [ + part_tensor.new_zeros(shape_max) for _ in range(world_size) + ] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_list.append( + pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results + return None diff --git a/EdgeCape/apis/train.py b/EdgeCape/apis/train.py new file mode 100644 index 0000000000000000000000000000000000000000..b3113e274e96890b32b5a6aaba31532d2258a743 --- /dev/null +++ b/EdgeCape/apis/train.py @@ -0,0 +1,124 @@ +import os +import torch +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook, + build_optimizer) + +from mmpose.core import DistEvalHook, EvalHook, Fp16OptimizerHook +from mmpose.datasets import build_dataloader +from mmpose.utils import get_root_logger +from EdgeCape.core.custom_hooks.shuffle_hooks import ShufflePairedSamplesHook + +def train_model(model, + dataset, + val_dataset, + cfg, + distributed=False, + validate=False, + 
timestamp=None, + meta=None): + """Train model entry function. + + Args: + model (nn.Module): The model to be trained. + dataset (Dataset): Train dataset. + cfg (dict): The config dict for training. + distributed (bool): Whether to use distributed training. + Default: False. + validate (bool): Whether to do evaluation. Default: False. + timestamp (str | None): Local time for runner. Default: None. + meta (dict | None): Meta dict to record some important information. + Default: None + """ + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + dataloader_setting = dict( + samples_per_gpu=cfg.data.get('samples_per_gpu', {}), + workers_per_gpu=cfg.data.get('workers_per_gpu', {}), + # cfg.gpus will be ignored if distributed + num_gpus=len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + pin_memory=False, + ) + dataloader_setting = dict(dataloader_setting, + **cfg.data.get('train_dataloader', {})) + + data_loaders = [ + build_dataloader(ds, **dataloader_setting) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', True) # NOTE: True has been modified to False for faster training. + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + runner = EpochBasedRunner( + model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta) + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + if distributed: + runner.register_hook(DistSamplerSeedHook()) + + shuffle_cfg = cfg.get('shuffle_cfg', None) + if shuffle_cfg is not None: + for data_loader in data_loaders: + runner.register_hook(ShufflePairedSamplesHook(data_loader, **shuffle_cfg)) + + # register eval hooks + if validate: + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['res_folder'] = os.path.join(cfg.work_dir, eval_cfg['res_folder']) + dataloader_setting = dict( + # samples_per_gpu=cfg.data.get('samples_per_gpu', {}), + samples_per_gpu=1, + workers_per_gpu=cfg.data.get('workers_per_gpu', {}), + # cfg.gpus will be ignored if distributed + num_gpus=len(cfg.gpu_ids), + dist=distributed, + shuffle=False, + pin_memory=False, + ) + dataloader_setting = dict(dataloader_setting, + **cfg.data.get('val_dataloader', {})) + val_dataloader = build_dataloader(val_dataset, **dataloader_setting) + eval_hook = DistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, 
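The dataloader configuration in `train_model` is built by layering dict defaults: base settings derived from `cfg.data`, then overridden by an optional `cfg.data.train_dataloader` block. A small sketch of that merge pattern with a hypothetical config dict:

```python
# Hypothetical cfg.data-style dict; keys mirror those used in train_model.
data_cfg = {
    'samples_per_gpu': 16,
    'workers_per_gpu': 2,
    'train_dataloader': {'samples_per_gpu': 8},  # per-loader override
}

dataloader_setting = dict(
    samples_per_gpu=data_cfg.get('samples_per_gpu', {}),
    workers_per_gpu=data_cfg.get('workers_per_gpu', {}),
    dist=False,
    shuffle=True,
)
# Later keys win, so the train_dataloader block overrides the defaults.
dataloader_setting = dict(dataloader_setting,
                          **data_cfg.get('train_dataloader', {}))
assert dataloader_setting['samples_per_gpu'] == 8
```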
cfg.workflow, cfg.total_epochs) diff --git a/EdgeCape/core/__init__.py b/EdgeCape/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/EdgeCape/core/__init__.py @@ -0,0 +1 @@ + diff --git a/EdgeCape/core/__pycache__/__init__.cpython-39.pyc b/EdgeCape/core/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..621d6daae565649aeb11af5f270aaa93cfb7c936 Binary files /dev/null and b/EdgeCape/core/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/core/custom_hooks/__pycache__/shuffle_hooks.cpython-39.pyc b/EdgeCape/core/custom_hooks/__pycache__/shuffle_hooks.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16d7f9a3d09ef946f8d3b4872569f0efb0a61a51 Binary files /dev/null and b/EdgeCape/core/custom_hooks/__pycache__/shuffle_hooks.cpython-39.pyc differ diff --git a/EdgeCape/core/custom_hooks/shuffle_hooks.py b/EdgeCape/core/custom_hooks/shuffle_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..b4fa43b816ad2c8fb3f93a587d25ac4e02e4e18b --- /dev/null +++ b/EdgeCape/core/custom_hooks/shuffle_hooks.py @@ -0,0 +1,28 @@ +from mmcv.runner import Hook +from torch.utils.data import DataLoader +from mmpose.utils import get_root_logger + +class ShufflePairedSamplesHook(Hook): + """Non-Distributed ShufflePairedSamples. + After each training epoch, run FewShotKeypointDataset.random_paired_samples() + """ + + def __init__(self, + dataloader, + interval=1): + if not isinstance(dataloader, DataLoader): + raise TypeError(f'dataloader must be a pytorch DataLoader, ' + f'but got {type(dataloader)}') + + self.dataloader = dataloader + self.interval = interval + self.logger = get_root_logger() + + def after_train_epoch(self, runner): + """Called after every training epoch to evaluate the results.""" + if not self.every_n_epochs(runner, self.interval): + return + # self.logger.info("Run random_paired_samples()") + # self.logger.info(f"Before: {self.dataloader.dataset.paired_samples[0]}") + self.dataloader.dataset.random_paired_samples() + # self.logger.info(f"After: {self.dataloader.dataset.paired_samples[0]}") diff --git a/EdgeCape/datasets/__init__.py b/EdgeCape/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..25529624cf32c145ca0bf686af2899bb386d5d28 --- /dev/null +++ b/EdgeCape/datasets/__init__.py @@ -0,0 +1,3 @@ +from .builder import * # noqa +from .datasets import * # noqa +from .pipelines import * # noqa diff --git a/EdgeCape/datasets/__pycache__/__init__.cpython-39.pyc b/EdgeCape/datasets/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4de4c2a0d6795b20c1e4f4c84dd001473552f0bb Binary files /dev/null and b/EdgeCape/datasets/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/datasets/__pycache__/builder.cpython-39.pyc b/EdgeCape/datasets/__pycache__/builder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30750e1d1d6a20d048432ded43899d35cc9af8a3 Binary files /dev/null and b/EdgeCape/datasets/__pycache__/builder.cpython-39.pyc differ diff --git a/EdgeCape/datasets/builder.py b/EdgeCape/datasets/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..3468bf9ea271a54a04666699c01a585bd4c387d7 --- /dev/null +++ b/EdgeCape/datasets/builder.py @@ -0,0 +1,55 @@ +from mmcv.utils import build_from_cfg +from torch.utils.data.dataset import 
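`ShufflePairedSamplesHook` simply calls the training dataset's `random_paired_samples()` at the end of every `interval`-th epoch, so support/query pairs are re-drawn between epochs. A minimal behavioural sketch using stub runner/dataset classes (both hypothetical) instead of a real mmcv training loop; the modulo condition mirrors the `every_n_epochs` check:

```python
class FakeDataset:
    """Stand-in for a dataset exposing random_paired_samples()."""
    def __init__(self):
        self.reshuffles = 0
    def random_paired_samples(self):
        self.reshuffles += 1

class FakeRunner:
    def __init__(self, epoch):
        self.epoch = epoch

def after_train_epoch(runner, dataset, interval=1):
    # Fire only every `interval` epochs, as the hook does.
    if (runner.epoch + 1) % interval != 0:
        return
    dataset.random_paired_samples()

ds = FakeDataset()
for epoch in range(4):
    after_train_epoch(FakeRunner(epoch), ds, interval=2)
assert ds.reshuffles == 2  # re-shuffled after the 2nd and 4th epoch
```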
ConcatDataset + +from mmpose.datasets.dataset_wrappers import RepeatDataset +from mmpose.datasets.builder import DATASETS + + +def _concat_cfg(cfg): + replace = ['ann_file', 'img_prefix'] + channels = ['num_joints', 'dataset_channel'] + concat_cfg = [] + for i in range(len(cfg['type'])): + cfg_tmp = cfg.deepcopy() + cfg_tmp['type'] = cfg['type'][i] + for item in replace: + assert item in cfg_tmp + assert len(cfg['type']) == len(cfg[item]), (cfg[item]) + cfg_tmp[item] = cfg[item][i] + for item in channels: + assert item in cfg_tmp['data_cfg'] + assert len(cfg['type']) == len(cfg['data_cfg'][item]) + cfg_tmp['data_cfg'][item] = cfg['data_cfg'][item][i] + concat_cfg.append(cfg_tmp) + return concat_cfg + + +def _check_vaild(cfg): + replace = ['num_joints', 'dataset_channel'] + if isinstance(cfg['data_cfg'][replace[0]], (list, tuple)): + for item in replace: + cfg['data_cfg'][item] = cfg['data_cfg'][item][0] + return cfg + + +def build_dataset(cfg, default_args=None): + """Build a dataset from config dict. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + default_args (dict, optional): Default initialization arguments. + Default: None. + + Returns: + Dataset: The constructed dataset. + """ + if isinstance(cfg['type'], (list, tuple)): # In training, type=TransformerPoseDataset + dataset = ConcatDataset( + [build_dataset(c, default_args) for c in _concat_cfg(cfg)]) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + build_dataset(cfg['dataset'], default_args), cfg['times']) + else: + cfg = _check_vaild(cfg) + dataset = build_from_cfg(cfg, DATASETS, default_args) + return dataset diff --git a/EdgeCape/datasets/datasets/__init__.py b/EdgeCape/datasets/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..21977d38975ec150e01f01e4ee44ffbfc6ded25e --- /dev/null +++ b/EdgeCape/datasets/datasets/__init__.py @@ -0,0 +1,6 @@ +from .mp100 import (FewShotKeypointDataset, FewShotBaseDataset, + TransformerBaseDataset, TransformerPoseDataset,) + +__all__ = ['FewShotBaseDataset', 'FewShotKeypointDataset', + 'TransformerBaseDataset', 'TransformerPoseDataset', + ] diff --git a/EdgeCape/datasets/datasets/__pycache__/__init__.cpython-39.pyc b/EdgeCape/datasets/datasets/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fc96fe82b2013fb373442481f9c23ac0cf483de Binary files /dev/null and b/EdgeCape/datasets/datasets/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__init__.py b/EdgeCape/datasets/datasets/mp100/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..17c22428d55a700d541e27d8f2d5b0c168e9c693 --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/__init__.py @@ -0,0 +1,13 @@ +from .fewshot_dataset import FewShotKeypointDataset +from .fewshot_base_dataset import FewShotBaseDataset +from .transformer_dataset import TransformerPoseDataset +from .transformer_base_dataset import TransformerBaseDataset +from .test_base_dataset import TestBaseDataset +from .test_dataset import TestPoseDataset +from .custom_test_dataset import CustomTestPoseDataset + +__all__ = [ + 'FewShotKeypointDataset', 'FewShotBaseDataset', + 'TransformerPoseDataset', 'TransformerBaseDataset', + 'TestBaseDataset', 'TestPoseDataset', 'CustomTestPoseDataset' +] diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/__init__.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 
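When `cfg['type']` is a list, `_concat_cfg` fans a single config out into one config per dataset by indexing its list-valued fields, and `build_dataset` wraps the results in a `ConcatDataset`. A simplified sketch of that fan-out using plain dicts (the real code works on an mmcv `Config` and also splits the per-dataset `data_cfg` channels):

```python
import copy

def split_concat_cfg(cfg, list_keys=('ann_file', 'img_prefix')):
    """Turn one config whose 'type' is a list into per-dataset configs."""
    per_dataset = []
    for i, dtype in enumerate(cfg['type']):
        sub = copy.deepcopy(cfg)
        sub['type'] = dtype
        for key in list_keys:
            assert len(cfg[key]) == len(cfg['type'])
            sub[key] = cfg[key][i]
        per_dataset.append(sub)
    return per_dataset

cfg = {
    'type': ['TransformerPoseDataset', 'TransformerPoseDataset'],
    'ann_file': ['train_a.json', 'train_b.json'],
    'img_prefix': ['images_a/', 'images_b/'],
}
subs = split_concat_cfg(cfg)
assert [s['ann_file'] for s in subs] == ['train_a.json', 'train_b.json']
```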
0000000000000000000000000000000000000000..8d43ce586c0ed7efed0387fda6ccf3a7bbfeb129 Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/custom_test_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/custom_test_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e67a35f10822a0f8ea06d82fcb81cf6bec33d7ce Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/custom_test_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_base_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_base_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4877d87ccbe185bcd8cbf1c7f635bc46c55dbc5e Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_base_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d1c9df69ad6d912446ce6223d32d8257c388d22 Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/test_base_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/test_base_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d1d9119202ff6747398d9023409080d9e63a940 Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/test_base_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/test_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/test_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dac9750321961c677dc87af5d1de840eb9efefef Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/test_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_base_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_base_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f52dcb1165551f763dcb19ff3c84c2993247972c Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_base_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd23318bc903674f986ab7063222b0f373f79e76 Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/custom_test_dataset.py b/EdgeCape/datasets/datasets/mp100/custom_test_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..798b8d98ad501b7dcc793fc01d7a0781ff1bca88 --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/custom_test_dataset.py @@ -0,0 +1,355 @@ +from mmpose.datasets import DATASETS +import random +import numpy as np +import os +from collections import OrderedDict +from xtcocotools.coco import COCO +from .test_base_dataset import TestBaseDataset + +@DATASETS.register_module() +class 
CustomTestPoseDataset(TestBaseDataset): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + valid_class_ids, + max_kpt_num=None, + num_shots=1, + num_queries=100, + num_episodes=1, + pck_threshold_list=[0.05, 0.1, 0.15, 0.20, 0.25], + test_mode=True): + super().__init__( + ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode, PCK_threshold_list=pck_threshold_list) + + self.ann_info['flip_pairs'] = [] + + self.ann_info['upper_body_ids'] = [] + self.ann_info['lower_body_ids'] = [] + + self.ann_info['use_different_joint_weights'] = False + self.ann_info['joint_weights'] = np.array([1.,], + dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) + + self.coco = COCO(ann_file) + + self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) + self.img_ids = self.coco.getImgIds() + + cat = None + relevant_names = [ + '000000052046', + '000000052152' + + # '000000027059', + # '000000030361' + # '000000027936', + # 'Pileated_Woodpecker_0004_180307', 'American_Three_Toed_Woodpecker_0019_179870' + # '000000016379', '000000008869' + # 'commonwarthog_115', + # 'commonwarthog_78' + # '000000027059', '000000030361', '000000027936' + # 'klipspringer_66', '000000008333', '000000026814', '000000047543', '000000052080', 'Common_Tern_0050_148928' + ] + if len(relevant_names) > 0: + if cat is not None: + relevant_names = [os.path.join(cat, name) for name in relevant_names] + self.img_ids = [img_id for img_id in self.img_ids if self.id2name[img_id] in relevant_names] + else: + new_ids = [] + for relevant_name in relevant_names: + new_ids += [img_id for img_id in self.img_ids if relevant_name in self.id2name[img_id]] + self.img_ids = new_ids + else: + self.img_ids = [img_id for img_id in self.img_ids if cat == self.id2name[img_id].split('/')[0]] + + self.classes = [ + cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + + self.num_classes = len(self.classes) + self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds())) + self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes)) + + if valid_class_ids is not None: # None by default + self.valid_class_ids = valid_class_ids + else: + self.valid_class_ids = self.coco.getCatIds() + + self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids] + self.cats = self.coco.cats + self.max_kpt_num = max_kpt_num + + # Also update self.cat2obj + self.db = self._get_db() + + self.num_shots = num_shots + + if not test_mode: + # Update every training epoch + self.random_paired_samples() + else: + self.num_queries = num_queries + self.num_episodes = num_episodes + self.make_paired_samples() + + + def random_paired_samples(self): + num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes] + + # balance the dataset + max_num_data = max(num_datas) + + all_samples = [] + for cls in self.valid_class_ids: + for i in range(max_num_data): + shot = random.sample(self.cat2obj[cls], self.num_shots + 1) + all_samples.append(shot) + + self.paired_samples = np.array(all_samples) + np.random.shuffle(self.paired_samples) + + def make_paired_samples(self): + random.seed(1) + np.random.seed(0) + all_samples = [] + self.num_episodes = 1000 + for cls in self.valid_class_ids: + for _ in range(self.num_episodes): + if self.cat2obj[cls] == []: + continue + self.num_queries = 1 + self.num_shots = 1 + if len(self.cat2obj[cls]) < self.num_shots + self.num_queries: + shots = random.choices(self.cat2obj[cls], k=self.num_shots + self.num_queries) + else: + shots = 
random.sample(self.cat2obj[cls], self.num_shots + self.num_queries) + sample_ids = shots[:self.num_shots] + query_ids = shots[self.num_shots:] + for query_id in query_ids: + all_samples.append(sample_ids + [query_id]) + all_samples.append([query_id] + [query_id]) + + self.paired_samples = np.array(list(set(tuple(x) for x in all_samples))) + + def _select_kpt(self, obj, kpt_id): + obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id+1] + obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id+1] + obj['kpt_id'] = kpt_id + + return obj + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _get_db(self): + """Ground truth bbox and keypoints.""" + self.obj_id = 0 + + self.cat2obj = {} + for i in self.coco.getCatIds(): + self.cat2obj.update({i: []}) + + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + + category_id = obj['category_id'] + # the number of keypoint for this specific category + cat_kpt_num = int(len(obj['keypoints']) / 3) + if self.max_kpt_num is None: + kpt_num = cat_kpt_num + else: + kpt_num = self.max_kpt_num + + joints_3d = np.zeros((kpt_num, 3), dtype=np.float32) + joints_3d_visible = np.zeros((kpt_num, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:cat_kpt_num, :2] = keypoints[:, :2] + joints_3d_visible[:cat_kpt_num, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + + self.cat2obj[category_id].append(self.obj_id) + + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': obj['clean_bbox'][:4], + 'bbox_score': 1, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'category_id': category_id, + 'cat_kpt_num': cat_kpt_num, + 'bbox_id': self.obj_id, + 'skeleton': self.coco.cats[obj['category_id']]['skeleton'], + }) + bbox_id = bbox_id + 1 + self.obj_id += 1 + + return rec + + def _xywh2cs(self, x, y, w, h): + """This encodes bbox(x,y,w,w) into (center, scale) + + Args: + x, y, w, h + + Returns: + 
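Each test episode built above is a list of `num_shots` support ids followed by one query id, drawn per category, with replacement only when a category has too few objects. A compact sketch of that episode construction over a toy `cat2obj` mapping (the real `make_paired_samples` additionally appends query self-pairs and deduplicates):

```python
import random

def make_episodes(cat2obj, num_shots=1, num_queries=1, num_episodes=3, seed=1):
    """Build [support_ids..., query_id] lists per category."""
    random.seed(seed)
    episodes = []
    for cls, obj_ids in cat2obj.items():
        if not obj_ids:
            continue
        for _ in range(num_episodes):
            k = num_shots + num_queries
            # Sample with replacement only when the category is too small.
            shots = (random.choices(obj_ids, k=k) if len(obj_ids) < k
                     else random.sample(obj_ids, k))
            supports, queries = shots[:num_shots], shots[num_shots:]
            for q in queries:
                episodes.append(supports + [q])
    return episodes

eps = make_episodes({'dog': [0, 1, 2], 'bird': [3]})
assert all(len(e) == 2 for e in eps)  # 1-shot support + 1 query
```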
tuple: A tuple containing center and scale. + + - center (np.ndarray[float32](2,)): center of the bbox (x, y). + - scale (np.ndarray[float32](2,)): scale of the bbox w & h. + """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + # + # if (not self.test_mode) and np.random.rand() < 0.3: + # center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * 1.25 + + return center, scale + + def evaluate(self, outputs, res_folder, metric='PCK', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + outputs (list(preds, boxes, image_path, output_heatmap)) + :preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. + :boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + :image_paths (list[str]): For example, ['C', 'a', 'p', 't', + 'u', 'r', 'e', '1', '2', '/', '0', '3', '9', '0', '_', + 'd', 'h', '_', 't', 'o', 'u', 'c', 'h', 'R', 'O', 'M', + '/', 'c', 'a', 'm', '4', '1', '0', '2', '0', '9', '/', + 'i', 'm', 'a', 'g', 'e', '6', '2', '4', '3', '4', '.', + 'j', 'p', 'g'] + :output_heatmap (np.ndarray[N, K, H, W]): model outpus. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
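The bbox-to-(center, scale) conversion used throughout these datasets follows the usual top-down convention: match the aspect ratio of the model input, express the size in units of 200 pixels, and pad by 1.25 for context. A standalone version of the `_xywh2cs` computation (input size is an assumed example value):

```python
import numpy as np

def xywh_to_center_scale(x, y, w, h, input_size=(256, 256), padding=1.25):
    """Convert a COCO-style bbox (x, y, w, h) to (center, scale)."""
    aspect_ratio = input_size[0] / input_size[1]
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
    # Expand the shorter side so the box matches the input aspect ratio.
    if w > aspect_ratio * h:
        h = w / aspect_ratio
    elif w < aspect_ratio * h:
        w = h * aspect_ratio
    # Scale is expressed in units of 200 pixels, then padded for context.
    scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) * padding
    return center, scale

center, scale = xywh_to_center_scale(10, 20, 100, 50)
```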
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE', 'NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + + kpts = [] + for output in outputs: + preds = output['preds'] + boxes = output['boxes'] + image_paths = output['image_paths'] + bbox_ids = output['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + return name_value diff --git a/EdgeCape/datasets/datasets/mp100/fewshot_base_dataset.py b/EdgeCape/datasets/datasets/mp100/fewshot_base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..fc7f067869603605f0f9189e96d59a5e577307cf --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/fewshot_base_dataset.py @@ -0,0 +1,223 @@ +import copy +from abc import ABCMeta, abstractmethod +import json_tricks as json +import numpy as np + +from mmcv.parallel import DataContainer as DC +from mmpose.core.evaluation.top_down_eval import (keypoint_auc, keypoint_epe, + keypoint_pck_accuracy) +from torch.utils.data import Dataset +from mmpose.datasets import DATASETS +from mmpose.datasets.pipelines import Compose + +@DATASETS.register_module() +class FewShotBaseDataset(Dataset, metaclass=ABCMeta): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + test_mode=False): + self.image_info = {} + self.ann_info = {} + + self.annotations_path = ann_file + if not img_prefix.endswith('/'): + img_prefix = img_prefix + '/' + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['flip_pairs'] = None + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.db = [] + self.num_shots = 1 + self.paired_samples = [] + self.pipeline = Compose(self.pipeline) + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def _select_kpt(self, obj, kpt_id): + """Select kpt.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + """Evaluate keypoint results.""" + raise NotImplementedError + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, + res_file, + metrics, + pck_thr=0.2, + pckh_thr=0.7, + auc_nor=30): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'PCK', 'PCKh', 'AUC', 'EPE'. 
+ pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. + """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.paired_samples) + + outputs = [] + gts = [] + masks = [] + threshold_bbox = [] + threshold_head_box = [] + + for pred, pair in zip(preds, self.paired_samples): + item = self.db[pair[-1]] + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + + mask_query = ((np.array(item['joints_3d_visible'])[:, 0]) > 0) + mask_sample = ((np.array(self.db[pair[0]]['joints_3d_visible'])[:, 0]) > 0) + for id_s in pair[:-1]: + mask_sample = np.bitwise_and(mask_sample, ((np.array(self.db[id_s]['joints_3d_visible'])[:, 0]) > 0)) + masks.append(np.bitwise_and(mask_query, mask_sample)) + + if 'PCK' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + if 'PCKh' in metrics: + head_box_thr = item['head_size'] + threshold_head_box.append( + np.array([head_box_thr, head_box_thr])) + + if 'PCK' in metrics: + pck_avg = [] + for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox): + _, pck, _ = keypoint_pck_accuracy(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0), pck_thr, np.expand_dims(thr_bbox,0)) + pck_avg.append(pck) + info_str.append(('PCK', np.mean(pck_avg))) + + return info_str + + def _merge_obj(self, Xs_list, Xq, idx): + """ merge Xs_list and Xq. + + :param Xs_list: N-shot samples X + :param Xq: query X + :param idx: id of paired_samples + :return: Xall + """ + Xall = dict() + Xall['img_s'] = [Xs['img'] for Xs in Xs_list] + Xall['target_s'] = [Xs['target'] for Xs in Xs_list] + Xall['target_weight_s'] = [Xs['target_weight'] for Xs in Xs_list] + xs_img_metas = [Xs['img_metas'].data for Xs in Xs_list] + + Xall['img_q'] = Xq['img'] + Xall['target_q'] = Xq['target'] + Xall['target_weight_q'] = Xq['target_weight'] + xq_img_metas = Xq['img_metas'].data + + img_metas = dict() + for key in xq_img_metas.keys(): + img_metas['sample_' + key] = [xs_img_meta[key] for xs_img_meta in xs_img_metas] + img_metas['query_' + key] = xq_img_metas[key] + img_metas['bbox_id'] = idx + + Xall['img_metas'] = DC(img_metas, cpu_only=True) + + return Xall + + def __len__(self): + """Get the size of the dataset.""" + return len(self.paired_samples) + + def __getitem__(self, idx): + """Get the sample given index.""" + + pair_ids = self.paired_samples[idx] + assert len(pair_ids) == self.num_shots + 1 + sample_id_list = pair_ids[:self.num_shots] + query_id = pair_ids[-1] + + sample_obj_list = [] + for sample_id in sample_id_list: + sample_obj = copy.deepcopy(self.db[sample_id]) + sample_obj['ann_info'] = copy.deepcopy(self.ann_info) + sample_obj_list.append(sample_obj) + + query_obj = copy.deepcopy(self.db[query_id]) + query_obj['ann_info'] = copy.deepcopy(self.ann_info) + + if not self.test_mode: + # randomly select "one" keypoint + sample_valid = (sample_obj_list[0]['joints_3d_visible'][:, 0] > 0) + for sample_obj in sample_obj_list: + sample_valid = sample_valid & (sample_obj['joints_3d_visible'][:, 0] > 0) + query_valid = (query_obj['joints_3d_visible'][:, 0] > 0) + + valid_s = np.where(sample_valid)[0] + valid_q = np.where(query_valid)[0] + valid_sq = np.where(sample_valid & query_valid)[0] + if len(valid_sq) > 0: + kpt_id = 
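The PCK reported above normalizes keypoint error by the longest side of the query bbox, evaluates each query separately, and averages. A self-contained NumPy sketch of that per-sample PCK (illustrating the metric, not the mmpose `keypoint_pck_accuracy` call used in the code):

```python
import numpy as np

def pck_single(pred, gt, mask, bbox_wh, thr=0.2):
    """Fraction of visible keypoints with error < thr * max(bbox side).

    pred, gt: (K, 2) arrays; mask: (K,) bool; bbox_wh: (w, h) of the query bbox.
    """
    norm = max(bbox_wh)          # same normalizer as threshold_bbox above
    if not mask.any():
        return 0.0
    dist = np.linalg.norm(pred - gt, axis=1)
    return float(np.mean(dist[mask] < thr * norm))

pred = np.array([[10.0, 10.0], [50.0, 52.0]])
gt = np.array([[12.0, 11.0], [50.0, 50.0]])
mask = np.array([True, True])
print(pck_single(pred, gt, mask, bbox_wh=(100, 80)))  # -> 1.0
```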
np.random.choice(valid_sq) + elif len(valid_s) > 0: + kpt_id = np.random.choice(valid_s) + elif len(valid_q) > 0: + kpt_id = np.random.choice(valid_q) + else: + kpt_id = np.random.choice(np.array(range(len(query_valid)))) + + for i in range(self.num_shots): + sample_obj_list[i] = self._select_kpt(sample_obj_list[i], kpt_id) + query_obj = self._select_kpt(query_obj, kpt_id) + + # when test, all keypoints will be preserved. + + Xs_list = [] + for sample_obj in sample_obj_list: + Xs = self.pipeline(sample_obj) + Xs_list.append(Xs) + Xq = self.pipeline(query_obj) + + Xall = self._merge_obj(Xs_list, Xq, idx) + Xall['skeleton'] = self.db[query_id]['skeleton'] + + return Xall + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/EdgeCape/datasets/datasets/mp100/fewshot_dataset.py b/EdgeCape/datasets/datasets/mp100/fewshot_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..7bf805f4909d625f6600531c8e07180ea41aa28e --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/fewshot_dataset.py @@ -0,0 +1,312 @@ +from mmpose.datasets import DATASETS +import random +import numpy as np +import os +from collections import OrderedDict +from xtcocotools.coco import COCO +from .fewshot_base_dataset import FewShotBaseDataset + +@DATASETS.register_module() +class FewShotKeypointDataset(FewShotBaseDataset): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + valid_class_ids, + num_shots = 1, + num_queries = 100, + num_episodes = 1, + test_mode=False): + super().__init__( + ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode) + + self.ann_info['flip_pairs'] = [] + + self.ann_info['upper_body_ids'] = [] + self.ann_info['lower_body_ids'] = [] + + self.ann_info['use_different_joint_weights'] = False + self.ann_info['joint_weights'] = np.array([1.,], + dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) + + self.coco = COCO(ann_file) + + self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) + self.img_ids = self.coco.getImgIds() + self.classes = [ + cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + + self.num_classes = len(self.classes) + self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds())) + self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes)) + + if valid_class_ids is not None: + self.valid_class_ids = valid_class_ids + else: + self.valid_class_ids = self.coco.getCatIds() + self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids] + + self.cats = self.coco.cats + + # Also update self.cat2obj + self.db = self._get_db() + + self.num_shots = num_shots + + if not test_mode: + # Update every training epoch + self.random_paired_samples() + else: + self.num_queries = num_queries + self.num_episodes = num_episodes + self.make_paired_samples() + + + def random_paired_samples(self): + num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes] + + # balance the dataset + max_num_data = max(num_datas) + + all_samples = [] + for cls in self.valid_class_ids: + for i in range(max_num_data): + shot = random.sample(self.cat2obj[cls], self.num_shots + 1) + all_samples.append(shot) + + self.paired_samples = np.array(all_samples) + np.random.shuffle(self.paired_samples) + + def make_paired_samples(self): + random.seed(1) + 
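During training, `__getitem__` restricts every support sample and the query to a single keypoint, preferring an id that is visible in both. A sketch of just that selection logic over boolean visibility vectors (function and variable names are illustrative):

```python
import numpy as np

def select_training_kpt(support_visible, query_visible, rng=np.random):
    """Pick one keypoint id, preferring ids visible in support and query.

    support_visible: (num_shots, K) bool; query_visible: (K,) bool.
    """
    sample_valid = support_visible.all(axis=0)
    both = np.where(sample_valid & query_visible)[0]
    if len(both) > 0:
        return int(rng.choice(both))
    for valid in (np.where(sample_valid)[0], np.where(query_visible)[0]):
        if len(valid) > 0:
            return int(rng.choice(valid))
    return int(rng.choice(len(query_visible)))  # nothing visible: any id

kpt = select_training_kpt(
    np.array([[True, False, True]]), np.array([False, True, True]))
assert kpt == 2  # the only id visible in both support and query
```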
np.random.seed(0) + + all_samples = [] + for cls in self.valid_class_ids: + for _ in range(self.num_episodes): + shots = random.sample(self.cat2obj[cls], self.num_shots + self.num_queries) + sample_ids = shots[:self.num_shots] + query_ids = shots[self.num_shots:] + for query_id in query_ids: + all_samples.append(sample_ids + [query_id]) + + self.paired_samples = np.array(all_samples) + + def _select_kpt(self, obj, kpt_id): + obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id+1] + obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id+1] + obj['kpt_id'] = kpt_id + + return obj + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _get_db(self): + """Ground truth bbox and keypoints.""" + self.obj_id = 0 + + self.cat2obj = {} + for i in self.coco.getCatIds(): + self.cat2obj.update({i: []}) + + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + + category_id = obj['category_id'] + # the number of keypoint for this specific category + cat_kpt_num = int(len(obj['keypoints']) / 3) + + joints_3d = np.zeros((cat_kpt_num, 3), dtype=np.float32) + joints_3d_visible = np.zeros((cat_kpt_num, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + + self.cat2obj[category_id].append(self.obj_id) + + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': obj['clean_bbox'][:4], + 'bbox_score': 1, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'category_id': category_id, + 'cat_kpt_num': cat_kpt_num, + 'bbox_id': self.obj_id, + 'skeleton': self.coco.cats[obj['category_id']]['skeleton'], + }) + bbox_id = bbox_id + 1 + self.obj_id += 1 + + return rec + + def _xywh2cs(self, x, y, w, h): + """This encodes bbox(x,y,w,w) into (center, scale) + + Args: + x, y, w, h + + Returns: + tuple: A tuple containing center and scale. 
+ + - center (np.ndarray[float32](2,)): center of the bbox (x, y). + - scale (np.ndarray[float32](2,)): scale of the bbox w & h. + """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + # + # if (not self.test_mode) and np.random.rand() < 0.3: + # center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * 1.25 + + return center, scale + + def evaluate(self, outputs, res_folder, metric='PCK', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + outputs (list(preds, boxes, image_path, output_heatmap)) + :preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. + :boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + :image_paths (list[str]): For example, ['C', 'a', 'p', 't', + 'u', 'r', 'e', '1', '2', '/', '0', '3', '9', '0', '_', + 'd', 'h', '_', 't', 'o', 'u', 'c', 'h', 'R', 'O', 'M', + '/', 'c', 'a', 'm', '4', '1', '0', '2', '0', '9', '/', + 'i', 'm', 'a', 'g', 'e', '6', '2', '4', '3', '4', '.', + 'j', 'p', 'g'] + :output_heatmap (np.ndarray[N, K, H, W]): model outpus. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + + kpts = [] + for output in outputs: + preds = output['preds'] + boxes = output['boxes'] + image_paths = output['image_paths'] + bbox_ids = output['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + return name_value diff --git a/EdgeCape/datasets/datasets/mp100/test_base_dataset.py b/EdgeCape/datasets/datasets/mp100/test_base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3b4f4214264bba072e6b5bddb53ed792cda10af1 --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/test_base_dataset.py @@ -0,0 +1,226 @@ +import copy +from abc import ABCMeta, abstractmethod +import json_tricks as json +import numpy as np + +from mmcv.parallel import DataContainer as DC +from mmpose.core.evaluation.top_down_eval import (keypoint_auc, keypoint_epe, keypoint_nme, + keypoint_pck_accuracy) +from torch.utils.data import Dataset +from mmpose.datasets import DATASETS +from mmpose.datasets.pipelines import Compose + +@DATASETS.register_module() +class TestBaseDataset(Dataset, metaclass=ABCMeta): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + test_mode=True, + PCK_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25]): + self.image_info = {} + self.ann_info = {} + + self.annotations_path = ann_file + if not img_prefix.endswith('/'): + img_prefix = img_prefix + '/' + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + self.PCK_threshold_list = PCK_threshold_list + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['flip_pairs'] = None + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.db = [] + self.num_shots = 1 + self.paired_samples = [] + self.pipeline = Compose(self.pipeline) + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def _select_kpt(self, obj, kpt_id): + """Select kpt.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + """Evaluate keypoint results.""" + raise NotImplementedError + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, + res_file, + metrics): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. 
+ Options: 'PCK', 'PCKh', 'AUC', 'EPE'. + pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. + """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.paired_samples) + + outputs = [] + gts = [] + masks = [] + threshold_bbox = [] + threshold_head_box = [] + + for pred, pair in zip(preds, self.paired_samples): + item = self.db[pair[-1]] + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + + mask_query = ((np.array(item['joints_3d_visible'])[:, 0]) > 0) + mask_sample = ((np.array(self.db[pair[0]]['joints_3d_visible'])[:, 0]) > 0) + for id_s in pair[:-1]: + mask_sample = np.bitwise_and(mask_sample, ((np.array(self.db[id_s]['joints_3d_visible'])[:, 0]) > 0)) + masks.append(np.bitwise_and(mask_query, mask_sample)) + + if 'PCK' in metrics or 'NME' in metrics or 'AUC' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + if 'PCKh' in metrics: + head_box_thr = item['head_size'] + threshold_head_box.append( + np.array([head_box_thr, head_box_thr])) + + if 'PCK' in metrics: + pck_results = dict() + for pck_thr in self.PCK_threshold_list: + pck_results[pck_thr] = [] + + for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox): + for pck_thr in self.PCK_threshold_list: + _, pck, _ = keypoint_pck_accuracy(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0), pck_thr, np.expand_dims(thr_bbox,0)) + pck_results[pck_thr].append(pck) + + mPCK = 0 + for pck_thr in self.PCK_threshold_list: + info_str.append(['PCK@' + str(pck_thr), np.mean(pck_results[pck_thr])]) + mPCK += np.mean(pck_results[pck_thr]) + info_str.append(['mPCK', mPCK / len(self.PCK_threshold_list)]) + + if 'NME' in metrics: + nme_results = [] + for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox): + nme = keypoint_nme(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0), np.expand_dims(thr_bbox,0)) + nme_results.append(nme) + info_str.append(['NME', np.mean(nme_results)]) + + if 'AUC' in metrics: + auc_results = [] + for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox): + auc = keypoint_auc(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0), thr_bbox[0]) + auc_results.append(auc) + info_str.append(['AUC', np.mean(auc_results)]) + + if 'EPE' in metrics: + epe_results = [] + for (output, gt, mask) in zip(outputs, gts, masks): + epe = keypoint_epe(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0)) + epe_results.append(epe) + info_str.append(['EPE', np.mean(epe_results)]) + return info_str + + def _merge_obj(self, Xs_list, Xq, idx): + """ merge Xs_list and Xq. 
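At test time, PCK is reported at every threshold in `PCK_threshold_list` and their mean is logged as mPCK. A small sketch of that aggregation, given per-query PCK values already computed for each threshold:

```python
import numpy as np

def summarize_pck(per_sample_pck, thresholds=(0.05, 0.1, 0.15, 0.2, 0.25)):
    """per_sample_pck: dict mapping threshold -> list of per-query PCK values."""
    info = []
    mpck = 0.0
    for thr in thresholds:
        mean_pck = float(np.mean(per_sample_pck[thr]))
        info.append(('PCK@' + str(thr), mean_pck))
        mpck += mean_pck
    info.append(('mPCK', mpck / len(thresholds)))
    return info

scores = {thr: [0.8, 1.0, 0.9] for thr in (0.05, 0.1, 0.15, 0.2, 0.25)}
print(summarize_pck(scores))
```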
+ + :param Xs_list: N-shot samples X + :param Xq: query X + :param idx: id of paired_samples + :return: Xall + """ + Xall = dict() + Xall['img_s'] = [Xs['img'] for Xs in Xs_list] + Xall['target_s'] = [Xs['target'] for Xs in Xs_list] + Xall['target_weight_s'] = [Xs['target_weight'] for Xs in Xs_list] + xs_img_metas = [Xs['img_metas'].data for Xs in Xs_list] + + Xall['img_q'] = Xq['img'] + Xall['target_q'] = Xq['target'] + Xall['target_weight_q'] = Xq['target_weight'] + xq_img_metas = Xq['img_metas'].data + + img_metas = dict() + for key in xq_img_metas.keys(): + img_metas['sample_' + key] = [xs_img_meta[key] for xs_img_meta in xs_img_metas] + img_metas['query_' + key] = xq_img_metas[key] + img_metas['bbox_id'] = idx + + Xall['img_metas'] = DC(img_metas, cpu_only=True) + + return Xall + + def __len__(self): + """Get the size of the dataset.""" + return len(self.paired_samples) + + def __getitem__(self, idx): + """Get the sample given index.""" + + pair_ids = self.paired_samples[idx] # [supported id * shots, query id] + assert len(pair_ids) == self.num_shots + 1 + sample_id_list = pair_ids[:self.num_shots] + query_id = pair_ids[-1] + + sample_obj_list = [] + for sample_id in sample_id_list: + sample_obj = copy.deepcopy(self.db[sample_id]) + sample_obj['ann_info'] = copy.deepcopy(self.ann_info) + sample_obj_list.append(sample_obj) + + query_obj = copy.deepcopy(self.db[query_id]) + query_obj['ann_info'] = copy.deepcopy(self.ann_info) + + Xs_list = [] + for sample_obj in sample_obj_list: + Xs = self.pipeline(sample_obj) # dict with ['img', 'target', 'target_weight', 'img_metas'], + Xs_list.append(Xs) # Xs['target'] is of shape [100, map_h, map_w] + Xq = self.pipeline(query_obj) + + Xall = self._merge_obj(Xs_list, Xq, idx) + Xall['skeleton'] = self.db[query_id]['skeleton'] + + return Xall + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/EdgeCape/datasets/datasets/mp100/test_dataset.py b/EdgeCape/datasets/datasets/mp100/test_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..aa264d2e1edddcce66f13f892e2a303e552ed8cc --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/test_dataset.py @@ -0,0 +1,319 @@ +from mmpose.datasets import DATASETS +import random +import numpy as np +import os +from collections import OrderedDict +from xtcocotools.coco import COCO +from .test_base_dataset import TestBaseDataset + +@DATASETS.register_module() +class TestPoseDataset(TestBaseDataset): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + valid_class_ids, + max_kpt_num=None, + num_shots=1, + num_queries=100, + num_episodes=1, + pck_threshold_list=[0.05, 0.1, 0.15, 0.20, 0.25], + test_mode=True): + super().__init__( + ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode, PCK_threshold_list=pck_threshold_list) + + self.ann_info['flip_pairs'] = [] + + self.ann_info['upper_body_ids'] = [] + self.ann_info['lower_body_ids'] = [] + + self.ann_info['use_different_joint_weights'] = False + self.ann_info['joint_weights'] = np.array([1.,], + dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) + + self.coco = COCO(ann_file) + + self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) + self.img_ids = self.coco.getImgIds() + self.classes = [ + cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + 
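`_sort_and_unique_bboxes` sorts the collected predictions by `bbox_id` and drops the duplicates introduced by dataloader padding. An equivalent sketch that builds the deduplicated list instead of deleting in place:

```python
def sort_and_unique(kpts, key='bbox_id'):
    """Sort predictions by `key` and keep the first entry per id."""
    kpts = sorted(kpts, key=lambda x: x[key])
    deduped = []
    for item in kpts:
        if not deduped or deduped[-1][key] != item[key]:
            deduped.append(item)
    return deduped

preds = [{'bbox_id': 2}, {'bbox_id': 1}, {'bbox_id': 2}]
assert [p['bbox_id'] for p in sort_and_unique(preds)] == [1, 2]
```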
+ self.num_classes = len(self.classes) + self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds())) + self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes)) + + if valid_class_ids is not None: # None by default + self.valid_class_ids = valid_class_ids + else: + self.valid_class_ids = self.coco.getCatIds() + self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids] + + self.cats = self.coco.cats + self.max_kpt_num = max_kpt_num + + # Also update self.cat2obj + self.db = self._get_db() + + self.num_shots = num_shots + + if not test_mode: + # Update every training epoch + self.random_paired_samples() + else: + self.num_queries = num_queries + self.num_episodes = num_episodes + self.make_paired_samples() + + + def random_paired_samples(self): + num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes] + + # balance the dataset + max_num_data = max(num_datas) + + all_samples = [] + for cls in self.valid_class_ids: + for i in range(max_num_data): + shot = random.sample(self.cat2obj[cls], self.num_shots + 1) + all_samples.append(shot) + + self.paired_samples = np.array(all_samples) + np.random.shuffle(self.paired_samples) + + def make_paired_samples(self): + random.seed(1) + np.random.seed(0) + + all_samples = [] + for cls in self.valid_class_ids: + for _ in range(self.num_episodes): + shots = random.sample(self.cat2obj[cls], self.num_shots + self.num_queries) + sample_ids = shots[:self.num_shots] + query_ids = shots[self.num_shots:] + for query_id in query_ids: + all_samples.append(sample_ids + [query_id]) + + self.paired_samples = np.array(all_samples) + + def _select_kpt(self, obj, kpt_id): + obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id+1] + obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id+1] + obj['kpt_id'] = kpt_id + + return obj + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _get_db(self): + """Ground truth bbox and keypoints.""" + self.obj_id = 0 + + self.cat2obj = {} + for i in self.coco.getCatIds(): + self.cat2obj.update({i: []}) + + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
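As a quick illustration of the episode pairing built by make_paired_samples above (each query in an episode shares the same support ids), the following standalone sketch uses an invented cat2obj mapping and small counts; the ids and sizes are purely illustrative.

import random
import numpy as np

# Hypothetical category -> db-index mapping; real values come from _get_db / cat2obj above.
cat2obj = {1: list(range(10)), 2: list(range(10, 25))}
num_shots, num_queries, num_episodes = 1, 3, 2

random.seed(1)
all_samples = []
for cls, obj_ids in cat2obj.items():
    for _ in range(num_episodes):
        shots = random.sample(obj_ids, num_shots + num_queries)
        sample_ids, query_ids = shots[:num_shots], shots[num_shots:]
        # every query in the episode is paired with the same support set
        for q in query_ids:
            all_samples.append(sample_ids + [q])

paired_samples = np.array(all_samples)
# each row is [support_id_1, ..., support_id_num_shots, query_id]
print(paired_samples.shape)  # (num_classes * num_episodes * num_queries, num_shots + 1) -> (12, 2)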
+ + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + + category_id = obj['category_id'] + # the number of keypoint for this specific category + cat_kpt_num = int(len(obj['keypoints']) / 3) + if self.max_kpt_num is None: + kpt_num = cat_kpt_num + else: + kpt_num = self.max_kpt_num + + joints_3d = np.zeros((kpt_num, 3), dtype=np.float32) + joints_3d_visible = np.zeros((kpt_num, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:cat_kpt_num, :2] = keypoints[:, :2] + joints_3d_visible[:cat_kpt_num, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + + self.cat2obj[category_id].append(self.obj_id) + + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': obj['clean_bbox'][:4], + 'bbox_score': 1, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'category_id': category_id, + 'cat_kpt_num': cat_kpt_num, + 'bbox_id': self.obj_id, + 'skeleton': self.coco.cats[obj['category_id']]['skeleton'], + }) + bbox_id = bbox_id + 1 + self.obj_id += 1 + + return rec + + def _xywh2cs(self, x, y, w, h): + """This encodes bbox(x,y,w,w) into (center, scale) + + Args: + x, y, w, h + + Returns: + tuple: A tuple containing center and scale. + + - center (np.ndarray[float32](2,)): center of the bbox (x, y). + - scale (np.ndarray[float32](2,)): scale of the bbox w & h. + """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + # + # if (not self.test_mode) and np.random.rand() < 0.3: + # center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * 1.25 + + return center, scale + + def evaluate(self, outputs, res_folder, metric='PCK', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + outputs (list(preds, boxes, image_path, output_heatmap)) + :preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. 
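For reference, the bbox -> (center, scale) conversion performed by _xywh2cs above can be traced with a small standalone function; the aspect ratio of 1.0 (e.g. a 256x256 input), pixel_std of 200 and 1.25 padding mirror the code, while the bbox values are illustrative.

import numpy as np

def xywh2cs_sketch(x, y, w, h, aspect_ratio=1.0, padding=1.25, pixel_std=200.0):
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
    # pad the shorter side so the box matches the model input aspect ratio
    if w > aspect_ratio * h:
        h = w / aspect_ratio
    elif w < aspect_ratio * h:
        w = h * aspect_ratio
    scale = np.array([w / pixel_std, h / pixel_std], dtype=np.float32) * padding
    return center, scale

center, scale = xywh2cs_sketch(10, 20, 100, 50)
print(center)  # [60. 45.]
print(scale)   # the 100x50 box becomes 100x100, then / 200 and * 1.25 -> [0.625 0.625]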
+ :boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + :image_paths (list[str]): For example, ['C', 'a', 'p', 't', + 'u', 'r', 'e', '1', '2', '/', '0', '3', '9', '0', '_', + 'd', 'h', '_', 't', 'o', 'u', 'c', 'h', 'R', 'O', 'M', + '/', 'c', 'a', 'm', '4', '1', '0', '2', '0', '9', '/', + 'i', 'm', 'a', 'g', 'e', '6', '2', '4', '3', '4', '.', + 'j', 'p', 'g'] + :output_heatmap (np.ndarray[N, K, H, W]): model outpus. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE', 'NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + + kpts = [] + for output in outputs: + preds = output['preds'] + boxes = output['boxes'] + image_paths = output['image_paths'] + bbox_ids = output['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + return name_value diff --git a/EdgeCape/datasets/datasets/mp100/transformer_base_dataset.py b/EdgeCape/datasets/datasets/mp100/transformer_base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..bd6b343e56c6096e6fa35c960f2596811cc2ac32 --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/transformer_base_dataset.py @@ -0,0 +1,209 @@ +import copy +from abc import ABCMeta, abstractmethod +import json_tricks as json +import numpy as np + +from mmcv.parallel import DataContainer as DC +from mmpose.core.evaluation.top_down_eval import (keypoint_auc, keypoint_epe, + keypoint_pck_accuracy) +from torch.utils.data import Dataset +from mmpose.datasets import DATASETS +from mmpose.datasets.pipelines import Compose + +@DATASETS.register_module() +class TransformerBaseDataset(Dataset, metaclass=ABCMeta): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + masking_ratio=0.3, + test_mode=False): + self.image_info = {} + self.ann_info = {} + + self.annotations_path = ann_file + if not img_prefix.endswith('/'): + img_prefix = img_prefix + '/' + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + self.masking_ratio = masking_ratio + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['flip_pairs'] = None + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.db = [] + self.num_shots = 1 + self.paired_samples = [] + self.pipeline = Compose(self.pipeline) + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def 
_select_kpt(self, obj, kpt_id): + """Select kpt.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + """Evaluate keypoint results.""" + raise NotImplementedError + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, + res_file, + metrics, + pck_thr=0.2, + pckh_thr=0.7, + auc_nor=30): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'PCK', 'PCKh', 'AUC', 'EPE'. + pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. + """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.paired_samples) + + outputs = [] + gts = [] + masks = [] + threshold_bbox = [] + threshold_head_box = [] + + for pred, pair in zip(preds, self.paired_samples): + item = self.db[pair[-1]] + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + + mask_query = ((np.array(item['joints_3d_visible'])[:, 0]) > 0) + mask_sample = ((np.array(self.db[pair[0]]['joints_3d_visible'])[:, 0]) > 0) + for id_s in pair[:-1]: + mask_sample = np.bitwise_and(mask_sample, ((np.array(self.db[id_s]['joints_3d_visible'])[:, 0]) > 0)) + masks.append(np.bitwise_and(mask_query, mask_sample)) + + if 'PCK' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + if 'PCKh' in metrics: + head_box_thr = item['head_size'] + threshold_head_box.append( + np.array([head_box_thr, head_box_thr])) + + if 'PCK' in metrics: + pck_avg = [] + for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox): + _, pck, _ = keypoint_pck_accuracy(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0), pck_thr, np.expand_dims(thr_bbox,0)) + pck_avg.append(pck) + info_str.append(('PCK', np.mean(pck_avg))) + + return info_str + + def _merge_obj(self, Xs_list, Xq, idx): + """ merge Xs_list and Xq. 
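The PCK computed in _report_metric above normalizes keypoint errors by the longest bbox side; a simplified plain-numpy sketch of that criterion is shown below (the real code delegates to mmpose's keypoint_pck_accuracy, and the coordinates here are invented).

import numpy as np

def pck_sketch(pred, gt, visible_mask, bbox_wh, pck_thr=0.2):
    norm = pck_thr * max(bbox_wh)              # pixel threshold derived from the bbox
    dist = np.linalg.norm(pred - gt, axis=-1)  # (K,) per-keypoint error
    valid = visible_mask.astype(bool)
    return float((dist[valid] < norm).mean())

pred = np.array([[10., 10.], [52., 40.], [90., 95.]])
gt = np.array([[12., 11.], [50., 42.], [60., 60.]])
mask = np.array([1, 1, 1])
print(pck_sketch(pred, gt, mask, bbox_wh=(100, 80)))  # 2 of 3 keypoints within 20 px -> 0.666...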
+ + :param Xs_list: N-shot samples X + :param Xq: query X + :param idx: id of paired_samples + :return: Xall + """ + Xall = dict() + Xall['img_s'] = [Xs['img'] for Xs in Xs_list] + Xall['target_s'] = [Xs['target'] for Xs in Xs_list] + Xall['target_weight_s'] = [Xs['target_weight'] for Xs in Xs_list] + xs_img_metas = [Xs['img_metas'].data for Xs in Xs_list] + + Xall['img_q'] = Xq['img'] + Xall['target_q'] = Xq['target'] + Xall['target_weight_q'] = Xq['target_weight'] + xq_img_metas = Xq['img_metas'].data + + img_metas = dict() + for key in xq_img_metas.keys(): + img_metas['sample_' + key] = [xs_img_meta[key] for xs_img_meta in xs_img_metas] + img_metas['query_' + key] = xq_img_metas[key] + img_metas['bbox_id'] = idx + + Xall['img_metas'] = DC(img_metas, cpu_only=True) + + return Xall + + def __len__(self): + """Get the size of the dataset.""" + return len(self.paired_samples) + + def __getitem__(self, idx): + """Get the sample given index.""" + + pair_ids = self.paired_samples[idx] # [supported id * shots, query id] + assert len(pair_ids) == self.num_shots + 1 + sample_id_list = pair_ids[:self.num_shots] + query_id = pair_ids[-1] + + sample_obj_list = [] + for sample_id in sample_id_list: + sample_obj = copy.deepcopy(self.db[sample_id]) + sample_obj['ann_info'] = copy.deepcopy(self.ann_info) + sample_obj_list.append(sample_obj) + + query_obj = copy.deepcopy(self.db[query_id]) + query_obj['ann_info'] = copy.deepcopy(self.ann_info) + + Xs_list = [] + for sample_obj in sample_obj_list: + Xs = self.pipeline(sample_obj) # dict with ['img', 'target', 'target_weight', 'img_metas'], + Xs_list.append(Xs) # Xs['target'] is of shape [100, map_h, map_w] + Xq = self.pipeline(query_obj) + + Xall = self._merge_obj(Xs_list, Xq, idx) + Xall['skeleton'] = self.db[query_id]['skeleton'] + Xall['rand_mask'] = self.rand_mask(Xall['target_weight_s']) + return Xall + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts + + def rand_mask(self, target_weight_s): + mask_s = target_weight_s[0] + for target_weight in target_weight_s: + mask_s = mask_s * target_weight + num_to_mask = int(np.sum(mask_s) * self.masking_ratio) + true_indices = np.where(mask_s == 1)[0] + rand_mask = np.random.permutation(true_indices)[:num_to_mask] + mask_s[rand_mask] = 0 + return mask_s diff --git a/EdgeCape/datasets/datasets/mp100/transformer_dataset.py b/EdgeCape/datasets/datasets/mp100/transformer_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4bb584a7cf914e1f0dc2b37e74b3fe546c5afd04 --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/transformer_dataset.py @@ -0,0 +1,319 @@ +from mmpose.datasets import DATASETS +import random +import numpy as np +import os +from collections import OrderedDict +from xtcocotools.coco import COCO +from .transformer_base_dataset import TransformerBaseDataset + +@DATASETS.register_module() +class TransformerPoseDataset(TransformerBaseDataset): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + valid_class_ids, + max_kpt_num=None, + num_shots=1, + num_queries=100, + num_episodes=1, + test_mode=False): + super().__init__( + ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode) + + self.ann_info['flip_pairs'] = [] + + self.ann_info['upper_body_ids'] = [] + self.ann_info['lower_body_ids'] = [] + + 
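The rand_mask method above keeps only keypoints visible in every support and then zeroes a masking_ratio fraction of them; a minimal sketch with made-up target weights (shape [num_keypoints, 1], as produced by the target generator) is given here.

import numpy as np

masking_ratio = 0.3
tw1 = np.ones((10, 1), dtype=np.float32); tw1[[3, 8]] = 0   # invisible in support 1
tw2 = np.ones((10, 1), dtype=np.float32); tw2[5] = 0        # invisible in support 2
target_weight_s = [tw1, tw2]

mask_s = target_weight_s[0].copy()
for tw in target_weight_s[1:]:
    mask_s = mask_s * tw                       # keep only jointly visible keypoints (7 here)
num_to_mask = int(mask_s.sum() * masking_ratio)             # int(7 * 0.3) -> 2
visible = np.where(mask_s == 1)[0]
mask_s[np.random.permutation(visible)[:num_to_mask]] = 0
print(mask_s.ravel())  # two of the seven jointly visible keypoints are additionally zeroed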
self.ann_info['use_different_joint_weights'] = False + self.ann_info['joint_weights'] = np.array([1.,], + dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) + + self.coco = COCO(ann_file) + + self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) + self.img_ids = self.coco.getImgIds() + self.classes = [ + cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + + self.num_classes = len(self.classes) + self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds())) + self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes)) + + if valid_class_ids is not None: # None by default + self.valid_class_ids = valid_class_ids + else: + self.valid_class_ids = self.coco.getCatIds() + self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids] + + self.cats = self.coco.cats + self.max_kpt_num = max_kpt_num + + # Also update self.cat2obj + self.db = self._get_db() + + self.num_shots = num_shots + + if not test_mode: + # Update every training epoch + self.random_paired_samples() + else: + self.num_queries = num_queries + self.num_episodes = num_episodes + self.make_paired_samples() + + + def random_paired_samples(self): + num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes] + + # balance the dataset + max_num_data = max(num_datas) + + all_samples = [] + for cls in self.valid_class_ids: + for i in range(max_num_data): + shot = random.sample(self.cat2obj[cls], self.num_shots + 1) + all_samples.append(shot) + + self.paired_samples = np.array(all_samples) + np.random.shuffle(self.paired_samples) + + def make_paired_samples(self): + random.seed(1) + np.random.seed(0) + + all_samples = [] + for cls in self.valid_class_ids: + for _ in range(self.num_episodes): + shots = random.sample(self.cat2obj[cls], self.num_shots + self.num_queries) + sample_ids = shots[:self.num_shots] + query_ids = shots[self.num_shots:] + for query_id in query_ids: + all_samples.append(sample_ids + [query_id]) + + self.paired_samples = np.array(all_samples) + + def _select_kpt(self, obj, kpt_id): + obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id+1] + obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id+1] + obj['kpt_id'] = kpt_id + + return obj + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _get_db(self): + """Ground truth bbox and keypoints.""" + self.obj_id = 0 + + self.cat2obj = {} + for i in self.coco.getCatIds(): + self.cat2obj.update({i: []}) + + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
+ + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + + category_id = obj['category_id'] + # the number of keypoint for this specific category + cat_kpt_num = int(len(obj['keypoints']) / 3) + if self.max_kpt_num is None: + kpt_num = cat_kpt_num + else: + kpt_num = self.max_kpt_num + + joints_3d = np.zeros((kpt_num, 3), dtype=np.float32) + joints_3d_visible = np.zeros((kpt_num, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:cat_kpt_num, :2] = keypoints[:, :2] + joints_3d_visible[:cat_kpt_num, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + if os.path.exists(image_file): + self.cat2obj[category_id].append(self.obj_id) + + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': obj['clean_bbox'][:4], + 'bbox_score': 1, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'category_id': category_id, + 'cat_kpt_num': cat_kpt_num, + 'bbox_id': self.obj_id, + 'skeleton': self.coco.cats[obj['category_id']]['skeleton'], + }) + bbox_id = bbox_id + 1 + self.obj_id += 1 + + return rec + + def _xywh2cs(self, x, y, w, h): + """This encodes bbox(x,y,w,w) into (center, scale) + + Args: + x, y, w, h + + Returns: + tuple: A tuple containing center and scale. + + - center (np.ndarray[float32](2,)): center of the bbox (x, y). + - scale (np.ndarray[float32](2,)): scale of the bbox w & h. + """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + # + # if (not self.test_mode) and np.random.rand() < 0.3: + # center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * 1.25 + + return center, scale + + def evaluate(self, outputs, res_folder, metric='PCK', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + outputs (list(preds, boxes, image_path, output_heatmap)) + :preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. 
+ :boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + :image_paths (list[str]): For example, ['C', 'a', 'p', 't', + 'u', 'r', 'e', '1', '2', '/', '0', '3', '9', '0', '_', + 'd', 'h', '_', 't', 'o', 'u', 'c', 'h', 'R', 'O', 'M', + '/', 'c', 'a', 'm', '4', '1', '0', '2', '0', '9', '/', + 'i', 'm', 'a', 'g', 'e', '6', '2', '4', '3', '4', '.', + 'j', 'p', 'g'] + :output_heatmap (np.ndarray[N, K, H, W]): model outpus. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE', 'NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + + kpts = [] + for output in outputs: + preds = output['preds'] + boxes = output['boxes'] + image_paths = output['image_paths'] + bbox_ids = output['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + return name_value diff --git a/EdgeCape/datasets/pipelines/__init__.py b/EdgeCape/datasets/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9f4429140785a3e72c0ac6f63dd99a041b9bd92d --- /dev/null +++ b/EdgeCape/datasets/pipelines/__init__.py @@ -0,0 +1,8 @@ +from .top_down_transform import (TopDownAffineFewShot, + TopDownGenerateTargetFewShot, + LoadDepthFromFile, + DepthTopDownAffineFewShot) + +__all__ = [ + 'TopDownGenerateTargetFewShot', 'TopDownAffineFewShot', 'LoadDepthFromFile', 'DepthTopDownAffineFewShot', +] diff --git a/EdgeCape/datasets/pipelines/__pycache__/__init__.cpython-39.pyc b/EdgeCape/datasets/pipelines/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..948825ac9faf17123a86e341cc2be39dd6e4de48 Binary files /dev/null and b/EdgeCape/datasets/pipelines/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/datasets/pipelines/__pycache__/post_transforms.cpython-39.pyc b/EdgeCape/datasets/pipelines/__pycache__/post_transforms.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ce6763358a558d6439cb3ba57fc73a35334a7f6 Binary files /dev/null and b/EdgeCape/datasets/pipelines/__pycache__/post_transforms.cpython-39.pyc differ diff --git a/EdgeCape/datasets/pipelines/__pycache__/top_down_transform.cpython-39.pyc b/EdgeCape/datasets/pipelines/__pycache__/top_down_transform.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d08eaf3032d9c87f0c870510e26b4c0e93788fd Binary files /dev/null and b/EdgeCape/datasets/pipelines/__pycache__/top_down_transform.cpython-39.pyc differ diff --git a/EdgeCape/datasets/pipelines/post_transforms.py b/EdgeCape/datasets/pipelines/post_transforms.py new file mode 100644 index 
0000000000000000000000000000000000000000..a1025daf5cc87ca3d9a3a204a8df05ca8af725fd --- /dev/null +++ b/EdgeCape/datasets/pipelines/post_transforms.py @@ -0,0 +1,121 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import cv2 +import numpy as np + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. + + Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ]): Size of the destination heatmaps. + shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + + # pixel_std is 200. + scale_tmp = scale * 200.0 + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, trans_mat): + """Apply an affine transformation to the points. + + Args: + pt (np.ndarray): a 2 dimensional point to be transformed + trans_mat (np.ndarray): 2x3 matrix of an affine transform + + Returns: + np.ndarray: Transformed points. + """ + assert len(pt) == 2 + new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.]) + + return new_pt + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + + Returns: + np.ndarray: The 3rd point. + """ + assert len(a) == 2 + assert len(b) == 2 + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + + Returns: + list[float]: Rotated point. 
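As a usage sketch for the helpers above, the snippet below builds the crop transform for a bbox-derived center/scale (pixel_std = 200, as in get_affine_transform) and maps one keypoint into a 256x256 network input; the center/scale values themselves are illustrative.

import numpy as np

center = np.array([60., 45.], dtype=np.float32)
scale = np.array([0.625, 0.625], dtype=np.float32)   # i.e. a 125x125 source region
output_size = np.array([256, 256])

trans = get_affine_transform(center, scale, rot=0., output_size=output_size)
kpt_in_image = np.array([60., 45.])
kpt_in_input = affine_transform(kpt_in_image, trans)
print(kpt_in_input)  # the bbox center lands at the crop center: [128. 128.]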
+ """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt diff --git a/EdgeCape/datasets/pipelines/top_down_transform.py b/EdgeCape/datasets/pipelines/top_down_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..147e4160b39a94d568b25187ab90d4be8e7d743c --- /dev/null +++ b/EdgeCape/datasets/pipelines/top_down_transform.py @@ -0,0 +1,716 @@ +import os +import warnings +from pathlib import Path +from typing import Optional + +import cv2 +import mmcv +import numpy as np +from mmcv import fileio + +from mmpose.datasets.builder import PIPELINES +from .post_transforms import (affine_transform, + get_affine_transform) +from mmpose.core.post_processing import (affine_transform, fliplr_joints, + get_affine_transform, get_warp_matrix, + warp_affine_joints) + +@PIPELINES.register_module() +class TopDownAffineFewShot: + """Affine transform the image to make input. + + Required keys:'img', 'joints_3d', 'joints_3d_visible', 'ann_info','scale', + 'rotation' and 'center'. Modified keys:'img', 'joints_3d', and + 'joints_3d_visible'. + + Args: + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, use_udp=False): + self.use_udp = use_udp + + def __call__(self, results): + image_size = results['ann_info']['image_size'] + + img = results['img'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + c = results['center'] + s = results['scale'] + r = results['rotation'] + + if self.use_udp: + trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + joints_3d[:, 0:2] = \ + warp_affine_joints(joints_3d[:, 0:2].copy(), trans) + else: + trans = get_affine_transform(c, s, r, image_size) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + for i in range(len(joints_3d)): + if joints_3d_visible[i, 0] > 0.0: + joints_3d[i, 0:2] = affine_transform(joints_3d[i, 0:2], trans) + + results['img'] = img + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + + return results + + +@PIPELINES.register_module() +class TopDownGenerateTargetFewShot: + """Generate the target heatmap. + + Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info'. + Modified keys: 'target', and 'target_weight'. + + Args: + sigma: Sigma of heatmap gaussian for 'MSRA' approach. + kernel: Kernel of heatmap gaussian for 'Megvii' approach. + encoding (str): Approach to generate target heatmaps. + Currently supported approaches: 'MSRA', 'Megvii', 'UDP'. + Default:'MSRA' + + unbiased_encoding (bool): Option to use unbiased + encoding methods. + Paper ref: Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + keypoint_pose_distance: Keypoint pose distance for UDP. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + target_type (str): supported targets: 'GaussianHeatMap', + 'CombinedTarget'. Default:'GaussianHeatMap' + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. 
The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, + sigma=2, + kernel=(11, 11), + valid_radius_factor=0.0546875, + target_type='GaussianHeatMap', + encoding='MSRA', + unbiased_encoding=False): + self.sigma = sigma + self.unbiased_encoding = unbiased_encoding + self.kernel = kernel + self.valid_radius_factor = valid_radius_factor + self.target_type = target_type + self.encoding = encoding + + def _msra_generate_target(self, cfg, joints_3d, joints_3d_visible, sigma): + """Generate the target heatmap via "MSRA" approach. + + Args: + cfg (dict): data config + joints_3d: np.ndarray ([num_joints, 3]) + joints_3d_visible: np.ndarray ([num_joints, 3]) + sigma: Sigma of heatmap gaussian + Returns: + tuple: A tuple containing targets. + + - target: Target heatmaps. + - target_weight: (1: visible, 0: invisible) + """ + num_joints = len(joints_3d) + image_size = cfg['image_size'] + W, H = cfg['heatmap_size'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + assert not use_different_joint_weights + + target_weight = np.zeros((num_joints, 1), dtype=np.float32) + target = np.zeros((num_joints, H, W), dtype=np.float32) + + # 3-sigma rule + tmp_size = sigma * 3 + + if self.unbiased_encoding: + for joint_id in range(num_joints): + target_weight[joint_id] = joints_3d_visible[joint_id, 0] + + feat_stride = image_size / [W, H] + mu_x = joints_3d[joint_id][0] / feat_stride[0] + mu_y = joints_3d[joint_id][1] / feat_stride[1] + # Check that any part of the gaussian is in-bounds + ul = [mu_x - tmp_size, mu_y - tmp_size] + br = [mu_x + tmp_size + 1, mu_y + tmp_size + 1] + if ul[0] >= W or ul[1] >= H or br[0] < 0 or br[1] < 0: + target_weight[joint_id] = 0 + + if target_weight[joint_id] == 0: + continue + + x = np.arange(0, W, 1, np.float32) + y = np.arange(0, H, 1, np.float32) + y = y[:, None] + + if target_weight[joint_id] > 0.5: + target[joint_id] = np.exp(-((x - mu_x)**2 + + (y - mu_y)**2) / + (2 * sigma**2)) + else: + for joint_id in range(num_joints): + target_weight[joint_id] = joints_3d_visible[joint_id, 0] + + feat_stride = image_size / [W, H] + mu_x = int(joints_3d[joint_id][0] / feat_stride[0] + 0.5) + mu_y = int(joints_3d[joint_id][1] / feat_stride[1] + 0.5) + # Check that any part of the gaussian is in-bounds + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= W or ul[1] >= H or br[0] < 0 or br[1] < 0: + target_weight[joint_id] = 0 + + if target_weight[joint_id] > 0.5: + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, None] + x0 = y0 = size // 2 + # The gaussian is not normalized, + # we want the center value to equal 1 + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], W) - ul[0] + g_y = max(0, -ul[1]), min(br[1], H) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], W) + img_y = max(0, ul[1]), min(br[1], H) + + target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + + if use_different_joint_weights: + target_weight = np.multiply(target_weight, joint_weights) + + return target, target_weight + + def _udp_generate_target(self, cfg, joints_3d, joints_3d_visible, factor, + target_type): + """Generate the target heatmap via 'UDP' approach. Paper ref: Huang et + al. 
The Devil is in the Details: Delving into Unbiased Data Processing + for Human Pose Estimation (CVPR 2020). + + Note: + num keypoints: K + heatmap height: H + heatmap width: W + num target channels: C + C = K if target_type=='GaussianHeatMap' + C = 3*K if target_type=='CombinedTarget' + + Args: + cfg (dict): data config + joints_3d (np.ndarray[K, 3]): Annotated keypoints. + joints_3d_visible (np.ndarray[K, 3]): Visibility of keypoints. + factor (float): kernel factor for GaussianHeatMap target or + valid radius factor for CombinedTarget. + target_type (str): 'GaussianHeatMap' or 'CombinedTarget'. + GaussianHeatMap: Heatmap target with gaussian distribution. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + + Returns: + tuple: A tuple containing targets. + + - target (np.ndarray[C, H, W]): Target heatmaps. + - target_weight (np.ndarray[K, 1]): (1: visible, 0: invisible) + """ + num_joints = len(joints_3d) + image_size = cfg['image_size'] + heatmap_size = cfg['heatmap_size'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + assert not use_different_joint_weights + + target_weight = np.ones((num_joints, 1), dtype=np.float32) + target_weight[:, 0] = joints_3d_visible[:, 0] + + assert target_type in ['GaussianHeatMap', 'CombinedTarget'] + + if target_type == 'GaussianHeatMap': + target = np.zeros((num_joints, heatmap_size[1], heatmap_size[0]), + dtype=np.float32) + + tmp_size = factor * 3 + + # prepare for gaussian + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, None] + + for joint_id in range(num_joints): + feat_stride = (image_size - 1.0) / (heatmap_size - 1.0) + mu_x = int(joints_3d[joint_id][0] / feat_stride[0] + 0.5) + mu_y = int(joints_3d[joint_id][1] / feat_stride[1] + 0.5) + # Check that any part of the gaussian is in-bounds + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= heatmap_size[0] or ul[1] >= heatmap_size[1] \ + or br[0] < 0 or br[1] < 0: + # If not, just return the image as is + target_weight[joint_id] = 0 + continue + + # # Generate gaussian + mu_x_ac = joints_3d[joint_id][0] / feat_stride[0] + mu_y_ac = joints_3d[joint_id][1] / feat_stride[1] + x0 = y0 = size // 2 + x0 += mu_x_ac - mu_x + y0 += mu_y_ac - mu_y + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * factor**2)) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], heatmap_size[0]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], heatmap_size[1]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], heatmap_size[0]) + img_y = max(0, ul[1]), min(br[1], heatmap_size[1]) + + v = target_weight[joint_id] + if v > 0.5: + target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + elif target_type == 'CombinedTarget': + target = np.zeros( + (num_joints, 3, heatmap_size[1] * heatmap_size[0]), + dtype=np.float32) + feat_width = heatmap_size[0] + feat_height = heatmap_size[1] + feat_x_int = np.arange(0, feat_width) + feat_y_int = np.arange(0, feat_height) + feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int) + feat_x_int = feat_x_int.flatten() + feat_y_int = feat_y_int.flatten() + # Calculate the radius of the positive area in classification + # heatmap. 
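As a quick numeric illustration of the Gaussian targets generated above (both the MSRA branch and the UDP 'GaussianHeatMap' branch reduce to an unnormalized Gaussian with peak 1 around the keypoint), the following self-contained sketch renders one 64x64 heatmap for a keypoint already expressed in heatmap coordinates.

import numpy as np

def gaussian_heatmap_sketch(mu_x, mu_y, H=64, W=64, sigma=2.0):
    x = np.arange(W, dtype=np.float32)
    y = np.arange(H, dtype=np.float32)[:, None]
    # unnormalized Gaussian so the center value equals 1
    return np.exp(-((x - mu_x) ** 2 + (y - mu_y) ** 2) / (2 * sigma ** 2))

hm = gaussian_heatmap_sketch(16, 24)
print(hm.shape, hm.max(), np.unravel_index(hm.argmax(), hm.shape))  # (64, 64) 1.0 (24, 16)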
+ valid_radius = factor * heatmap_size[1] + feat_stride = (image_size - 1.0) / (heatmap_size - 1.0) + for joint_id in range(num_joints): + mu_x = joints_3d[joint_id][0] / feat_stride[0] + mu_y = joints_3d[joint_id][1] / feat_stride[1] + x_offset = (mu_x - feat_x_int) / valid_radius + y_offset = (mu_y - feat_y_int) / valid_radius + dis = x_offset**2 + y_offset**2 + keep_pos = np.where(dis <= 1)[0] + v = target_weight[joint_id] + if v > 0.5: + target[joint_id, 0, keep_pos] = 1 + target[joint_id, 1, keep_pos] = x_offset[keep_pos] + target[joint_id, 2, keep_pos] = y_offset[keep_pos] + target = target.reshape(num_joints * 3, heatmap_size[1], + heatmap_size[0]) + + if use_different_joint_weights: + target_weight = np.multiply(target_weight, joint_weights) + + return target, target_weight + + def __call__(self, results): + """Generate the target heatmap.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + + assert self.encoding in ['MSRA', 'UDP'] + + if self.encoding == 'MSRA': + if isinstance(self.sigma, list): + num_sigmas = len(self.sigma) + cfg = results['ann_info'] + num_joints = len(joints_3d) + heatmap_size = cfg['heatmap_size'] + + target = np.empty( + (0, num_joints, heatmap_size[1], heatmap_size[0]), + dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_sigmas): + target_i, target_weight_i = self._msra_generate_target( + cfg, joints_3d, joints_3d_visible, self.sigma[i]) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: + target, target_weight = self._msra_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, + self.sigma) + elif self.encoding == 'UDP': + if self.target_type == 'CombinedTarget': + factors = self.valid_radius_factor + channel_factor = 3 + elif self.target_type == 'GaussianHeatMap': + factors = self.sigma + channel_factor = 1 + if isinstance(factors, list): + num_factors = len(factors) + cfg = results['ann_info'] + num_joints = len(joints_3d) + W, H = cfg['heatmap_size'] + + target = np.empty((0, channel_factor * num_joints, H, W), + dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_factors): + target_i, target_weight_i = self._udp_generate_target( + cfg, joints_3d, joints_3d_visible, factors[i], + self.target_type) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: + target, target_weight = self._udp_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, factors, + self.target_type) + else: + raise ValueError( + f'Encoding approach {self.encoding} is not supported!') + + results['target'] = target + results['target_weight'] = target_weight + + return results + +@PIPELINES.register_module() +class LoadDepthFromFile: + """Load depthmap from file. + + Required Keys: + + - depth_path + + Modified Keys: + + - depth + + Args: + to_float32 (bool): Whether to convert the loaded depth to a float32 + numpy array. If set to False, the loaded depth is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + imdecode_backend (str): The depth decoding backend type. The backend + argument for :func:`mmcv.imfrombytes`. + See :func:`mmcv.imfrombytes` for details. + Defaults to 'cv2'. 
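A minimal standalone call to TopDownGenerateTargetFewShot with a list of sigmas is sketched below; the keypoints and ann_info values are invented and only mirror the keys the transform reads (a real run gets them from the datasets above).

import numpy as np

gen = TopDownGenerateTargetFewShot(sigma=[2, 3], encoding='MSRA')
results = {
    'joints_3d': np.array([[64., 48., 0.], [120., 96., 0.]], dtype=np.float32),
    'joints_3d_visible': np.array([[1., 1., 0.], [1., 1., 0.]], dtype=np.float32),
    'ann_info': {
        'image_size': np.array([256, 256]),
        'heatmap_size': np.array([64, 64]),
        'joint_weights': np.ones((2, 1), dtype=np.float32),
        'use_different_joint_weights': False,
    },
}
out = gen(results)
# with a list of sigmas the per-sigma targets are stacked along a leading axis:
print(out['target'].shape)         # (2, 2, 64, 64)  -> (num_sigmas, num_joints, H, W)
print(out['target_weight'].shape)  # (2, 2, 1)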
+ file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Defaults to None. It will be deprecated in future. Please use + ``backend_args`` instead. + Deprecated in version 2.0.0rc4. + ignore_empty (bool): Whether to allow loading empty depth or file path + not existent. Defaults to False. + backend_args (dict, optional): Instantiates the corresponding file + backend. It may contain `backend` key to specify the file + backend. If it contains, the file backend corresponding to this + value will be used and initialized with the remaining values, + otherwise the corresponding file backend will be selected + based on the prefix of the file path. Defaults to None. + New in version 2.0.0rc4. + """ + + def __init__(self, + to_float32=False, + color_type='color', + channel_order='rgb', + file_client_args=dict(backend='disk')): + self.to_float32 = to_float32 + self.color_type = color_type + self.channel_order = channel_order + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _read_depth(self, path): + img = np.load(path)['depth'] + if img is None: + raise ValueError(f'Fail to read {path}') + if self.to_float32: + img = img.astype(np.float32) + return img + + def __call__(self, results: dict) -> Optional[dict]: + """Functions to load depth. + + Args: + results (dict): Result dict from + :class:`mmengine.dataset.BaseDataset`. + + Returns: + dict: The dict contains loaded depth and meta information. + """ + + """Loading depth(s) from file.""" + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + depth_file = results.get('depth_file', None) + # Replace file extension with npy + pre, ext = os.path.splitext(depth_file) + depth_file = pre + '.npz' + if isinstance(depth_file, (list, tuple)): + # Load depths from a list of paths + results['depth'] = [self._read_depth(path) for path in depth_file] + elif depth_file is not None: + # Load single depth from path + results['depth'] = self._read_depth(depth_file) + else: + if 'depth' not in results: + # If `depth_file`` is not in results, check the `img` exists + # and format the depth. This for compatibility when the depth + # is manually set outside the pipeline. + raise KeyError('Either `depth_file` or `img` should exist in ' + 'results.') + if isinstance(results['depth'], (list, tuple)): + assert isinstance(results['depth'][0], np.ndarray) + else: + assert isinstance(results['depth'], np.ndarray) + results['depth_file'] = None + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f'file_client_args={self.file_client_args})') + return repr_str + + +@PIPELINES.register_module() +class DepthTopDownAffineFewShot: + """Affine transform the image to make input. + + Required keys:'img', 'depth', 'joints_3d', 'joints_3d_visible', 'ann_info','scale', + 'rotation' and 'center'. Modified keys:'img', 'joints_3d', and + 'joints_3d_visible'. + + Args: + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
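LoadDepthFromFile above swaps the annotation's file extension to .npz and reads the 'depth' array from it; a sketch of producing a compatible file is shown below (the file name and resolution are illustrative).

import numpy as np

depth = np.random.rand(480, 640).astype(np.float32)
np.savez('example_image.npz', depth=depth)        # key must be 'depth'
loaded = np.load('example_image.npz')['depth']    # what _read_depth returns
assert loaded.shape == (480, 640)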
+ """ + + def __init__(self, use_udp=False): + self.use_udp = use_udp + + def __call__(self, results): + image_size = results['ann_info']['image_size'] + + img = results['img'] + depth = results['depth'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + c = results['center'] + s = results['scale'] + r = results['rotation'] + + if self.use_udp: + trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + depth = cv2.warpAffine( + depth, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + joints_3d[:, 0:2] = warp_affine_joints(joints_3d[:, 0:2].copy(), trans) + else: + trans = get_affine_transform(c, s, r, image_size) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + depth = cv2.warpAffine( + depth, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + for i in range(len(joints_3d)): + if joints_3d_visible[i, 0] > 0.0: + joints_3d[i, 0:2] = affine_transform(joints_3d[i, 0:2], trans) + + results['img'] = img + results['depth'] = depth + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + + return results + + + + +@PIPELINES.register_module() +class LoadFeatFromFile: + """Load depthmap from file. + + Required Keys: + + - depth_path + + Modified Keys: + + - depth + + Args: + to_float32 (bool): Whether to convert the loaded depth to a float32 + numpy array. If set to False, the loaded depth is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + imdecode_backend (str): The depth decoding backend type. The backend + argument for :func:`mmcv.imfrombytes`. + See :func:`mmcv.imfrombytes` for details. + Defaults to 'cv2'. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Defaults to None. It will be deprecated in future. Please use + ``backend_args`` instead. + Deprecated in version 2.0.0rc4. + ignore_empty (bool): Whether to allow loading empty depth or file path + not existent. Defaults to False. + backend_args (dict, optional): Instantiates the corresponding file + backend. It may contain `backend` key to specify the file + backend. If it contains, the file backend corresponding to this + value will be used and initialized with the remaining values, + otherwise the corresponding file backend will be selected + based on the prefix of the file path. Defaults to None. + New in version 2.0.0rc4. + """ + + def __init__(self, + to_float32=False, + color_type='color', + channel_order='rgb', + file_client_args=dict(backend='disk')): + self.to_float32 = to_float32 + self.color_type = color_type + self.channel_order = channel_order + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _read_depth(self, path): + img = np.load(path)['feat'] + if img is None: + raise ValueError(f'Fail to read {path}') + if self.to_float32: + img = img.astype(np.float32) + return img + + def __call__(self, results: dict) -> Optional[dict]: + """Functions to load depth. + + Args: + results (dict): Result dict from + :class:`mmengine.dataset.BaseDataset`. + + Returns: + dict: The dict contains loaded depth and meta information. 
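For orientation, the transforms registered above could be chained in an mmpose-style pipeline config along the lines of the sketch below; this is illustrative only and not taken from the repository's configs, and the upstream transform names (LoadImageFromFile, Collect) and key lists may differ from what the project actually uses.

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadDepthFromFile'),
    dict(type='DepthTopDownAffineFewShot', use_udp=False),
    dict(type='TopDownGenerateTargetFewShot', sigma=2, encoding='MSRA'),
    dict(type='Collect',
         keys=['img', 'depth', 'target', 'target_weight'],
         meta_keys=['image_file', 'center', 'scale', 'rotation', 'bbox_score']),
]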
+ """ + + """Loading depth(s) from file.""" + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + feat_file = results.get('feat_file', None) + # Replace file extension with npy + pre, ext = os.path.splitext(feat_file) + feat_file = pre + '.npz' + if isinstance(feat_file, (list, tuple)): + # Load depths from a list of paths + results['feat'] = [self._read_depth(path) for path in feat_file] + elif feat_file is not None: + # Load single depth from path + results['feat'] = self._read_depth(feat_file) + else: + if 'feat_file' not in results: + # If `depth_file`` is not in results, check the `img` exists + # and format the depth. This for compatibility when the depth + # is manually set outside the pipeline. + raise KeyError('Either `feat_file` or `img` should exist in results.') + if isinstance(results['feat'], (list, tuple)): + assert isinstance(results['feat'][0], np.ndarray) + else: + assert isinstance(results['feat'], np.ndarray) + results['feat_file'] = None + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f'file_client_args={self.file_client_args})') + return repr_str + + +@PIPELINES.register_module() +class FeatTopDownAffineFewShot: + """Affine transform the image to make input. + + Required keys:'img', 'depth', 'joints_3d', 'joints_3d_visible', 'ann_info','scale', + 'rotation' and 'center'. Modified keys:'img', 'joints_3d', and + 'joints_3d_visible'. + + Args: + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, use_udp=False): + self.use_udp = use_udp + + def __call__(self, results): + image_size = results['ann_info']['image_size'] + + img = results['img'] + feat = results['feat'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + c = results['center'] + s = results['scale'] + r = results['rotation'] + + if self.use_udp: + trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + feat = cv2.warpAffine( + feat, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + joints_3d[:, 0:2] = warp_affine_joints(joints_3d[:, 0:2].copy(), trans) + else: + trans = get_affine_transform(c, s, r, image_size) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + feat = cv2.warpAffine( + feat, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + for i in range(len(joints_3d)): + if joints_3d_visible[i, 0] > 0.0: + joints_3d[i, 0:2] = affine_transform(joints_3d[i, 0:2], trans) + + results['img'] = img + results['depth'] = feat + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + + return results diff --git a/EdgeCape/models/__init__.py b/EdgeCape/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..45fc685affd284a607f312a718cbe3c1abc7751b --- /dev/null +++ b/EdgeCape/models/__init__.py @@ -0,0 +1,3 @@ +from .detectors import * # noqa +from .keypoint_heads import * # noqa +from .backbones import * # noqa \ No newline at end of file diff --git a/EdgeCape/models/__pycache__/__init__.cpython-39.pyc b/EdgeCape/models/__pycache__/__init__.cpython-39.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..7a5039155827e7b483a269cfe8f11db33062967d Binary files /dev/null and b/EdgeCape/models/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/models/backbones/__pycache__/adapter.cpython-39.pyc b/EdgeCape/models/backbones/__pycache__/adapter.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81da99d8bada3e5dd0f4d82e7ee00bc13ca9a06b Binary files /dev/null and b/EdgeCape/models/backbones/__pycache__/adapter.cpython-39.pyc differ diff --git a/EdgeCape/models/backbones/__pycache__/dino.cpython-39.pyc b/EdgeCape/models/backbones/__pycache__/dino.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90967dfc725e242a468959ba72074d68243d0568 Binary files /dev/null and b/EdgeCape/models/backbones/__pycache__/dino.cpython-39.pyc differ diff --git a/EdgeCape/models/backbones/adapter.py b/EdgeCape/models/backbones/adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..17c9fc56c63b414151d8cd918f866e902faa95fd --- /dev/null +++ b/EdgeCape/models/backbones/adapter.py @@ -0,0 +1,935 @@ + +import torch.nn.functional as F +import fvcore.nn.weight_init as weight_init +import numpy as np +import torch +import torch.nn as nn +from torch.nn.functional import interpolate + +""" +Code is based on: https://github.com/mbanani/probe3d +""" + + +class SurfaceNormalHead(nn.Module): + def __init__( + self, + feat_dim, + head_type="multiscale", + uncertainty_aware=False, + hidden_dim=512, + kernel_size=1, + ): + super().__init__() + + self.uncertainty_aware = uncertainty_aware + output_dim = 4 if uncertainty_aware else 3 + + self.kernel_size = kernel_size + + assert head_type in ["linear", "multiscale", "dpt"] + name = f"snorm_{head_type}_k{kernel_size}" + self.name = f"{name}_UA" if uncertainty_aware else name + + if head_type == "linear": + self.head = Linear(feat_dim, output_dim, kernel_size) + elif head_type == "multiscale": + self.head = MultiscaleHead(feat_dim, output_dim, hidden_dim, kernel_size) + elif head_type == "dpt": + self.head = DPT(feat_dim, output_dim, hidden_dim, kernel_size) + else: + raise ValueError(f"Unknown head type: {self.head_type}") + + def forward(self, feats): + return self.head(feats) + + +class DepthHead(nn.Module): + def __init__( + self, + feat_dim, + head_type="multiscale", + min_depth=0.001, + max_depth=10, + prediction_type="bindepth", + hidden_dim=512, + kernel_size=1, + ): + super().__init__() + + self.kernel_size = kernel_size + self.name = f"{prediction_type}_{head_type}_k{kernel_size}" + + if prediction_type == "bindepth": + output_dim = 256 + self.predict = DepthBinPrediction(min_depth, max_depth, n_bins=output_dim) + elif prediction_type == "sigdepth": + output_dim = 1 + self.predict = DepthSigmoidPrediction(min_depth, max_depth) + else: + raise ValueError() + + if head_type == "linear": + self.head = Linear(feat_dim, output_dim, kernel_size) + elif head_type == "multiscale": + self.head = MultiscaleHead(feat_dim, output_dim, hidden_dim, kernel_size) + elif head_type == "dpt": + self.head = DPT(feat_dim, output_dim, hidden_dim, kernel_size) + else: + raise ValueError(f"Unknown head type: {self.head_type}") + + def forward(self, feats): + """Prediction each pixel.""" + feats = self.head(feats) + depth = self.predict(feats) + return depth + + +class DepthBinPrediction(nn.Module): + def __init__( + self, + min_depth=0.001, + max_depth=10, + n_bins=256, + bins_strategy="UD", + norm_strategy="linear", + ): + super().__init__() + self.n_bins 
= n_bins + self.min_depth = min_depth + self.max_depth = max_depth + self.norm_strategy = norm_strategy + self.bins_strategy = bins_strategy + + def forward(self, prob): + if self.bins_strategy == "UD": + bins = torch.linspace( + self.min_depth, self.max_depth, self.n_bins, device=prob.device + ) + elif self.bins_strategy == "SID": + bins = torch.logspace( + self.min_depth, self.max_depth, self.n_bins, device=prob.device + ) + + # following Adabins, default linear + if self.norm_strategy == "linear": + prob = torch.relu(prob) + eps = 0.1 + prob = prob + eps + prob = prob / prob.sum(dim=1, keepdim=True) + elif self.norm_strategy == "softmax": + prob = torch.softmax(prob, dim=1) + elif self.norm_strategy == "sigmoid": + prob = torch.sigmoid(prob) + prob = prob / prob.sum(dim=1, keepdim=True) + + depth = torch.einsum("ikhw,k->ihw", [prob, bins]) + depth = depth.unsqueeze(dim=1) + return depth + + +class DepthSigmoidPrediction(nn.Module): + def __init__(self, min_depth=0.001, max_depth=10): + super().__init__() + self.min_depth = min_depth + self.max_depth = max_depth + + def forward(self, pred): + depth = pred.sigmoid() + depth = self.min_depth + depth * (self.max_depth - self.min_depth) + return depth + + +class FeatureFusionBlock(nn.Module): + def __init__(self, features, kernel_size, with_skip=True): + super().__init__() + self.with_skip = with_skip + if self.with_skip: + self.resConfUnit1 = ResidualConvUnit(features, kernel_size) + + self.resConfUnit2 = ResidualConvUnit(features, kernel_size) + + def forward(self, x, skip_x=None): + if skip_x is not None: + assert self.with_skip and skip_x.shape == x.shape + x = self.resConfUnit1(x) + skip_x + + x = self.resConfUnit2(x) + return x + + +class ResidualConvUnit(nn.Module): + def __init__(self, features, kernel_size): + super().__init__() + assert kernel_size % 1 == 0, "Kernel size needs to be odd" + padding = kernel_size // 2 + self.conv = nn.Sequential( + nn.Conv2d(features, features, kernel_size, padding=padding), + nn.ReLU(True), + nn.Conv2d(features, features, kernel_size, padding=padding), + nn.ReLU(True), + ) + + def forward(self, x): + return self.conv(x) + x + + +class DPT(nn.Module): + def __init__(self, input_dims, output_dim, hidden_dim=512, kernel_size=3, hr=False, swin=False): + super().__init__() + assert len(input_dims) == 4 + self.hr = hr + self.conv_0 = nn.Conv2d(input_dims[0], hidden_dim, 1, padding=0) + self.conv_1 = nn.Conv2d(input_dims[1], hidden_dim, 1, padding=0) + self.conv_2 = nn.Conv2d(input_dims[2], hidden_dim, 1, padding=0) + self.conv_3 = nn.Conv2d(input_dims[3], hidden_dim, 1, padding=0) + + self.ref_0 = FeatureFusionBlock(hidden_dim, kernel_size) + self.ref_1 = FeatureFusionBlock(hidden_dim, kernel_size) + self.ref_2 = FeatureFusionBlock(hidden_dim, kernel_size) + self.ref_3 = FeatureFusionBlock(hidden_dim, kernel_size, with_skip=False) + + self.out_conv = nn.Sequential( + nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1), + nn.ReLU(True), + nn.Conv2d(hidden_dim, output_dim, 3, padding=1), + ) + + if swin: + self.scale_factor = [1, 2, 4, 4] + else: + self.scale_factor = [2, 2, 2, 2] + + def forward(self, features): + """Prediction each pixel.""" + assert len(features) == 4 + feats = features.copy() + feats[0] = self.conv_0(feats[0]) + feats[1] = self.conv_1(feats[1]) + feats[2] = self.conv_2(feats[2]) + feats[3] = self.conv_3(feats[3]) + + feats = [interpolate(x, scale_factor=scale_factor) for x, scale_factor in zip(feats, self.scale_factor)] + + out = self.ref_3(feats[3], None) + out = self.ref_2(feats[2], out) 
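The AdaBins-style expectation in DepthBinPrediction above reduces per-pixel probabilities over n_bins depth hypotheses to a single depth map; a small numeric sketch with arbitrary shapes is given here.

import torch

n_bins, H, W = 4, 2, 2
bins = torch.linspace(0.001, 10.0, n_bins)                 # (n_bins,) depth hypotheses
prob = torch.softmax(torch.randn(1, n_bins, H, W), dim=1)  # per-pixel bin probabilities
depth = torch.einsum("ikhw,k->ihw", prob, bins).unsqueeze(1)
print(depth.shape)  # torch.Size([1, 1, 2, 2]), each value inside [0.001, 10.0]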
+ out = self.ref_1(feats[1], out) + out = self.ref_0(feats[0], out) + if not self.hr: + return self.out_conv(out) + out = interpolate(out, scale_factor=4) + out = self.out_conv(out) + # out = interpolate(out, scale_factor=2) + return out + + +def make_conv(input_dim, hidden_dim, output_dim, num_layers, kernel_size=1): + return conv + + +class Linear(nn.Module): + def __init__(self, input_dim, output_dim, kernel_size=1): + super().__init__() + if type(input_dim) is not int: + input_dim = sum(input_dim) + + assert type(input_dim) is int + padding = kernel_size // 2 + self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, padding=padding) + + def forward(self, feats): + if type(feats) is list: + feats = torch.cat(feats, dim=1) + + feats = interpolate(feats, scale_factor=4, mode="bilinear") + return self.conv(feats) + + +class MultiscaleHead(nn.Module): + def __init__(self, input_dims, output_dim, hidden_dim=512, kernel_size=1): + super().__init__() + + self.convs = nn.ModuleList( + [make_conv(in_d, None, hidden_dim, 1, kernel_size) for in_d in input_dims] + ) + interm_dim = len(input_dims) * hidden_dim + self.conv_mid = make_conv(interm_dim, hidden_dim, hidden_dim, 3, kernel_size) + self.conv_out = make_conv(hidden_dim, hidden_dim, output_dim, 2, kernel_size) + + def forward(self, feats): + num_feats = len(feats) + feats = [self.convs[i](feats[i]) for i in range(num_feats)] + + h, w = feats[-1].shape[-2:] + feats = [interpolate(feat, (h, w), mode="bilinear") for feat in feats] + feats = torch.cat(feats, dim=1).relu() + + # upsample + feats = interpolate(feats, scale_factor=2, mode="bilinear") + feats = self.conv_mid(feats).relu() + feats = interpolate(feats, scale_factor=4, mode="bilinear") + return self.conv_out(feats) + +def get_norm(norm, out_channels, num_norm_groups=32): + """ + Args: + norm (str or callable): either one of BN, SyncBN, FrozenBN, GN; + or a callable that takes a channel number and returns + the normalization layer as a nn.Module. + Returns: + nn.Module or None: the normalization layer + """ + if norm is None: + return None + if isinstance(norm, str): + if len(norm) == 0: + return None + norm = { + "GN": lambda channels: nn.GroupNorm(num_norm_groups, channels), + }[norm] + return norm(out_channels) + + +def get_activation(activation): + """ + Args: + activation (str or callable): either one of relu, lrelu, prelu, leaky_relu, + sigmoid, tanh, elu, selu, swish, mish; or a callable that takes a + tensor and returns a tensor. 
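    Example (a minimal sketch; note that this helper currently maps only the
    strings in the dict below, and ``get_norm`` above maps only "GN")::

        norm = get_norm("GN", 256)          # GroupNorm(num_groups=32, num_channels=256)
        act = get_activation("relu")        # nn.ReLU()
        block = nn.Sequential(nn.Conv2d(64, 256, 3, padding=1), norm, act)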
+ Returns: + nn.Module or None: the activation layer + """ + if activation is None: + return None + if isinstance(activation, str): + if len(activation) == 0: + return None + activation = { + "relu": nn.ReLU, + "lrelu": nn.LeakyReLU, + "prelu": nn.PReLU, + "leaky_relu": nn.LeakyReLU, + "sigmoid": nn.Sigmoid, + "tanh": nn.Tanh, + "elu": nn.ELU, + "selu": nn.SELU, + }[activation] + return activation() + + +# SCE crisscross + diags +class EfficientSpatialContextNet(nn.Module): + def __init__(self, kernel_size=7, in_channels=768, out_channels=768, use_cuda=True): + super(EfficientSpatialContextNet, self).__init__() + self.kernel_size = kernel_size + self.pad = kernel_size // 2 + self.conv = torch.nn.Conv2d( + in_channels + 4 * self.kernel_size, + out_channels, + 1, + bias=True, + padding_mode="zeros", + ) + + if use_cuda: + self.conv = self.conv.cuda() + + def forward(self, feature): + b, c, h, w = feature.size() + feature_normalized = F.normalize(feature, p=2, dim=1) + feature_pad = F.pad( + feature_normalized, (self.pad, self.pad, self.pad, self.pad), "constant", 0 + ) + output = torch.zeros( + [4 * self.kernel_size, b, h, w], + dtype=feature.dtype, + requires_grad=feature.requires_grad, + ) + if feature.is_cuda: + output = output.cuda(feature.get_device()) + + # left-top to right-bottom + for i in range(self.kernel_size): + c = i + r = i + output[i] = (feature_pad[:, :, r: (h + r), c: (w + c)] * feature_normalized).sum(1) + + # col + for i in range(self.kernel_size): + c = self.kernel_size // 2 + r = i + output[1 * self.kernel_size + i] = (feature_pad[:, :, r: (h + r), c: (w + c)] * feature_normalized).sum(1) + + # right-top to left-bottom + for i in range(self.kernel_size): + c = (self.kernel_size - 1) - i + r = i + output[2 * self.kernel_size + i] = (feature_pad[:, :, r: (h + r), c: (w + c)] * feature_normalized).sum(1) + + # row + for i in range(self.kernel_size): + c = i + r = self.kernel_size // 2 + output[3 * self.kernel_size + i] = (feature_pad[:, :, r: (h + r), c: (w + c)] * feature_normalized).sum(1) + + output = output.transpose(0, 1).contiguous() + output = torch.cat((feature, output), 1) + output = self.conv(output) + # output = F.relu(output) + + return output + + +class Conv2d(nn.Conv2d): + """ + A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features. + """ + + def __init__(self, *args, **kwargs): + """ + Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: + Args: + norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + It assumes that norm layer is used before activation. + """ + norm = kwargs.pop("norm", None) + activation = kwargs.pop("activation", None) + super().__init__(*args, **kwargs) + + self.norm = norm + self.activation = activation + + def forward(self, x): + x = F.conv2d( + x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups + ) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + +class CNNBlockBase(nn.Module): + """ + A CNN block is assumed to have input channels, output channels and a stride. + The input and output of `forward()` method must be NCHW tensors. + The method can perform arbitrary computation but must match the given + channels and stride specification. 
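# Usage sketch for the Conv2d wrapper above (illustrative sizes): the norm and
# activation passed as keyword arguments are applied inside forward(), after the
# convolution itself.
conv = Conv2d(64, 256, kernel_size=3, padding=1, bias=False,
              norm=get_norm("GN", 256),
              activation=F.relu)
y = conv(torch.randn(2, 64, 32, 32))   # (2, 256, 32, 32)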
+ Attribute: + in_channels (int): + out_channels (int): + stride (int): + """ + + def __init__(self, in_channels, out_channels, stride): + """ + The `__init__` method of any subclass should also contain these arguments. + Args: + in_channels (int): + out_channels (int): + stride (int): + """ + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + +class BottleneckBlock(CNNBlockBase): + """ + The standard bottleneck residual block used by ResNet-50, 101 and 152 + defined in :paper:`ResNet`. It contains 3 conv layers with kernels + 1x1, 3x3, 1x1, and a projection shortcut if needed. + """ + + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="GN", + stride_in_1x1=False, + dilation=1, + num_norm_groups=32, + kernel_size=(1, 3, 1) + ): + """ + Args: + bottleneck_channels (int): number of output channels for the 3x3 + "bottleneck" conv layers. + num_groups (int): number of groups for the 3x3 conv layer. + norm (str or callable): normalization for all conv layers. + See :func:`layers.get_norm` for supported format. + stride_in_1x1 (bool): when stride>1, whether to put stride in the + first 1x1 convolution or the bottleneck 3x3 convolution. + dilation (int): the dilation rate of the 3x3 conv layer. + """ + super().__init__(in_channels, out_channels, stride) + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels, num_norm_groups), + ) + else: + self.shortcut = None + + # The original MSRA ResNet models have stride in the first 1x1 conv + # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have + # stride in the 3x3 conv + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=kernel_size[0], + stride=stride_1x1, + padding=(kernel_size[0] - 1) // 2, + bias=False, + norm=get_norm(norm, bottleneck_channels, num_norm_groups), + ) + + self.conv2 = Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size[1], + stride=stride_3x3, + padding=dilation * (kernel_size[1] - 1) // 2, + bias=False, + groups=num_groups, + dilation=dilation, + norm=get_norm(norm, bottleneck_channels, num_norm_groups), + ) + + self.conv3 = Conv2d( + bottleneck_channels, + out_channels, + kernel_size=kernel_size[2], + bias=False, + norm=get_norm(norm, out_channels, num_norm_groups), + ) + + for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: + if layer is not None: # shortcut can be None + weight_init.c2_msra_fill(layer) + + # Zero-initialize the last normalization in each residual branch, + # so that at the beginning, the residual branch starts with zeros, + # and each residual block behaves like an identity. + # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": + # "For BN layers, the learnable scaling coefficient γ is initialized + # to be 1, except for each residual block's last BN + # where γ is initialized to be 0." + + # nn.init.constant_(self.conv3.norm.weight, 0) + # TODO this somehow hurts performance when training GN models from scratch. + # Add it as an option when we need to use this code to train a backbone. 
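# Construction sketch (illustrative channel counts): because in_channels differs
# from out_channels here, a 1x1 projection shortcut with GroupNorm is created.
block = BottleneckBlock(in_channels=768, out_channels=384,
                        bottleneck_channels=96, norm="GN", num_norm_groups=32)
y = block(torch.randn(2, 768, 32, 32))   # (2, 384, 32, 32)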
+ + def forward(self, x): + out = self.conv1(x) + out = F.relu_(out) + + out = self.conv2(out) + out = F.relu_(out) + + out = self.conv3(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu_(out) + return out + + +class ResNet(nn.Module): + """ + Implement :paper:`ResNet`. + """ + + def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0): + """ + Args: + stem (nn.Module): a stem module + stages (list[list[CNNBlockBase]]): several (typically 4) stages, + each contains multiple :class:`CNNBlockBase`. + num_classes (None or int): if None, will not perform classification. + Otherwise, will create a linear layer. + out_features (list[str]): name of the layers whose outputs should + be returned in forward. Can be anything in "stem", "linear", or "res2" ... + If None, will return the output of the last layer. + freeze_at (int): The number of stages at the beginning to freeze. + see :meth:`freeze` for detailed explanation. + """ + super().__init__() + self.stem = stem + self.num_classes = num_classes + + current_stride = self.stem.stride + self._out_feature_strides = {"stem": current_stride} + self._out_feature_channels = {"stem": self.stem.out_channels} + + self.stage_names, self.stages = [], [] + + if out_features is not None: + # Avoid keeping unused layers in this module. They consume extra memory + # and may cause allreduce to fail + num_stages = max( + [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features] + ) + stages = stages[:num_stages] + for i, blocks in enumerate(stages): + assert len(blocks) > 0, len(blocks) + for block in blocks: + assert isinstance(block, CNNBlockBase), block + + name = "res" + str(i + 2) + stage = nn.Sequential(*blocks) + + self.add_module(name, stage) + self.stage_names.append(name) + self.stages.append(stage) + + self._out_feature_strides[name] = current_stride = int( + current_stride * np.prod([k.stride for k in blocks]) + ) + self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels + self.stage_names = tuple(self.stage_names) # Make it static for scripting + + if num_classes is not None: + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.linear = nn.Linear(curr_channels, num_classes) + + # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": + # "The 1000-way fully-connected layer is initialized by + # drawing weights from a zero-mean Gaussian with standard deviation of 0.01." + nn.init.normal_(self.linear.weight, std=0.01) + name = "linear" + + if out_features is None: + out_features = [name] + self._out_features = out_features + assert len(self._out_features) + children = [x[0] for x in self.named_children()] + for out_feature in self._out_features: + assert out_feature in children, "Available children: {}".format(", ".join(children)) + self.freeze(freeze_at) + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
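# Usage sketch with a toy stem (all sizes illustrative): forward() returns a dict
# keyed by the requested feature names.
class ToyStem(CNNBlockBase):
    def __init__(self):
        super().__init__(in_channels=3, out_channels=64, stride=4)
        self.conv = nn.Conv2d(3, 64, kernel_size=7, stride=4, padding=3)

    def forward(self, x):
        return F.relu(self.conv(x))

stages = [ResNet.make_stage(BottleneckBlock, 2, in_channels=c_in, out_channels=c_out,
                            bottleneck_channels=c_out // 4, norm="GN",
                            stride_per_block=[s, 1])
          for c_in, c_out, s in [(64, 256, 1), (256, 512, 2)]]
toy = ResNet(ToyStem(), stages, out_features=["res2", "res3"])
outs = toy(torch.randn(1, 3, 64, 64))
# outs["res2"]: (1, 256, 16, 16), outs["res3"]: (1, 512, 8, 8)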
+ outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for name, stage in zip(self.stage_names, self.stages): + x = stage(x) + if name in self._out_features: + outputs[name] = x + if self.num_classes is not None: + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.linear(x) + if "linear" in self._out_features: + outputs["linear"] = x + return outputs + + def freeze(self, freeze_at=0): + """ + Freeze the first several stages of the ResNet. Commonly used in + fine-tuning. + Layers that produce the same feature map spatial size are defined as one + "stage" by :paper:`FPN`. + Args: + freeze_at (int): number of stages to freeze. + `1` means freezing the stem. `2` means freezing the stem and + one residual stage, etc. + Returns: + nn.Module: this ResNet itself + """ + if freeze_at >= 1: + self.stem.freeze() + for idx, stage in enumerate(self.stages, start=2): + if freeze_at >= idx: + for block in stage.children(): + block.freeze() + return self + + @staticmethod + def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs): + """ + Create a list of blocks of the same type that forms one ResNet stage. + Args: + block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this + stage. A module of this type must not change spatial resolution of inputs unless its + stride != 1. + num_blocks (int): number of blocks in this stage + in_channels (int): input channels of the entire stage. + out_channels (int): output channels of **every block** in the stage. + kwargs: other arguments passed to the constructor of + `block_class`. If the argument name is "xx_per_block", the + argument is a list of values to be passed to each block in the + stage. Otherwise, the same argument is passed to every block + in the stage. + Returns: + list[CNNBlockBase]: a list of block module. + Examples: + :: + stage = ResNet.make_stage( + BottleneckBlock, 3, in_channels=16, out_channels=64, + bottleneck_channels=16, num_groups=1, + stride_per_block=[2, 1, 1], + dilations_per_block=[1, 1, 2] + ) + Usually, layers that produce the same feature map spatial size are defined as one + "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should + all be 1. + """ + blocks = [] + for i in range(num_blocks): + curr_kwargs = {} + for k, v in kwargs.items(): + if k.endswith("_per_block"): + assert len(v) == num_blocks, ( + f"Argument '{k}' of make_stage should have the " + f"same length as num_blocks={num_blocks}." + ) + newk = k[: -len("_per_block")] + assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!" + curr_kwargs[newk] = v[i] + else: + curr_kwargs[k] = v + + blocks.append( + block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs) + ) + in_channels = out_channels + return blocks + + @staticmethod + def make_default_stages(depth, block_class=None, **kwargs): + """ + Created list of ResNet stages from pre-defined depth (one of 18, 34, 50, 101, 152). + If it doesn't create the ResNet variant you need, please use :meth:`make_stage` + instead for fine-grained customization. + Args: + depth (int): depth of ResNet + block_class (type): the CNN block class. Has to accept + `bottleneck_channels` argument for depth > 50. + By default it is BasicBlock or BottleneckBlock, based on the + depth. + kwargs: + other arguments to pass to `make_stage`. Should not contain + stride and channels, as they are predefined for each depth. 
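        Example (sketch)::

            stages = ResNet.make_default_stages(50, norm="GN")
            # 4 stages of 3/4/6/3 BottleneckBlocks: 64->256, 256->512, 512->1024, 1024->2048.
            # (The BasicBlock defined at the end of this file takes (in_planes, planes),
            # so the depth < 50 path would need a compatible block_class.)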
+ Returns: + list[list[CNNBlockBase]]: modules in all stages; see arguments of + :class:`ResNet.__init__`. + """ + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + if block_class is None: + block_class = BasicBlock if depth < 50 else BottleneckBlock + if depth < 50: + in_channels = [64, 64, 128, 256] + out_channels = [64, 128, 256, 512] + else: + in_channels = [64, 256, 512, 1024] + out_channels = [256, 512, 1024, 2048] + ret = [] + for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels): + if depth >= 50: + kwargs["bottleneck_channels"] = o // 4 + ret.append( + ResNet.make_stage( + block_class=block_class, + num_blocks=n, + stride_per_block=[s] + [1] * (n - 1), + in_channels=i, + out_channels=o, + **kwargs, + ) + ) + return ret + +class DummyAggregationNetwork(nn.Module): # for testing, return the input + def __init__(self): + super(DummyAggregationNetwork, self).__init__() + # dummy paprameter + self.dummy = nn.Parameter(torch.ones([])) + + def forward(self, batch, pose=None): + return batch * self.dummy + + +class AggregationNetwork(nn.Module): + """ + Module for aggregating feature maps across time and space. + Design inspired by the Feature Extractor from ODISE (Xu et. al., CVPR 2023). + https://github.com/NVlabs/ODISE/blob/5836c0adfcd8d7fd1f8016ff5604d4a31dd3b145/odise/modeling/backbone/feature_extractor.py + """ + + def __init__( + self, + device, + feature_dims=[640, 1280, 1280, 768], + projection_dim=384, + num_norm_groups=32, + save_timestep=[1], + kernel_size=[1, 3, 1], + contrastive_temp=10, + feat_map_dropout=0.0, + ): + super().__init__() + self.skip_connection = True + self.feat_map_dropout = feat_map_dropout + self.azimuth_embedding = None + self.pos_embedding = None + self.bottleneck_layers = nn.ModuleList() + self.feature_dims = feature_dims + # For CLIP symmetric cross entropy loss during training + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + self.self_logit_scale = nn.Parameter(torch.ones([]) * np.log(contrastive_temp)) + self.device = device + self.save_timestep = save_timestep + + self.mixing_weights_names = [] + for l, feature_dim in enumerate(self.feature_dims): + bottleneck_layer = nn.Sequential( + *ResNet.make_stage( + BottleneckBlock, + num_blocks=1, + in_channels=feature_dim, + bottleneck_channels=projection_dim // 4, + out_channels=projection_dim, + norm="GN", + num_norm_groups=num_norm_groups, + kernel_size=kernel_size + ) + ) + self.bottleneck_layers.append(bottleneck_layer) + for t in save_timestep: + # 1-index the layer name following prior work + self.mixing_weights_names.append(f"timestep-{save_timestep}_layer-{l + 1}") + self.last_layer = None + self.bottleneck_layers = self.bottleneck_layers.to(device) + mixing_weights = torch.ones(len(self.bottleneck_layers) * len(save_timestep)) + self.mixing_weights = nn.Parameter(mixing_weights.to(device)) + # count number of parameters + num_params = 0 + for param in self.parameters(): + num_params += param.numel() + print(f"AggregationNetwork has {num_params} parameters.") + + def load_pretrained_weights(self, pretrained_dict): + custom_dict = self.state_dict() + + # Handle size mismatch + if 'mixing_weights' in custom_dict and 'mixing_weights' in pretrained_dict and custom_dict[ + 'mixing_weights'].shape != pretrained_dict['mixing_weights'].shape: + # Keep the first four weights from the pretrained model, and randomly initialize the fifth weight + 
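# Usage sketch (feature dims follow the defaults above; batch/spatial sizes are
# illustrative): the input is the channel-wise concatenation of the per-layer
# feature maps, and each slice is bottlenecked to projection_dim and blended via
# the softmaxed mixing weights.
net = AggregationNetwork(device="cpu",
                         feature_dims=[640, 1280, 1280, 768], projection_dim=384)
batch = torch.cat([torch.randn(2, c, 32, 32) for c in [640, 1280, 1280, 768]], dim=1)
fused = net(batch)      # (2, 384, 32, 32)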
custom_dict['mixing_weights'][:4] = pretrained_dict['mixing_weights'][:4] + custom_dict['mixing_weights'][4] = torch.zeros_like(custom_dict['mixing_weights'][4]) + else: + custom_dict['mixing_weights'][:4] = pretrained_dict['mixing_weights'][:4] + + # Load the weights that do match + matching_keys = {k: v for k, v in pretrained_dict.items() if k in custom_dict and k != 'mixing_weights'} + custom_dict.update(matching_keys) + + # Now load the updated state_dict + self.load_state_dict(custom_dict, strict=False) + + def forward(self, batch, pose=None): + """ + Assumes batch is shape (B, C, H, W) where C is the concatentation of all layer features. + """ + if self.feat_map_dropout > 0 and self.training: + batch = F.dropout(batch, p=self.feat_map_dropout) + + output_feature = None + start = 0 + mixing_weights = torch.nn.functional.softmax(self.mixing_weights, dim=0) + if self.pos_embedding is not None: # position embedding + batch = torch.cat((batch, self.pos_embedding), dim=1) + for i in range(len(mixing_weights)): + # Share bottleneck layers across timesteps + bottleneck_layer = self.bottleneck_layers[i % len(self.feature_dims)] + # Chunk the batch according the layer + # Account for looping if there are multiple timesteps + end = start + self.feature_dims[i % len(self.feature_dims)] + feats = batch[:, start:end, :, :] + start = end + # Downsample the number of channels and weight the layer + bottlenecked_feature = bottleneck_layer(feats) + bottlenecked_feature = mixing_weights[i] * bottlenecked_feature + if output_feature is None: + output_feature = bottlenecked_feature + else: + output_feature += bottlenecked_feature + + if self.last_layer is not None: + + output_feature_after = self.last_layer(output_feature) + if self.skip_connection: + # skip connection + output_feature = output_feature + output_feature_after + return output_feature + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution without padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) + + +class BasicBlock(nn.Module): + def __init__(self, in_planes, planes, stride=1): + super().__init__() + self.conv1 = conv3x3(in_planes, planes, stride) + self.conv2 = conv3x3(planes, planes) + self.bn1 = nn.BatchNorm2d(planes) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + + if stride == 1: + self.downsample = None + else: + self.downsample = nn.Sequential( + conv1x1(in_planes, planes, stride=stride), + nn.BatchNorm2d(planes) + ) + + def forward(self, x): + y = x + y = self.relu(self.bn1(self.conv1(y))) + y = self.bn2(self.conv2(y)) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) \ No newline at end of file diff --git a/EdgeCape/models/backbones/dino.py b/EdgeCape/models/backbones/dino.py new file mode 100644 index 0000000000000000000000000000000000000000..4323eba0dfc74ba71b18274b57e91bcd91650046 --- /dev/null +++ b/EdgeCape/models/backbones/dino.py @@ -0,0 +1,206 @@ +import einops as E +import numpy as np +import torch +import torch.nn.functional as F +from transformers.models.vit_mae.modeling_vit_mae import ( + get_2d_sincos_pos_embed_from_grid, +) + + +def resize_pos_embed( + pos_embed: torch.Tensor, hw: tuple[int, int], has_cls_token: bool = True +): + """ + Resize positional embedding for arbitrary image resolution. 
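    For example (illustrative sizes), a DINO-style embedding for a 16x16 patch grid
    plus a [cls] token can be resized to a 24x24 grid::

        pos = torch.randn(1 + 16 * 16, 384)
        new = resize_pos_embed(pos, (24, 24))   # shape (1 + 24 * 24, 384)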
Resizing is done + via bicubic interpolation. + + Args: + pos_embed: Positional embedding tensor of shape ``(n_patches, embed_dim)``. + hw: Target height and width of the tensor after interpolation. + has_cls_token: Whether ``pos_embed[0]`` is for the ``[cls]`` token. + + Returns: + Tensor of shape ``(new_n_patches, embed_dim)`` of resized embedding. + ``new_n_patches`` is ``new_height * new_width`` if ``has_cls`` is False, + else ``1 + new_height * new_width``. + """ + + n_grid = pos_embed.shape[0] - 1 if has_cls_token else pos_embed.shape[0] + + # Do not resize if already in same shape. + if n_grid == hw[0] * hw[1]: + return pos_embed + + # Get original position embedding and extract ``[cls]`` token. + if has_cls_token: + cls_embed, pos_embed = pos_embed[[0]], pos_embed[1:] + + orig_dim = int(pos_embed.shape[0] ** 0.5) + + pos_embed = E.rearrange(pos_embed, "(h w) c -> 1 c h w", h=orig_dim) + pos_embed = F.interpolate( + pos_embed, hw, mode="bicubic", align_corners=False, antialias=True + ) + pos_embed = E.rearrange(pos_embed, "1 c h w -> (h w) c") + + # Add embedding of ``[cls]`` token back after resizing. + if has_cls_token: + pos_embed = torch.cat([cls_embed, pos_embed], dim=0) + + return pos_embed + + +def center_padding(images, patch_size): + _, _, h, w = images.shape + diff_h = h % patch_size + diff_w = w % patch_size + + if diff_h == 0 and diff_w == 0: + return images + + pad_h = patch_size - diff_h + pad_w = patch_size - diff_w + + pad_t = pad_h // 2 + pad_l = pad_w // 2 + pad_r = pad_w - pad_l + pad_b = pad_h - pad_t + + images = F.pad(images, (pad_l, pad_r, pad_t, pad_b)) + return images + + +def get_2d_sincos_pos_embed(embed_dim, grid_size, add_cls_token=False): + """ + COPIED FROM TRANSFORMERS PACKAGE AND EDITED TO ALLOW FOR DIFFERENT WIDTH-HEIGHT + Create 2D sin/cos positional embeddings. + + Args: + embed_dim (`int`): + Embedding dimension. + grid_size (`int`): + The grid height and width. + add_cls_token (`bool`, *optional*, defaults to `False`): + Whether or not to add a classification (CLS) token. 
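        Example (illustrative sizes; pairs naturally with ``center_padding`` above)::

            x = center_padding(torch.randn(1, 3, 250, 250), 14)        # (1, 3, 252, 252)
            pe = get_2d_sincos_pos_embed(384, (252 // 14, 252 // 14))  # numpy, (324, 384)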
+ + Returns: + (`torch.FloatTensor` of shape (grid_size*grid_size, embed_dim) or + (1+grid_size*grid_size, embed_dim): the + position embeddings (with or without classification token) + """ + grid_h = np.arange(grid_size[0], dtype=np.float32) + grid_w = np.arange(grid_size[1], dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size[0], grid_size[1]]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if add_cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def tokens_to_output(output_type, dense_tokens, cls_token, feat_hw): + if output_type == "cls": + assert cls_token is not None + output = cls_token + elif output_type == "gap": + output = dense_tokens.mean(dim=1) + elif output_type == "dense": + h, w = feat_hw + dense_tokens = E.rearrange(dense_tokens, "b (h w) c -> b c h w", h=h, w=w) + output = dense_tokens.contiguous() + elif output_type == "dense-cls": + assert cls_token is not None + h, w = feat_hw + dense_tokens = E.rearrange(dense_tokens, "b (h w) c -> b c h w", h=h, w=w) + cls_token = cls_token[:, :, None, None].repeat(1, 1, h, w) + output = torch.cat((dense_tokens, cls_token), dim=1).contiguous() + else: + raise ValueError() + + return output + +class DINO(torch.nn.Module): + def __init__( + self, + dino_name="dinov2", + model_name="vits14", + output="dense-cls", + layer=-1, + return_multilayer=True, + ): + super().__init__() + feat_dims = { + "vits14": 384, + "vitb8": 768, + "vitb16": 768, + "vitb14": 768, + "vitb14_reg": 768, + "vitl14": 1024, + "vitg14": 1536, + } + + # get model + self.model_name = dino_name + self.checkpoint_name = f"{dino_name}_{model_name}" + dino_vit = torch.hub.load(f"facebookresearch/{dino_name}", self.checkpoint_name) + self.vit = dino_vit.eval().to(torch.float32) + self.has_registers = "_reg" in model_name + + assert output in ["cls", "gap", "dense", "dense-cls"] + self.output = output + self.patch_size = self.vit.patch_embed.proj.kernel_size[0] + + feat_dim = feat_dims[model_name] + feat_dim = feat_dim * 2 if output == "dense-cls" else feat_dim + + num_layers = len(self.vit.blocks) + multilayers = [ + num_layers // 4 - 1, + num_layers // 2 - 1, + num_layers // 4 * 3 - 1, + num_layers - 1, + ] + + if return_multilayer: + self.feat_dim = [feat_dim, feat_dim, feat_dim, feat_dim] + self.multilayers = multilayers + else: + self.feat_dim = feat_dim + layer = multilayers[-1] if layer == -1 else layer + self.multilayers = [layer] + + # define layer name (for logging) + self.layer = "-".join(str(_x) for _x in self.multilayers) + + def forward(self, images): + + # pad images (if needed) to ensure it matches patch_size + images = center_padding(images, self.patch_size) + h, w = images.shape[-2:] + h, w = h // self.patch_size, w // self.patch_size + + if self.model_name == "dinov2": + x = self.vit.prepare_tokens_with_masks(images, None) + else: + x = self.vit.prepare_tokens(images) + + embeds = [] + for i, blk in enumerate(self.vit.blocks): + x = blk(x) + if i in self.multilayers: + embeds.append(x) + if len(embeds) == len(self.multilayers): + break + + num_spatial = h * w + outputs = [] + for i, x_i in enumerate(embeds): + cls_tok = x_i[:, 0] + # ignoring register tokens + spatial = x_i[:, -1 * num_spatial :] + x_i = tokens_to_output(self.output, spatial, cls_tok, (h, w)) + outputs.append(x_i) + + return outputs[0] if len(outputs) == 1 else outputs \ No newline at end of file diff --git 
a/EdgeCape/models/detectors/EdgeCape.py b/EdgeCape/models/detectors/EdgeCape.py new file mode 100644 index 0000000000000000000000000000000000000000..e8de214f8d5184d74b666d457bdb5ab775ed679d --- /dev/null +++ b/EdgeCape/models/detectors/EdgeCape.py @@ -0,0 +1,392 @@ +import math +import cv2 +import mmcv +import numpy as np +import torch +import torch.nn +import torch.nn.functional as F +from mmcv.image import imwrite +from mmcv.visualization.image import imshow +from mmpose.models import builder +from mmpose.models.builder import POSENETS +from mmpose.models.detectors.base import BasePose +from EdgeCape.models.backbones.adapter import DPT +from EdgeCape.models.backbones.dino import DINO + + +@POSENETS.register_module() +class EdgeCape(BasePose): + """ + EdgeCape: Edge-aware Context-Aware Pose Estimation. + Args: + keypoint_head (dict): Config for keypoint head. + encoder_config (dict): Config for encoder. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + freeze_backbone (bool): If True, freeze backbone. Default: False. + """ + + def __init__(self, + keypoint_head, + encoder_config, + train_cfg=None, + test_cfg=None, + freeze_backbone=False): + super().__init__() + feature_output_setting = encoder_config.get('output', 'dense-cls') + model_name = encoder_config.get('model_name', 'vits14') + self.encoder_sample = self.encoder_query = DINO(output=feature_output_setting, model_name=model_name) + self.probe = DPT(input_dims=self.encoder_query.feat_dim, output_dim=768) + self.backbone = 'dino_extractor' + self.freeze_backbone = freeze_backbone + if keypoint_head.get('freeze', None) is not None: + self.freeze_backbone = True + + self.keypoint_head_module = builder.build_head(keypoint_head) + self.keypoint_head_module.init_weights() + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.target_type = test_cfg.get('target_type', + 'GaussianHeatMap') # GaussianHeatMap + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head_module') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.encoder_sample.init_weights(pretrained) + self.encoder_query.init_weights(pretrained) + self.keypoint_head_module.init_weights() + + def forward(self, + img_s, + img_q, + target_s=None, + target_weight_s=None, + target_q=None, + target_weight_q=None, + img_metas=None, + return_loss=True, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. 
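        Example (illustrative; ``model`` is the config-built EdgeCape and the
        tensors/metas come from the few-shot data pipeline)::

            # training: single-nested inputs, returns a dict of losses
            losses = model(img_s, img_q, target_s, target_weight_s,
                           target_q, target_weight_q, img_metas, return_loss=True)

            # testing: same entry point, returns decoded keypoint predictions
            preds = model(img_s, img_q, target_s, target_weight_s,
                          target_q, target_weight_q, img_metas, return_loss=False)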
+ """ + if return_loss: + return self.forward_train(img_s, target_s, target_weight_s, img_q, + target_q, target_weight_q, img_metas, + **kwargs) + else: + return self.forward_test(img_s, target_s, target_weight_s, img_q, + target_q, target_weight_q, img_metas, + **kwargs) + + def forward_train(self, + img_s, + target_s, + target_weight_s, + img_q, + target_q, + target_weight_q, + img_metas, + **kwargs): + """Defines the computation performed at every call when training.""" + bs, _, h, w = img_q.shape + random_mask = kwargs.get('rand_mask', None) + output, initial_proposals, similarity_map, mask_s, reconstructed_keypoints = self.predict(img_s, + target_s, + target_weight_s, + img_q, + img_metas, + random_mask) + + # parse the img meta to get the target keypoints + device = output.device + target_keypoints = self.parse_keypoints_from_img_meta(img_metas, + device, + keyword='query') + + target_sizes = torch.tensor( + [img_q.shape[-2], img_q.shape[-1]]).unsqueeze(0).repeat( + img_q.shape[0], 1, 1) + + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head_module.get_loss(output, + initial_proposals, + similarity_map, + target_keypoints, + target_q, + target_weight_q * mask_s, + target_sizes, + reconstructed_keypoints, + ) + losses.update(keypoint_losses) + keypoint_accuracy = self.keypoint_head_module.get_accuracy(output[-1], + target_keypoints, + target_weight_q * mask_s, + target_sizes, + height=h) + losses.update(keypoint_accuracy) + return losses + + def forward_test(self, + img_s, + target_s, + target_weight_s, + img_q, + target_q, + target_weight_q, + img_metas=None, + vis_offset=True, + **kwargs): + + """Defines the computation performed at every call when testing.""" + batch_size, _, img_height, img_width = img_q.shape + output, initial_proposals, similarity_map, mask_s, reconstructed_keypoints = self.predict(img_s, + target_s, + target_weight_s, + img_q, + img_metas + ) + predicted_pose = output[-1].detach().cpu().numpy() + result = {} + + if self.with_keypoint: + keypoint_result = self.keypoint_head_module.decode(img_metas, predicted_pose, img_size=[img_width, img_height]) + result.update(keypoint_result) + + if vis_offset: + result.update({"points": torch.cat((initial_proposals[None], output)).cpu().numpy()}) + + result.update({"sample_image_file": [img_metas[i]['sample_image_file'] for i in range(len(img_metas))]}) + + return result + + def predict(self, + img_s, + target_s, + target_weight_s, + img_q, + img_metas=None, + random_mask=None): + + batch_size, _, img_height, img_width = img_q.shape + assert [i['sample_skeleton'][0] != i['query_skeleton'] for i in img_metas] + mask_s = target_weight_s[0] + for target_weight in target_weight_s: + mask_s = mask_s * target_weight + feature_q, feature_s = self.extract_features(img_s, img_q) + skeleton_lst = [i['sample_skeleton'][0] for i in img_metas] + + (output, initial_proposals, similarity_map, reconstructed_keypoints) = self.keypoint_head_module( + feature_q, feature_s, target_s, mask_s, skeleton_lst, random_mask=random_mask) + + return output, initial_proposals, similarity_map, mask_s, reconstructed_keypoints + + def extract_features(self, img_s, img_q): + with torch.no_grad(): + dino_feature_s = [self.encoder_sample(img) for img in img_s] + dino_feature_q = self.encoder_query(img_q) # [bs, 3, h, w] + if self.freeze_backbone: + with torch.no_grad(): + feature_s = [self.probe(f) for f in dino_feature_s] + feature_q = self.probe(dino_feature_q) + else: + feature_s = [self.probe(f) for f in dino_feature_s] + 
feature_q = self.probe(dino_feature_q) + + return feature_q, feature_s + + def parse_keypoints_from_img_meta(self, img_meta, device, keyword='query'): + """Parse keypoints from the img_meta. + + Args: + img_meta (dict): Image meta info. + device (torch.device): Device of the output keypoints. + keyword (str): 'query' or 'sample'. Default: 'query'. + + Returns: + Tensor: Keypoints coordinates of query images. + """ + + if keyword == 'query': + query_kpt = torch.stack([ + torch.tensor(info[f'{keyword}_joints_3d']).to(device) for info in img_meta], dim=0)[:, :, :2] + else: + query_kpt = [] + for info in img_meta: + if isinstance(info[f'{keyword}_joints_3d'][0], torch.Tensor): + samples = torch.stack(info[f'{keyword}_joints_3d']) + else: + samples = np.array(info[f'{keyword}_joints_3d']) + query_kpt.append(torch.tensor(samples).to(device)[:, :, :2]) + query_kpt = torch.stack(query_kpt, dim=0) # [bs, , num_samples, num_query, 2] + return query_kpt + + def get_full_similarity_map(self, feature_q, feature_s, h, w): + resized_feature_q = F.interpolate(feature_q, size=(h, w), + mode='bilinear') + resized_feature_s = [F.interpolate(s, size=(h, w), mode='bilinear') for + s in feature_s] + return [self.chunk_cosine_sim(f_s, resized_feature_q) for f_s in + resized_feature_s] + + # UNMODIFIED + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color='green', + pose_kpt_color=None, + pose_limb_color=None, + radius=4, + text_color=(255, 0, 0), + thickness=1, + font_scale=0.5, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_limb_color (np.array[Mx3]): Color of M limbs. + If None, do not draw limbs. + text_color (str or tuple or :obj:`Color`): Color of texts. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. 
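        Example (sketch with made-up inputs; a 3-keypoint skeleton)::

            result = [{
                'bbox': np.array([20, 20, 200, 200, 1.0]),
                'keypoints': np.array([[50., 60., 0.9],
                                       [80., 90., 0.8],
                                       [70., 140., 0.7]]),
            }]
            model.show_result('query.jpg', result,
                              skeleton=[[1, 2], [2, 3]],
                              pose_kpt_color=np.array([[255, 0, 0]] * 3),
                              pose_limb_color=np.array([[0, 255, 0]] * 2),
                              out_file='vis.jpg')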
+ """ + + img = mmcv.imread(img) + img = img.copy() + img_h, img_w, _ = img.shape + + bbox_result = [] + pose_result = [] + for res in result: + bbox_result.append(res['bbox']) + pose_result.append(res['keypoints']) + + if len(bbox_result) > 0: + bboxes = np.vstack(bbox_result) + # draw bounding boxes + mmcv.imshow_bboxes( + img, + bboxes, + colors=bbox_color, + top_k=-1, + thickness=thickness, + show=False, + win_name=win_name, + wait_time=wait_time, + out_file=None) + + for person_id, kpts in enumerate(pose_result): + # draw each point on image + if pose_kpt_color is not None: + assert len(pose_kpt_color) == len(kpts), ( + len(pose_kpt_color), len(kpts)) + for kid, kpt in enumerate(kpts): + x_coord, y_coord, kpt_score = int(kpt[0]), int( + kpt[1]), kpt[2] + if kpt_score > kpt_score_thr: + img_copy = img.copy() + r, g, b = pose_kpt_color[kid] + cv2.circle(img_copy, (int(x_coord), int(y_coord)), + radius, (int(r), int(g), int(b)), -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + + # draw limbs + if skeleton is not None and pose_limb_color is not None: + assert len(pose_limb_color) == len(skeleton) + for sk_id, sk in enumerate(skeleton): + pos1 = (int(kpts[sk[0] - 1, 0]), int(kpts[sk[0] - 1, + 1])) + pos2 = (int(kpts[sk[1] - 1, 0]), int(kpts[sk[1] - 1, + 1])) + if (pos1[0] > 0 and pos1[0] < img_w and pos1[1] > 0 + and pos1[1] < img_h and pos2[0] > 0 + and pos2[0] < img_w and pos2[1] > 0 + and pos2[1] < img_h + and kpts[sk[0] - 1, 2] > kpt_score_thr + and kpts[sk[1] - 1, 2] > kpt_score_thr): + img_copy = img.copy() + X = (pos1[0], pos2[0]) + Y = (pos1[1], pos2[1]) + mX = np.mean(X) + mY = np.mean(Y) + length = ((Y[0] - Y[1]) ** 2 + ( + X[0] - X[1]) ** 2) ** 0.5 + angle = math.degrees( + math.atan2(Y[0] - Y[1], X[0] - X[1])) + stickwidth = 2 + polygon = cv2.ellipse2Poly( + (int(mX), int(mY)), + (int(length / 2), int(stickwidth)), int(angle), + 0, 360, 1) + + r, g, b = pose_limb_color[sk_id] + cv2.fillConvexPoly(img_copy, polygon, + (int(r), int(g), int(b))) + transparency = max( + 0, + min( + 1, 0.5 * + (kpts[sk[0] - 1, 2] + kpts[ + sk[1] - 1, 2]))) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + + show, wait_time = 1, 1 + if show: + height, width = img.shape[:2] + max_ = max(height, width) + + factor = min(1, 800 / max_) + enlarge = cv2.resize( + img, (0, 0), + fx=factor, + fy=factor, + interpolation=cv2.INTER_CUBIC) + imshow(enlarge, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/EdgeCape/models/detectors/__init__.py b/EdgeCape/models/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ffc0456aedb28e94134caca15f93b81db3d0adae --- /dev/null +++ b/EdgeCape/models/detectors/__init__.py @@ -0,0 +1,3 @@ +from .EdgeCape import EdgeCape + +__all__ = ['EdgeCape'] diff --git a/EdgeCape/models/detectors/__pycache__/EdgeCape.cpython-39.pyc b/EdgeCape/models/detectors/__pycache__/EdgeCape.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93fd30c53fd3229bdb88a220c56881dd2576be8a Binary files /dev/null and b/EdgeCape/models/detectors/__pycache__/EdgeCape.cpython-39.pyc differ diff --git a/EdgeCape/models/detectors/__pycache__/__init__.cpython-39.pyc b/EdgeCape/models/detectors/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe71eac8979c4c9fabdd5a4cf220a80f8e518a54 Binary files /dev/null and 
b/EdgeCape/models/detectors/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/models/keypoint_heads/__init__.py b/EdgeCape/models/keypoint_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..58bbac1437bb1221908dc01345553e4982c43020 --- /dev/null +++ b/EdgeCape/models/keypoint_heads/__init__.py @@ -0,0 +1,5 @@ +from .head import TwoStageHead +from .skeleton import SkeletonPredictor + +__all__ = ['TwoStageHead', 'SkeletonPredictor'] + diff --git a/EdgeCape/models/keypoint_heads/__pycache__/__init__.cpython-39.pyc b/EdgeCape/models/keypoint_heads/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd211c38f41e624bb15962d2ab5ead42c4cfab14 Binary files /dev/null and b/EdgeCape/models/keypoint_heads/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/models/keypoint_heads/__pycache__/encoder_decoder.cpython-39.pyc b/EdgeCape/models/keypoint_heads/__pycache__/encoder_decoder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc1520b1de8245f9040db146f2efb52fa0d75275 Binary files /dev/null and b/EdgeCape/models/keypoint_heads/__pycache__/encoder_decoder.cpython-39.pyc differ diff --git a/EdgeCape/models/keypoint_heads/__pycache__/head.cpython-39.pyc b/EdgeCape/models/keypoint_heads/__pycache__/head.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..795f2101f995cacf23d21ad165f3943ff060d7c7 Binary files /dev/null and b/EdgeCape/models/keypoint_heads/__pycache__/head.cpython-39.pyc differ diff --git a/EdgeCape/models/keypoint_heads/__pycache__/skeleton.cpython-39.pyc b/EdgeCape/models/keypoint_heads/__pycache__/skeleton.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d42bad67f3d3f169f464a2ea35db3de13cf32d6 Binary files /dev/null and b/EdgeCape/models/keypoint_heads/__pycache__/skeleton.cpython-39.pyc differ diff --git a/EdgeCape/models/keypoint_heads/encoder_decoder.py b/EdgeCape/models/keypoint_heads/encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f6d3886679ecd3119647457b15f8c712d213fd41 --- /dev/null +++ b/EdgeCape/models/keypoint_heads/encoder_decoder.py @@ -0,0 +1,670 @@ +import math +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +import copy +from typing import Optional +from EdgeCape.models.utils.bias_attn import BiasedMultiheadAttention +from EdgeCape.models.utils.builder import TRANSFORMER +from mmcv.cnn import (xavier_init) + + +def inverse_sigmoid(x, eps=1e-3): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.gelu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class ProposalGenerator(nn.Module): + + def __init__(self, hidden_dim, proj_dim, dynamic_proj_dim): + super().__init__() + self.support_proj = nn.Linear(hidden_dim, proj_dim) + self.query_proj = nn.Linear(hidden_dim, proj_dim) + self.dynamic_proj = nn.Sequential( + nn.Linear(hidden_dim, dynamic_proj_dim), + 
nn.ReLU(), + nn.Linear(dynamic_proj_dim, hidden_dim)) + self.dynamic_act = nn.Tanh() + + def forward(self, query_feat, support_feat, spatial_shape): + """ + Args: + support_feat: [query, bs, c] + query_feat: [hw, bs, c] + spatial_shape: h, w + """ + device = query_feat.device + _, bs, c = query_feat.shape + h, w = spatial_shape + side_normalizer = torch.tensor([w, h]).to(query_feat.device)[None, + None, + :] # [bs, query, 2], Normalize the coord to [0,1] + + query_feat = query_feat.transpose(0, 1) + support_feat = support_feat.transpose(0, 1) + nq = support_feat.shape[1] + + fs_proj = self.support_proj(support_feat) # [bs, query, c] + fq_proj = self.query_proj(query_feat) # [bs, hw, c] + pattern_attention = self.dynamic_act( + self.dynamic_proj(fs_proj)) # [bs, query, c] + + fs_feat = (pattern_attention + 1) * fs_proj # [bs, query, c] + similarity = torch.bmm(fq_proj, + fs_feat.transpose(1, 2)) # [bs, hw, query] + similarity = similarity.transpose(1, 2).reshape(bs, nq, h, w) + grid_y, grid_x = torch.meshgrid( + torch.linspace(0.5, h - 0.5, h, dtype=torch.float32, device=device), # (h, w) + torch.linspace(0.5, w - 0.5, w, dtype=torch.float32, device=device), + indexing="ij") + + # compute softmax and sum up + coord_grid = torch.stack([grid_x, grid_y], dim=0).unsqueeze(0).unsqueeze(0).repeat(bs, nq, 1, 1, 1) + # [bs, query, 2, h, w] + coord_grid = coord_grid.permute(0, 1, 3, 4, 2) # [bs, query, h, w, 2] + similarity_softmax = similarity.flatten(2, 3).softmax(dim=-1) # [bs, query, hw] + similarity_coord_grid = similarity_softmax[:, :, :, None] * coord_grid.flatten(2, 3) + proposal_for_loss = similarity_coord_grid.sum(dim=2, keepdim=False) # [bs, + # query, 2] + proposal_for_loss = proposal_for_loss / side_normalizer + + max_pos = torch.argmax(similarity.reshape(bs, nq, -1), dim=-1, keepdim=True) # (bs, nq, 1) + max_mask = F.one_hot(max_pos, num_classes=w * h) # (bs, nq, 1, w*h) + max_mask = max_mask.reshape(bs, nq, w, h).type(torch.float) # (bs, nq, w, h) + local_max_mask = F.max_pool2d(input=max_mask, + kernel_size=3, + stride=1, + padding=1).reshape(bs, nq, w * h, 1) # (bs, nq, w*h, 1) + + # first, extract the local probability map with the mask + local_similarity_softmax = similarity_softmax[:, :, :, None] * local_max_mask # (bs, nq, w*h, 1) + + # then, re-normalize the local probability map + local_similarity_softmax = local_similarity_softmax / ( + local_similarity_softmax.sum(dim=-2, keepdim=True) + 1e-10) # [bs, nq, w*h, 1] + + # point-wise mulplication of local probability map and coord grid + proposals = local_similarity_softmax * coord_grid.flatten(2, 3) # [bs, # nq, w*h, 2] + + # sum the mulplication to obtain the final coord proposals + proposals = proposals.sum(dim=2) / side_normalizer # [bs, nq, 2] + + return proposal_for_loss, similarity, proposals + + +@TRANSFORMER.register_module() +class TwoStageSupportRefineTransformer(nn.Module): + + def __init__(self, + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + similarity_proj_dim=256, + dynamic_proj_dim=128, + return_intermediate_dec=True, + attn_bias=False, + max_hops=5, + use_bias_attn_module=False, + masked_supervision=False, + recon_features=False, + + ): + super().__init__() + + if num_encoder_layers > 0: + encoder_layer = TransformerEncoderLayer(d_model, nhead, + dim_feedforward, dropout, + activation, + normalize_before) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = 
TransformerEncoder(encoder_layer, + num_encoder_layers, encoder_norm) + else: + self.encoder = None + decoder_norm = nn.LayerNorm(d_model) + decoder_layer = TransformerDecoderLayer(d_model, nhead, + dim_feedforward, dropout, + activation, normalize_before, + use_bias_attn_module=use_bias_attn_module, + attn_bias=attn_bias, + max_hops=max_hops, + ) + + self.decoder = TransformerDecoder( + d_model, + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + ) + + self.proposal_generator = ProposalGenerator( + hidden_dim=d_model, + proj_dim=similarity_proj_dim, + dynamic_proj_dim=dynamic_proj_dim) + + self.d_model = d_model + self.nhead = nhead + self.recon_features = recon_features + self.masked_supervision = masked_supervision + self.freeze = '' + + def init_weights(self): + # follow the official DETR to init parameters + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + + def forward(self, query_image_feat, query_mask, support_kp_feat, + query_image_pos_embed, kp_pos_embedding, support_kp_mask, + position_embedding, kpt_branch, adj, attn_adj, return_attn_map=False, random_mask=None): + # flatten NxCxHxW to HWxNxC + # src = query image features, + # support_embed = support image embedding + bs, c, h, w = query_image_feat.shape + + query_image_feat = query_image_feat.flatten(2).permute(2, 0, 1) + query_image_pos_embed = query_image_pos_embed.flatten(2).permute(2, 0, 1) + kp_pos_embedding = kp_pos_embedding.flatten(2).permute(2, 0, 1) + query_image_pos_embed = torch.cat((query_image_pos_embed, kp_pos_embedding)) + query_image_embed = support_kp_feat.transpose(0, 1) # [query, bs, c ] + query_mask = query_mask.flatten(1) + + query_image_embed, refined_kp_support_embed = self.encoder( + query_image_feat, + query_image_embed, + src_key_padding_mask=query_mask, + query_key_padding_mask=support_kp_mask, + pos=query_image_pos_embed) + + # generate initial proposals and corresponding positional embedding. 
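# Standalone sketch of the proposal step invoked below (illustrative sizes): the
# support-query similarity map is softmaxed over the spatial grid and the expected
# pixel-center coordinate is taken per keypoint, normalized to [0, 1] by (w, h).
bs, nq, h, w = 2, 5, 24, 24
similarity = torch.randn(bs, nq, h, w)
gy, gx = torch.meshgrid(torch.linspace(0.5, h - 0.5, h),
                        torch.linspace(0.5, w - 0.5, w), indexing="ij")
grid = torch.stack([gx, gy], dim=-1).reshape(1, 1, h * w, 2)     # pixel centers, (1, 1, hw, 2)
attn = similarity.flatten(2).softmax(dim=-1).unsqueeze(-1)       # (bs, nq, hw, 1)
proposals = (attn * grid).sum(dim=2) / torch.tensor([w, h])      # (bs, nq, 2), normalized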
+ initial_proposals_for_loss, similarity_map, initial_proposals = ( + self.proposal_generator( + query_image_embed, refined_kp_support_embed, + spatial_shape=[h, w])) # inital_proposals has been normalized + initial_position_embedding = position_embedding.forward_coordinates(initial_proposals) + k, bs, c = refined_kp_support_embed.shape + mask_decoder_cond = self.masked_supervision and self.decoder.training + if mask_decoder_cond: + support_gt_keypoint = refined_kp_support_embed.transpose(0, 1).detach().clone() + old_keypoints = support_gt_keypoint.detach().clone().contiguous() + new_keypoints = old_keypoints * random_mask + ( + (1 - random_mask) * ~support_kp_mask.unsqueeze(-1)) * self.mask_token + new_keypoints = new_keypoints.transpose(0, 1) + attn_adj_aux = attn_adj + change_requires_grad([self.decoder, position_embedding, kpt_branch], False) + recon_hs, recon_out_points, _, _ = self.decoder( + new_keypoints, + query_image_embed.detach(), + memory_key_padding_mask=query_mask.detach(), + pos=query_image_pos_embed.detach(), + query_pos=initial_position_embedding.detach(), + tgt_key_padding_mask=support_kp_mask.detach(), + position_embedding=position_embedding, + initial_proposals=initial_proposals.detach(), + kpt_branch=kpt_branch, + adj=adj, + attn_adj=attn_adj_aux, + return_attn_map=return_attn_map) + reconstructed_keypoints = recon_out_points[-1] + change_requires_grad([self.decoder, position_embedding, kpt_branch], True) + else: + reconstructed_keypoints = None + + adj_dec, attn_adj_dec = adj, attn_adj + hs, out_points, adjs, attn_maps = self.decoder( + refined_kp_support_embed, + query_image_embed, + memory_key_padding_mask=query_mask, + pos=query_image_pos_embed, + query_pos=initial_position_embedding, + tgt_key_padding_mask=support_kp_mask, + position_embedding=position_embedding, + initial_proposals=initial_proposals, + kpt_branch=kpt_branch, + adj=adj_dec, + attn_adj=attn_adj_dec, + return_attn_map=return_attn_map) + + return ( + hs.transpose(1, 2), + initial_proposals_for_loss, + out_points, + similarity_map, + reconstructed_keypoints + ) + + +def change_requires_grad(models, status=True): + for model in models: + model.requires_grad_(status) + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, + src, + query, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + query_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + # src: [hw, bs, c] + # query: [num_query, bs, c] + # mask: None by default + # src_key_padding_mask: [bs, hw] + # query_key_padding_mask: [bs, nq] + # pos: [hw, bs, c] + + n, bs, c = src.shape + src_cat = torch.cat((src, query), dim=0) # [hw + nq, bs, c] + mask_cat = torch.cat((src_key_padding_mask, query_key_padding_mask), dim=1) # [bs, hw+nq] + output = src_cat + + for layer in self.layers: + output = layer( + output, + query_length=n, + src_mask=mask, + src_key_padding_mask=mask_cat, + pos=pos) + + if self.norm is not None: + output = self.norm(output) + + # resplit the output into src and query + refined_query = output[n:, :, :] # [nq, bs, c] + output = output[:n, :, :] # [n, bs, c] + + return output, refined_query + + +class TransformerDecoder(nn.Module): + + def __init__(self, + d_model, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + ): + super().__init__() + self.layers = 
_get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + self.ref_point_head = MLP(d_model, d_model, d_model, + 2) # this MLP will process the positional + + def forward(self, + support_feat, + query_feat, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + position_embedding=None, + initial_proposals=None, + kpt_branch=None, + adj=None, + attn_adj=None, + return_attn_map=False): + """ + position_embedding: Class used to compute positional embedding + inital_proposals: [bs, nq, 2], normalized coordinates of inital + proposals + kpt_branch: MLP used to predict the offsets for each query. + """ + + refined_support_feat = support_feat + intermediate = [] + attn_maps = [] + adjs = [] + bi = initial_proposals.detach() + query_points = [initial_proposals.detach()] + + tgt_key_padding_mask_remove_all_true = tgt_key_padding_mask.clone().to(tgt_key_padding_mask.device) + tgt_key_padding_mask_remove_all_true[tgt_key_padding_mask.logical_not().sum(dim=-1) == 0, 0] = False + + for layer_idx, layer in enumerate(self.layers): + if layer_idx == 0: # use positional embedding form inital + # proposals + query_pos_embed = query_pos.transpose(0, 1) + else: + # recalculate the positional embedding + query_pos_embed = position_embedding.forward_coordinates(bi) + query_pos_embed = query_pos_embed.transpose(0, 1) + query_pos_embed = self.ref_point_head(query_pos_embed) + + (refined_support_feat, query_feat, adjs_layer, img_attn_map, + kpt_attention_map) = layer( + refined_support_feat, + query_feat, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask_remove_all_true, + memory_key_padding_mask=memory_key_padding_mask, + concat_pos_embed=pos, + init_pos_emb=query_pos_embed, + adj=adj, + attn_adj=attn_adj, ) + + adj_gt = adjs_layer[1] # same for all layers + + if self.return_intermediate: + intermediate.append(self.norm(refined_support_feat)) + adjs.append(adjs_layer[0]) + + if return_attn_map: + attn_maps.append(img_attn_map) + + # update the query coordinates + delta_bi = kpt_branch[layer_idx]( + refined_support_feat.transpose(0, 1)) + + # Prediction loss + bi_tag = self.update(bi, delta_bi) + bi_pred = bi_tag + + bi = bi_tag.detach() + query_points.append(bi_pred) + + if self.norm is not None: + refined_support_feat = self.norm(refined_support_feat) + if self.return_intermediate: + intermediate.pop() + intermediate.append(refined_support_feat) + + if self.return_intermediate: + if adjs[0] is None: + return (torch.stack(intermediate), + query_points, + [[], adj_gt], + attn_maps) + return (torch.stack(intermediate), + query_points, + [torch.stack(adjs), adj_gt], + attn_maps) + + return (refined_support_feat.unsqueeze(0), + query_points, + [adjs, adj_gt], + attn_maps) + + def update(self, query_coordinates, delta_unsig): + query_coordinates_unsigmoid = inverse_sigmoid(query_coordinates) + new_query_coordinates = query_coordinates_unsigmoid + delta_unsig + new_query_coordinates = new_query_coordinates.sigmoid() + return new_query_coordinates + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, 
dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward(self, + src, + query_length, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src = self.with_pos_embed(src, pos) + q = k = src + # NOTE: compared with original implementation, we add positional + # embedding into the VALUE. + src2 = self.self_attn( + q, + k, + value=src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask, + need_weights=False)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + +class GCNLayer(nn.Module): + def __init__(self, + in_features, + out_features, + kernel_size=2, + use_bias=True, + activation=nn.ReLU(inplace=True), + batch_first=True): + super(GCNLayer, self).__init__() + self.conv = nn.Conv1d( + in_features, + out_features * kernel_size, + kernel_size=1, + padding=0, + stride=1, + dilation=1, + bias=use_bias, ) + + self.kernel_size = kernel_size + self.activation = activation + self.batch_first = batch_first + + def forward(self, x, adj): + assert adj.size(1) == self.kernel_size + if not self.batch_first: + x = x.permute(1, 2, 0) + else: + x = x.transpose(1, 2) + x = self.conv(x) + b, kc, v = x.size() + x = x.view(b, self.kernel_size, kc // self.kernel_size, v) + x = torch.einsum('bkcv,bkwv->bcw', (x, adj)) + if self.activation is not None: + x = self.activation(x) + if not self.batch_first: + x = x.permute(2, 0, 1) + else: + x = x.transpose(1, 2) + return x + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + attn_bias=False, + max_hops=5, + use_bias_attn_module=False, + add_pos_emb=False, + learn_hops=False, + edge_features=False, + two_way_attn=False, + ): + super().__init__() + self.d_model = d_model + self.attn_bias = attn_bias + self.max_hops = max_hops + self.learn_hops = learn_hops + self.edge_features = edge_features + self.two_way_attn = two_way_attn + if attn_bias or use_bias_attn_module: + self.self_attn = BiasedMultiheadAttention(d_model, nhead, + self_attention=True, + dropout=dropout, + max_hops=self.max_hops, + bias_attn=attn_bias) + else: + self.self_attn = nn.MultiheadAttention(d_model, nhead, + dropout=dropout) + self.add_pos_emb = add_pos_emb + self.multihead_attn = nn.MultiheadAttention(d_model * 2, nhead, dropout=dropout, vdim=d_model) + self.choker = nn.Linear(in_features=2 * d_model, out_features=d_model) + self.ffn1 = GCNLayer(d_model, dim_feedforward, batch_first=False) + self.ffn2 = nn.Linear(dim_feedforward, d_model) + self.dropout = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + if self.two_way_attn: + 
self.cross_attn_image_to_token = nn.MultiheadAttention(d_model * 2, nhead, dropout=dropout, vdim=d_model) + self.cross_attn_image_to_token_choker = nn.Linear(in_features=2 * d_model, out_features=d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm4 = nn.LayerNorm(d_model) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward(self, + refined_support_feat, + refined_query_feat, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + concat_pos_embed: Optional[Tensor] = None, + init_pos_emb: Optional[Tensor] = None, + adj: Optional[Tensor] = None, + attn_adj: Optional[Tensor] = None): + + q = k = v = refined_support_feat + if self.attn_bias: + if self.learn_hops: + attn_adj = adj[:, 1] + tgt2, kpt_attention_map = self.self_attn( + q, + k, + v, + attn_bias=attn_adj, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask, + need_weights=False) + else: + tgt2, kpt_attention_map = self.self_attn( + q, + k, + v, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask, + need_weights=False) + + refined_support_feat = refined_support_feat + self.dropout1(tgt2) + refined_support_feat = self.norm1(refined_support_feat) + # concatenate the positional embedding with the content feature, instead of direct addition + cross_attn_q = torch.cat((refined_support_feat, init_pos_emb + concat_pos_embed[refined_query_feat.shape[0]:]), + dim=-1) + cross_attn_k = torch.cat((refined_query_feat, concat_pos_embed[:refined_query_feat.shape[0]]), dim=-1) + + tgt2, attn_map = self.multihead_attn( + query=cross_attn_q, + key=cross_attn_k, + value=refined_query_feat, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask) + + refined_support_feat = refined_support_feat + self.dropout2(self.choker(tgt2)) + refined_support_feat = self.norm2(refined_support_feat) + adj_loss = gt_adj_loss = None + tgt2 = self.ffn2(self.dropout( + self.activation(self.ffn1(refined_support_feat, adj)))) + refined_support_feat = refined_support_feat + self.dropout3(tgt2) + refined_support_feat = self.norm3(refined_support_feat) + if self.two_way_attn: + q = torch.cat((refined_query_feat, concat_pos_embed[:refined_query_feat.shape[0]]), dim=-1) + k = torch.cat((refined_support_feat, init_pos_emb + concat_pos_embed[refined_query_feat.shape[0]:]), dim=-1) + v = refined_support_feat + tgt4, _ = self.cross_attn_image_to_token( + q, + k, + v, + attn_mask=tgt_mask, + need_weights=False) + refined_query_feat = refined_query_feat + self.dropout4(self.cross_attn_image_to_token_choker(tgt4)) + refined_query_feat = self.norm4(refined_query_feat) + + return refined_support_feat, refined_query_feat, [adj_loss, gt_adj_loss], attn_map, kpt_attention_map + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") + + +def clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) diff --git a/EdgeCape/models/keypoint_heads/head.py b/EdgeCape/models/keypoint_heads/head.py new file mode 
100644 index 0000000000000000000000000000000000000000..375291f7ecba4c9a96e2e42b6c9ff4697d029dc3 --- /dev/null +++ b/EdgeCape/models/keypoint_heads/head.py @@ -0,0 +1,387 @@ +from copy import deepcopy + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (Conv2d, Linear, xavier_init) +from mmcv.cnn.bricks.transformer import build_positional_encoding +from mmpose.core.evaluation import keypoint_pck_accuracy +from mmpose.core.post_processing import transform_preds +from mmpose.models import HEADS +from mmpose.models import builder +from mmpose.models.utils.ops import resize +from EdgeCape.models.utils import build_transformer + + +# From ControlNet Rep: https://github.com/lllyasviel/ControlNet-v1-1-nightly +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def inverse_sigmoid(x, eps=1e-3): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +class TokenDecodeMLP(nn.Module): + ''' + The MLP used to predict coordinates from the support keypoints tokens. + ''' + + def __init__(self, + in_channels, + hidden_channels, + out_channels=2, + num_layers=3): + super(TokenDecodeMLP, self).__init__() + layers = [] + for i in range(num_layers): + if i == 0: + layers.append(nn.Linear(in_channels, hidden_channels)) + layers.append(nn.GELU()) + else: + layers.append(nn.Linear(hidden_channels, hidden_channels)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_channels, out_channels)) + # TODO: what about tanh / 2 + center ? + self.mlp = nn.Sequential(*layers) + + def forward(self, x): + return self.mlp(x) + + +@HEADS.register_module() +class TwoStageHead(nn.Module): + ''' + In two stage regression A3, the proposal generator are moved into transformer. + All valid proposals will be added with an positional embedding to better regress the location + ''' + + def __init__(self, + in_channels, + transformer=None, + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + normalize=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1, + train_cfg=None, + test_cfg=None, + skeleton_head=None, + learn_skeleton=False, + masked_supervision=False, + freeze=None, + model_freeze=None, + masking_ratio=0.5, + ): + super().__init__() + + self.in_channels = in_channels + self.positional_encoding = build_positional_encoding(positional_encoding) + self.encoder_positional_encoding = build_positional_encoding(positional_encoding) + self.transformer = build_transformer(transformer) + self.embed_dims = self.transformer.d_model + self.with_heatmap_loss = with_heatmap_loss + self.heatmap_loss_weight = heatmap_loss_weight + self.skeleton_loss_weight = skeleton_loss_weight + assert 'num_feats' in positional_encoding + num_feats = positional_encoding['num_feats'] + assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ + f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ + f' and {num_feats}.' + + """Initialize layers of the transformer head.""" + self.input_proj = Conv2d(self.in_channels, self.embed_dims, kernel_size=1) + self.query_proj = Linear(self.in_channels, self.embed_dims) + # Instantiate the proposal generator and subsequent keypoint branch. 
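# --- Editor's sketch (illustrative, not part of the diff) ---
# The kpt_branch MLPs constructed below predict offsets in logit space: a
# proposal in [0, 1] is mapped through inverse_sigmoid (defined above), the
# predicted delta is added, and the result is squashed back with sigmoid
# (see TransformerDecoder.update and TwoStageHead.forward). A minimal
# restatement, assuming the same eps=1e-3 clamping:
import torch

def _inverse_sigmoid(x, eps=1e-3):
    x = x.clamp(min=0, max=1)
    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))

proposal = torch.tensor([[0.25, 0.75]])   # normalized (x, y) proposal
delta = torch.tensor([[0.10, -0.20]])     # hypothetical MLP output
refined = (_inverse_sigmoid(proposal) + delta).sigmoid()
# A zero delta reproduces the proposal exactly, which is why init_weights
# below zero-initializes the last layer of each TokenDecodeMLP.
# --- end of editor's sketch ---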
+ kpt_branch = TokenDecodeMLP( + in_channels=self.embed_dims, hidden_channels=self.embed_dims) + if share_kpt_branch: + self.kpt_branch = nn.ModuleList( + [kpt_branch for i in range(num_decoder_layer)]) + else: + self.kpt_branch = nn.ModuleList( + [deepcopy(kpt_branch) for i in range(num_decoder_layer)]) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatMap') + + skeleton_head['max_hop'] = transformer.get('max_hops', 4) + self.skeleton_head = builder.build_head(skeleton_head) + self.skeleton_head.init_weights() + self.learn_skeleton = learn_skeleton + self.masking_ratio = masking_ratio + transformer_d_model = transformer.get('d_model', 256) + self.masked_supervision = masked_supervision + self.transformer.masked_supervision = self.masked_supervision + self.transformer.mask_token = nn.Parameter(torch.zeros(1, transformer_d_model)) + self.transformer.masking_ratio = self.masking_ratio + self.use_zero_conv = skeleton_head.get('use_zero_conv', False) + if freeze == "skeleton" or model_freeze == "skeleton": + self.skeleton_head.requires_grad_(False) + self.input_proj.requires_grad_(False) + self.query_proj.requires_grad_(False) + elif freeze == "prediction" or model_freeze == "prediction": + self.kpt_branch.requires_grad_(False) + self.transformer.requires_grad_(False) + self.encoder_positional_encoding.requires_grad_(False) + self.transformer.freeze = freeze + + def init_weights(self): + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + """Initialize weights of the transformer head.""" + # The initialization for transformer is important + self.transformer.init_weights() + # initialization for input_proj & prediction head + for mlp in self.kpt_branch: + nn.init.constant_(mlp.mlp[-1].weight.data, 0) + nn.init.constant_(mlp.mlp[-1].bias.data, 0) + nn.init.xavier_uniform_(self.input_proj.weight, gain=1) + nn.init.constant_(self.input_proj.bias, 0) + nn.init.xavier_uniform_(self.query_proj.weight, gain=1) + nn.init.constant_(self.query_proj.bias, 0) + if self.use_zero_conv: + self.skeleton_head.zero_conv = zero_module(self.skeleton_head.zero_conv) + + def forward(self, + feature_q, + feature_s, + target_s, + mask_s, + skeleton_lst, + return_attn_maps=False, + random_mask=None): + feature_q = self.input_proj(feature_q) # [bs, dim, h64, w64], /64 resolution. + bs, dim, h, w = feature_q.shape + # Feature map pos embedding + masks = feature_q.new_zeros((feature_q.shape[0], feature_q.shape[2], feature_q.shape[3])).to(torch.bool) + query_image_pos_embed = self.positional_encoding(masks) # [bs, embed_dim, h, w] + query_embed_list = [] + for i, (feature, target) in enumerate(zip(feature_s, target_s)): + # resize the support feature back to the heatmap sizes. + resized_feature = resize( + input=feature, + size=target.shape[-2:], + mode='bilinear', + align_corners=False) + target = target / (target.sum(dim=-1).sum(dim=-1)[:, :, None, None] + 1e-8) + support_keypoints = target.flatten(2) @ resized_feature.flatten(2).permute(0, 2, 1) + query_embed_list.append(support_keypoints) + # support_keypoints is the support keypoint features. + support_keypoints = torch.mean(torch.stack(query_embed_list, dim=0), 0) + support_keypoints = support_keypoints * mask_s + support_keypoints = self.query_proj(support_keypoints) + kp_mask = (~mask_s.to(torch.bool)).squeeze(-1) # True indicating this query matched no actual joints. 
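# --- Editor's sketch (illustrative, not part of the diff) ---
# The loop above pools one feature vector per support keypoint: each GT
# heatmap is normalized to sum to 1 and used as spatial weights over the
# (resized) support feature map. Shape walk-through with hypothetical sizes:
import torch

bs, K, C, h, w = 2, 17, 256, 64, 64
target = torch.rand(bs, K, h, w)           # per-keypoint support heatmaps
feature = torch.rand(bs, C, h, w)          # resized support feature map
target = target / (target.sum(dim=-1).sum(dim=-1)[:, :, None, None] + 1e-8)
support_kp = target.flatten(2) @ feature.flatten(2).permute(0, 2, 1)
assert support_kp.shape == (bs, K, C)      # one C-dim token per keypoint
# Invisible keypoints are then zeroed via `support_keypoints * mask_s`, and
# kp_mask flags them (True = no annotated joint) for the attention masks.
# --- end of editor's sketch ---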
+ kp_pos_embedding = feature_q.new_zeros((bs, self.embed_dims, 1, target_s[0].shape[1])).to(torch.bool) + + # Predict Skeleton + skeleton_kp_embed = kp_pos_embedding + support_image_features = feature_s + support_keypoints_skeleton, support_image_features_skeleton = support_keypoints, support_image_features + adj, attn_adj, unnormalized_adj = self.skeleton_head(skeleton_lst, + support_keypoints_skeleton, + support_image_features_skeleton, + kp_mask, + query_image_pos_embed) + + (outs_dec, initial_proposals, out_points, + similarity_map, reconstructed_keypoints) = self.transformer(feature_q, + masks, + support_keypoints, + query_image_pos_embed, + kp_pos_embedding, + kp_mask, + self.positional_encoding, + self.kpt_branch, + adj, + attn_adj, + return_attn_map=return_attn_maps, + random_mask=random_mask) + + output_kpts = [] + for idx in range(outs_dec.shape[0]): + layer_delta_unsig = self.kpt_branch[idx](outs_dec[idx]) + layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(out_points[idx]) + output_kpts.append(layer_outputs_unsig.sigmoid()) + + return torch.stack(output_kpts, dim=0), initial_proposals, similarity_map, reconstructed_keypoints + + def get_loss(self, output, initial_proposals, similarity_map, target, + target_heatmap, target_weight, target_sizes, reconstructed_keypoints): + """Calculate top-down keypoint loss.""" + + losses = dict() + if self.learn_skeleton: + num_dec_layer, bs, nq = output.shape[:3] + normalizer = target_weight.squeeze(dim=-1).sum(dim=-1) # [bs, ] + normalizer[normalizer == 0] = 1 + reconstructed_keypoints, mask_indices = reconstructed_keypoints + support_gt_keypoints = target / target_sizes.to(output.device) + pred_loss = F.l1_loss(reconstructed_keypoints, support_gt_keypoints, reduction="none") + pred_loss = pred_loss.sum(dim=-1, keepdim=False) * target_weight.squeeze(dim=-1) + pred_loss = pred_loss.sum(dim=-1, keepdim=False) / normalizer + pred_loss = pred_loss.sum(dim=-1, keepdim=False) / bs + pred_loss = pred_loss * self.skeleton_loss_weight + losses['adj_reconstruct_loss'] = pred_loss.sum() + + num_dec_layer, bs, nq = output.shape[:3] + target_sizes = target_sizes.to(output.device) # [bs, 1, 2] + target = target / target_sizes + target = target[None, :, :, :].repeat(num_dec_layer, 1, 1, 1) + # set the weight for unset query point to be zero + normalizer = target_weight.squeeze(dim=-1).sum(dim=-1) # [bs, ] + normalizer[normalizer == 0] = 1 + + # compute the heatmap loss + if self.with_heatmap_loss: + losses['heatmap_loss'] = self.heatmap_loss( + similarity_map, target_heatmap, target_weight, + normalizer) * self.heatmap_loss_weight + + # compute L1 loss for inital_proposals + proposal_l1_loss = F.l1_loss(initial_proposals, target[0], reduction="none") + proposal_l1_loss = proposal_l1_loss.sum(dim=-1, keepdim=False) * target_weight.squeeze(dim=-1) + proposal_l1_loss = proposal_l1_loss.sum(dim=-1, keepdim=False) / normalizer # [bs, ] + losses['proposal_loss'] = proposal_l1_loss.sum() / bs + + # compute L1 loss for each layer + for idx in range(num_dec_layer): + layer_output, layer_target = output[idx], target[idx] + l1_loss = F.l1_loss(layer_output, layer_target, reduction="none") # [bs, query, 2] + l1_loss = l1_loss.sum(dim=-1, keepdim=False) * target_weight.squeeze(dim=-1) # [bs, query] + # normalize the loss for each sample with the number of visible joints + l1_loss = l1_loss.sum(dim=-1, keepdim=False) / normalizer # [bs, ] + losses['l1_loss' + '_layer' + str(idx)] = l1_loss.sum() / bs + + return losses + + def heatmap_loss(self, similarity_map, 
target_heatmap, target_weight, + normalizer): + # similarity_map: [bs, num_query, h, w] + # target_heatmap: [bs, num_query, sh, sw] + # target_weight: [bs, num_query, 1] + + # preprocess the similarity_map + h, w = similarity_map.shape[-2:] + # similarity_map = torch.clamp(similarity_map, 0.0, None) + similarity_map = similarity_map.sigmoid() + + target_heatmap = F.interpolate( + target_heatmap, size=(h, w), mode='bilinear') + target_heatmap = (target_heatmap / + (target_heatmap.max(dim=-1)[0].max(dim=-1)[0] + 1e-10)[:, :, None, None]) + + l2_loss = F.mse_loss(similarity_map, target_heatmap, reduction="none") # bs, nq, h, w + l2_loss = l2_loss * target_weight[:, :, :, None] # bs, nq, h, w + l2_loss = l2_loss.flatten(2, 3).sum(-1) / (h * w) # bs, nq + l2_loss = l2_loss.sum(-1) / normalizer # bs, + + return l2_loss.mean() + + def get_accuracy(self, output, target, target_weight, target_sizes, height=256): + """Calculate accuracy for top-down keypoint loss. + + Args: + output (torch.Tensor[NxKx2]): estimated keypoints in ABSOLUTE coordinates. + target (torch.Tensor[NxKx2]): gt keypoints in ABSOLUTE coordinates. + target_weight (torch.Tensor[NxKx1]): Weights across different joint types. + target_sizes (torch.Tensor[Nx2): shapes of the image. + """ + # NOTE: In POMNet, PCK is estimated on 1/8 resolution, which is slightly different here. + + accuracy = dict() + output = output * float(height) + output, target, target_weight, target_sizes = ( + output.detach().cpu().numpy(), target.detach().cpu().numpy(), + target_weight.squeeze(-1).long().detach().cpu().numpy(), + target_sizes.squeeze(1).detach().cpu().numpy()) + + _, avg_acc, _ = keypoint_pck_accuracy( + output, + target, + target_weight.astype(np.bool8), + thr=0.2, + normalize=target_sizes) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def decode(self, img_metas, output, img_size, **kwargs): + """Decode the predicted keypoints from prediction. + + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + output (np.ndarray[N, K, H, W]): model predicted heatmaps. + """ + batch_size = len(img_metas) + W, H = img_size + output = output * np.array([W, H])[None, None, :] # [bs, query, 2], coordinates with recovered shapes. 
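# --- Editor's sketch (illustrative, not part of the diff) ---
# get_accuracy above reports PCK@0.2: a keypoint counts as correct when its
# distance to the ground truth, normalized per axis by the size passed in
# `normalize`, falls below thr=0.2, averaged over visible joints. A rough
# NumPy restatement of what mmpose's keypoint_pck_accuracy computes (the
# library additionally averages per keypoint type):
import numpy as np

def pck(pred, gt, visible, normalize, thr=0.2):
    # pred, gt: [N, K, 2]; visible: [N, K] bool; normalize: [N, 2]
    dist = np.linalg.norm((pred - gt) / normalize[:, None, :], axis=-1)
    correct = (dist < thr) & visible
    return correct.sum() / max(visible.sum(), 1)
# --- end of editor's sketch ---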
+ + if 'bbox_id' or 'query_bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + c = np.zeros((batch_size, 2), dtype=np.float32) + s = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size) + for i in range(batch_size): + c[i, :] = img_metas[i]['query_center'] + s[i, :] = img_metas[i]['query_scale'] + image_paths.append(img_metas[i]['query_image_file']) + + if 'query_bbox_score' in img_metas[i]: + score[i] = np.array( + img_metas[i]['query_bbox_score']).reshape(-1) + if 'bbox_id' in img_metas[i]: + bbox_ids.append(img_metas[i]['bbox_id']) + elif 'query_bbox_id' in img_metas[i]: + bbox_ids.append(img_metas[i]['query_bbox_id']) + + preds = np.zeros(output.shape) + for idx in range(output.shape[0]): + preds[idx] = transform_preds( + output[idx], + c[idx], + s[idx], [W, H], + use_udp=self.test_cfg.get('use_udp', False)) + + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds[:, :, 0:2] = preds[:, :, 0:2] + all_preds[:, :, 2:3] = 1.0 # NOTE: Currently, assume all predicted points are of 100% confidence. + all_boxes[:, 0:2] = c[:, 0:2] + all_boxes[:, 2:4] = s[:, 0:2] + all_boxes[:, 4] = np.prod(s * 200.0, axis=1) + all_boxes[:, 5] = score + + result = {} + + result['preds'] = all_preds + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + return result diff --git a/EdgeCape/models/keypoint_heads/skeleton.py b/EdgeCape/models/keypoint_heads/skeleton.py new file mode 100644 index 0000000000000000000000000000000000000000..ee530de319e88dd706c8f68770b436bdd9a3cac3 --- /dev/null +++ b/EdgeCape/models/keypoint_heads/skeleton.py @@ -0,0 +1,208 @@ +import random +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init +from mmpose.models import HEADS +from EdgeCape.models.keypoint_heads.encoder_decoder import (TransformerDecoderLayer, _get_clones) + + +@HEADS.register_module() +class SkeletonPredictor(nn.Module): + def __init__(self, + d_model=256, + nhead=8, + num_layers=3, + dim_feedforward=768, + dropout=0.1, + activation="relu", + normalize_before=False, + learn_skeleton: bool = False, + max_hop: int = 5, + adj_normalization: bool = True, + markov_bias: bool = True, + mask_res: bool = False, + use_zero_conv: bool = True, + max_hops: int = 4, + two_way_attn: bool = True, + gcn_norm: bool = False, ): + super(SkeletonPredictor, self).__init__() + if num_layers > 0: + decoder_layer = TransformerDecoderLayer(d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation='relu', + normalize_before=normalize_before, + max_hops=max_hops, + two_way_attn=two_way_attn) + self.skeleton_predictor = _get_clones(decoder_layer, num_layers) + self.gcn_norm = gcn_norm + self.image_project = nn.Conv2d(dim_feedforward, d_model, kernel_size=1) + self.learn_skeleton = learn_skeleton + self.max_hop = max_hop + if activation == "relu": + self.activation = nn.ReLU() + else: + self.activation = nn.Sigmoid() + self.adj_normalization = adj_normalization + self.markov_bias = markov_bias + self.k_proj = nn.Linear(d_model, d_model) + self.q_proj = nn.Linear(d_model, d_model) + self.mh_linear = nn.Conv2d(nhead, 1, kernel_size=1) + self.num_heads = nhead + self.mask_res = mask_res + self.use_zero_conv = use_zero_conv + if self.use_zero_conv: + self.zero_conv = nn.Conv2d(1, 1, kernel_size=1, stride=1, padding=0) + + def forward(self, + skeleton: list, + kp_features: torch.Tensor, + 
image_features: torch.Tensor, + kp_mask: torch.Tensor, + query_image_pos_embed: torch.Tensor, + ) -> [torch.Tensor, torch.Tensor]: + + assert skeleton is not None + b, num_pts, _ = kp_features.shape + gt_adj, _ = self.adj_mx_from_edges(num_pts=num_pts, + skeleton=skeleton, + mask=kp_mask, + device=kp_features.device) + binary_adj = gt_adj[:, 1] > 0 + if not self.learn_skeleton: + return gt_adj, None, binary_adj + adj, adj_for_attn, unnnormalized_adj = self.predict_adj(image_features=image_features, + kp_features=kp_features, + kp_mask=kp_mask, + query_image_pos_embed=query_image_pos_embed, + gt_adj=binary_adj) + return adj, adj_for_attn, unnnormalized_adj + + def refine_features(self, + image_features: torch.Tensor, + kp_features: torch.Tensor, + kp_mask: torch.Tensor, + query_image_pos_embed: torch.Tensor, + adj: torch.Tensor = None, + ): + + bs, num_pts, _ = kp_features.shape + adj = self.soft_normalize_adj(adj, kp_mask) + image_features = [self.image_project(image_feature) for image_feature in image_features] + zero_pos_embed = torch.zeros_like(kp_features).flatten(2).permute(1, 0, 2) + query_image_pos_embed = query_image_pos_embed.flatten(2).permute(2, 0, 1) + concat_pos_embed = torch.cat((query_image_pos_embed, zero_pos_embed)) + kp_features = kp_features.flatten(2).permute(1, 0, 2) + image_features = [image_feature.flatten(2).permute(2, 0, 1) for image_feature in image_features] + tgt_key_padding_mask_remove_all_true = kp_mask.clone().to(kp_mask.device) + tgt_key_padding_mask_remove_all_true[kp_mask.logical_not().sum(dim=-1) == 0, 0] = False + kp_feat_lst = [] + for s, image_feature in enumerate(image_features): + s_kp_features = kp_features.clone() + for i, layer in enumerate(self.skeleton_predictor): + s_kp_features, image_feature, _, _, _ = layer( + s_kp_features, + image_feature, + tgt_key_padding_mask=tgt_key_padding_mask_remove_all_true, + concat_pos_embed=concat_pos_embed, + init_pos_emb=zero_pos_embed, + adj=adj, + ) + kp_feat_lst.append(s_kp_features.permute(1, 0, 2)) + kp_features = torch.mean(torch.stack(kp_feat_lst, dim=0), 0) + + return kp_features + + def predict_adj(self, + image_features: torch.Tensor, + kp_features: torch.Tensor, + kp_mask: torch.Tensor, + query_image_pos_embed: torch.Tensor, + gt_adj: torch.Tensor = None): + + kp_features = self.refine_features(image_features, + kp_features, + kp_mask, + query_image_pos_embed, + gt_adj) + + normalized_adj, unnnormalized_adj = self.predict_skeleton(kp_features, kp_mask, gt_adj) + attn_bias_matrix = self.markov_transition_matrix(normalized_adj[:, 1]) + return normalized_adj, attn_bias_matrix, unnnormalized_adj + + def predict_skeleton(self, kp_features, kp_mask, gt_adj): + bs, num_pts, _ = kp_features.shape + # Self-attention matrix from kp_features + kp_features = kp_features.permute(1, 0, 2) * ~kp_mask.transpose(0, 1).unsqueeze(-1) + q_kp = self.q_proj(kp_features).contiguous().view(num_pts, bs * self.num_heads, -1).transpose(0, 1) + k_kp = self.k_proj(kp_features).contiguous().view(num_pts, bs * self.num_heads, -1).transpose(0, 1) + attn = torch.bmm(q_kp, k_kp.transpose(1, 2)).view(bs, self.num_heads, num_pts, num_pts) + unnormalized_adj_matrix = self.mh_linear(attn).squeeze(1) + unnormalized_adj_matrix = (unnormalized_adj_matrix + unnormalized_adj_matrix.transpose(1, 2)) / 2 + unnormalized_adj_matrix = self.combine_adj(gt_adj, unnormalized_adj_matrix) + unnormalized_adj_matrix = self.activation(unnormalized_adj_matrix) + normalized_adj = self.soft_normalize_adj(unnormalized_adj_matrix, kp_mask, gt_adj) + 
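# --- Editor's sketch (illustrative, not part of the diff) ---
# predict_skeleton turns pairwise keypoint similarities into a learned
# adjacency: the single-channel logits are symmetrized, added to the binary
# GT skeleton (optionally through the zero-initialized 1x1 conv,
# ControlNet-style), passed through the activation, then row-normalized with
# padded keypoints masked out. A minimal restatement with made-up numbers:
import torch

logits = torch.rand(1, 5, 5)                                 # hypothetical pairwise scores
logits = (logits + logits.transpose(1, 2)) / 2               # enforce symmetry
gt_adj = torch.eye(5)[None]                                  # stand-in GT skeleton
adj = torch.relu(gt_adj + logits)                            # combine_adj + activation
kp_mask = torch.tensor([[False, False, False, True, True]])  # last two joints padded
adj = adj * ~kp_mask[..., None] * ~kp_mask[:, None]          # zero padded rows/columns
adj = adj / (adj.sum(dim=-1, keepdim=True) + 1e-8)           # row-stochastic, as in soft_normalize_adj
# --- end of editor's sketch ---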
unnormalized_adj_matrix = unnormalized_adj_matrix * ~kp_mask.unsqueeze(-1) * ~kp_mask.unsqueeze(-2) + return normalized_adj, unnormalized_adj_matrix + + def combine_adj(self, gt_adj, predicted_adj): + if self.use_zero_conv: + predicted_adj = self.zero_conv(predicted_adj.unsqueeze(1)).squeeze(1) + adj = gt_adj + predicted_adj + return adj + + def markov_transition_matrix(self, adj): + """ + Compute the Markov transition matrix from the adjacency matrix. + :param adj: (bs, num_pts, num_pts) + :return: (bs, num_pts, num_pts) + """ + adj = adj / (adj.sum(dim=-1, keepdim=True) + 1e-8) + transfer_mat = [torch.matrix_power(adj.float(), d) for d in range(self.max_hop + 1)] + arrive_mat = torch.stack(transfer_mat) + return arrive_mat + + def init_weights(self): + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + """Initialize weights of the transformer head.""" + # nn.init.xavier_uniform_(self.input_proj.weight, gain=1) + # nn.init.constant_(self.input_proj.bias, 0) + + def adj_mx_from_edges(self, num_pts, skeleton, mask=None, device='cuda'): + binary_adj_mx = torch.empty(0, device=device) + batch_size = len(skeleton) + for b in range(batch_size): + edges = torch.tensor(skeleton[b]) + adj = torch.zeros(num_pts, num_pts, device=device) + if len(edges.shape) > 1: + adj[edges[:, 0], edges[:, 1]] = 1 + adj[edges[:, 1], edges[:, 0]] = 1 + binary_adj_mx = torch.concatenate((binary_adj_mx, adj.unsqueeze(0)), dim=0) + if mask is not None: + adj = self.normalize_adj(binary_adj_mx, mask) + else: + adj = None + return adj, binary_adj_mx + + def normalize_adj(self, binary_adj_mx, mask): + trans_adj_mx = torch.transpose(binary_adj_mx, 1, 2) + cond = (trans_adj_mx > binary_adj_mx).float() + adj_unnormalized = binary_adj_mx + trans_adj_mx * cond - binary_adj_mx * cond + adj = adj_unnormalized * ~mask[..., None] * ~mask[:, None] + adj = torch.nan_to_num(adj / adj.sum(dim=-1, keepdim=True)) + adj = torch.stack((torch.diag_embed(~mask), adj), dim=1) + return adj + + def soft_normalize_adj(self, adj_mx, mask, gt_adj=None): + adj_mask = ~mask[..., None] * ~mask[:, None] + if self.mask_res and gt_adj is not None: + adj_mask = gt_adj + adj = adj_mx * adj_mask + if self.adj_normalization: + adj = adj / (adj.sum(dim=-1, keepdim=True) + 1e-8) + if not self.gcn_norm: + adj = torch.stack((torch.diag_embed(~mask), adj), dim=1) + return adj diff --git a/EdgeCape/models/utils/__init__.py b/EdgeCape/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..af4e06657cc4b8a6e0c1b306cd4011aebead30dc --- /dev/null +++ b/EdgeCape/models/utils/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
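# --- Editor's sketch (illustrative, not part of the diff) ---
# markov_transition_matrix above stacks the powers A^0 ... A^max_hop of the
# row-normalized adjacency, i.e. the probability of reaching joint j from
# joint i in exactly d random-walk steps; BiasedMultiheadAttention later
# projects these per-hop maps into per-head attention biases. Tiny example
# on a hypothetical 3-joint chain 0-1-2:
import torch

edges = torch.tensor([[0, 1], [1, 2]])
A = torch.zeros(3, 3)
A[edges[:, 0], edges[:, 1]] = 1
A[edges[:, 1], edges[:, 0]] = 1                  # symmetric binary adjacency
A = A / (A.sum(dim=-1, keepdim=True) + 1e-8)     # row-normalize
hops = torch.stack([torch.matrix_power(A, d) for d in range(3)])
# hops[0] is the identity, hops[1] one-step transitions, hops[2] two-step, ...
# --- end of editor's sketch ---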
+from .builder import build_linear_layer, build_transformer, build_backbone +from .transformer import (DetrTransformerDecoderLayer, DetrTransformerDecoder, + DetrTransformerEncoder, DynamicConv) +from .positional_encoding import (LearnedPositionalEncoding, + SinePositionalEncoding) + +from EdgeCape.models.keypoint_heads.encoder_decoder import TwoStageSupportRefineTransformer + +__all__ = [ + 'build_transformer', 'build_backbone', 'build_linear_layer', 'DetrTransformerDecoderLayer', + 'DetrTransformerDecoder', 'DetrTransformerEncoder', + 'LearnedPositionalEncoding', 'SinePositionalEncoding', + 'TwoStageSupportRefineTransformer', +] diff --git a/EdgeCape/models/utils/__pycache__/__init__.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77cd369ac355297ee71f071ba0d2be918d29ed00 Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/__pycache__/bias_attn.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/bias_attn.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b88361a03ac2931ef6ef21981fdb4d09aea6a9dd Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/bias_attn.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/__pycache__/builder.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/builder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74a056f3a44266f13d32cef67348a9f240847f08 Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/builder.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/__pycache__/positional_encoding.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/positional_encoding.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2daeaaefd66ba34f5b43b324fecc52bbc6bb9ff5 Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/positional_encoding.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/__pycache__/transformer.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/transformer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7eb92254f1a20bf6f818903f5b0b9b16096f9d2d Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/transformer.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/__pycache__/visualization.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/visualization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7c16e1384c7c2e46f71536f0ae3c0050361d029 Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/visualization.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/backbone.py b/EdgeCape/models/utils/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..65c3a0d2eb30060ad3c456382747ae27d3805422 --- /dev/null +++ b/EdgeCape/models/utils/backbone.py @@ -0,0 +1,116 @@ +# -------------------------------------------------------- +# SimMIM +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + +import os +import torch +import torch.distributed as dist +import numpy as np +from scipy import interpolate + +def load_pretrained(config, model, logger): + checkpoint = torch.load(config, map_location='cpu') + checkpoint_model = checkpoint['model'] + + if any([True if 'encoder.' 
in k else False for k in checkpoint_model.keys()]): + checkpoint_model = {k.replace('encoder.', ''): v for k, v in checkpoint_model.items() if k.startswith('encoder.')} + print('Detect pre-trained model, remove [encoder.] prefix.') + else: + print('Detect non-pre-trained model, pass without doing anything.') + + checkpoint = remap_pretrained_keys_swin(model, checkpoint_model, logger) + msg = model.load_state_dict(checkpoint_model, strict=False) + print(msg) + + del checkpoint + torch.cuda.empty_cache() + + +def remap_pretrained_keys_swin(model, checkpoint_model, logger): + state_dict = model.state_dict() + + # Geometric interpolation when pre-trained patch size mismatch with fine-tuned patch size + all_keys = list(checkpoint_model.keys()) + for key in all_keys: + if "relative_position_bias_table" in key: + relative_position_bias_table_pretrained = checkpoint_model[key] + relative_position_bias_table_current = state_dict[key] + L1, nH1 = relative_position_bias_table_pretrained.size() + L2, nH2 = relative_position_bias_table_current.size() + if nH1 != nH2: + print(f"Error in loading {key}, passing......") + else: + if L1 != L2: + print(f"{key}: Interpolate relative_position_bias_table using geo.") + src_size = int(L1 ** 0.5) + dst_size = int(L2 ** 0.5) + + def geometric_progression(a, r, n): + return a * (1.0 - r ** n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + # if q > 1.090307: + # q = 1.090307 + + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q ** (i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + + print("Original positions = %s" % str(x)) + print("Target positions = %s" % str(dx)) + + all_rel_pos_bias = [] + + for i in range(nH1): + z = relative_position_bias_table_pretrained[:, i].view(src_size, src_size).float().numpy() + f_cubic = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append(torch.Tensor(f_cubic(dx, dy)).contiguous().view(-1, 1).to( + relative_position_bias_table_pretrained.device)) + + new_rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) + checkpoint_model[key] = new_rel_pos_bias + + # delete relative_position_index since we always re-init it + relative_position_index_keys = [k for k in checkpoint_model.keys() if "relative_position_index" in k] + for k in relative_position_index_keys: + del checkpoint_model[k] + + # delete relative_coords_table since we always re-init it + relative_coords_table_keys = [k for k in checkpoint_model.keys() if "relative_coords_table" in k] + for k in relative_coords_table_keys: + del checkpoint_model[k] + + # re-map keys due to name change + rpe_mlp_keys = [k for k in checkpoint_model.keys() if "rpe_mlp" in k] + for k in rpe_mlp_keys: + checkpoint_model[k.replace('rpe_mlp', 'cpb_mlp')] = checkpoint_model.pop(k) + + # delete attn_mask since we always re-init it + attn_mask_keys = [k for k in checkpoint_model.keys() if "attn_mask" in k] + for k in attn_mask_keys: + del checkpoint_model[k] + + return checkpoint_model \ No newline at end of file diff --git a/EdgeCape/models/utils/bias_attn.py b/EdgeCape/models/utils/bias_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..1dbe51e203dd3267dfe027d45a5adfa0ecc7ef02 --- /dev/null +++ b/EdgeCape/models/utils/bias_attn.py @@ -0,0 
+1,265 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Optional, Tuple + +import torch +import torchvision +from fairseq import utils +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.quant_noise import quant_noise +from torch import Tensor, nn +from torchvision.ops import MLP + + +class BiasedMultiheadAttention(nn.Module): + """Multi-headed attention. + + See "Attention Is All You Need" for more details. + """ + + def __init__( + self, + embed_dim, + num_heads, + kdim=None, + vdim=None, + dropout=0.0, + bias_attn=False, + bias=True, + self_attention=False, + q_noise=0.0, + qn_block_size=8, + max_hops=5, + ): + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout_module = FairseqDropout( + dropout, module_name=self.__class__.__name__ + ) + + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim ** -0.5 + + self.self_attention = self_attention + + assert self.self_attention, "Only support self attention" + + assert not self.self_attention or self.qkv_same_dim, ( + "Self-attention requires query, key and " "value to be of the same size" + ) + + self.k_proj = quant_noise( + nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size + ) + self.v_proj = quant_noise( + nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size + ) + self.q_proj = quant_noise( + nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size + ) + + self.out_proj = quant_noise( + nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size + ) + self.max_hops = max_hops + self.bias_attn = bias_attn + if bias_attn: + self.markov_structural_mlp = MLP(self.max_hops + 1, + [self.max_hops + num_heads, num_heads]) + self.reset_parameters() + self.onnx_trace = False + + def prepare_for_onnx_export_(self): + raise NotImplementedError + + def reset_parameters(self): + if self.qkv_same_dim: + # Empirically observed the convergence to be much better with + # the scaled initialization + nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) + nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) + nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) + else: + nn.init.xavier_uniform_(self.k_proj.weight) + nn.init.xavier_uniform_(self.v_proj.weight) + nn.init.xavier_uniform_(self.q_proj.weight) + + nn.init.xavier_uniform_(self.out_proj.weight) + if self.out_proj.bias is not None: + nn.init.constant_(self.out_proj.bias, 0.0) + + def forward( + self, + query, + key: Optional[Tensor], + value: Optional[Tensor], + attn_bias: Optional[Tensor] = None, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + before_softmax: bool = False, + need_head_weights: bool = False, + ) -> Tuple[Tensor, Optional[Tensor]]: + """Input shape: Time x Batch x Channel + + Args: + key_padding_mask (ByteTensor, optional): mask to exclude + keys that are pads, of shape `(batch, src_len)`, where + padding elements are 
indicated by 1s. + need_weights (bool, optional): return the attention weights, + averaged over heads (default: False). + attn_mask (ByteTensor, optional): typically used to + implement causal attention, where the mask prevents the + attention from looking forward in time (default: None). + before_softmax (bool, optional): return the raw attention + weights and values before the attention softmax. + need_head_weights (bool, optional): return the attention + weights for each head. Implies *need_weights*. Default: + return the average attention weights over all heads. + """ + if need_head_weights: + need_weights = True + + tgt_len, bsz, embed_dim = query.size() + src_len = tgt_len + assert embed_dim == self.embed_dim, f"query dim {embed_dim} != {self.embed_dim}" + assert list(query.size()) == [tgt_len, bsz, embed_dim] + if key is not None: + src_len, key_bsz, _ = key.size() + if not torch.jit.is_scripting(): + assert key_bsz == bsz + assert value is not None + assert src_len, bsz == value.shape[:2] + + q = self.q_proj(query) + k = self.k_proj(query) + v = self.v_proj(query) + q *= self.scaling + + q = ( + q.contiguous() + .view(tgt_len, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + if k is not None: + k = ( + k.contiguous() + .view(-1, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + if v is not None: + v = ( + v.contiguous() + .view(-1, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + assert k is not None + assert k.size(1) == src_len + + # This is part of a workaround to get around fork/join parallelism + # not supporting Optional types. + if key_padding_mask is not None and key_padding_mask.dim() == 0: + key_padding_mask = None + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + attn_weights = torch.bmm(q, k.transpose(1, 2)) + attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) + + assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] + + if self.bias_attn and attn_bias is not None: + attn_bias_val = self.markov_structural_mlp(attn_bias.permute(1, 2, 3, 0)) + attn_bias_val = attn_bias_val.permute(0, 3, 1, 2).reshape(bsz * self.num_heads, tgt_len, src_len) + attn_weights += attn_bias_val + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(0) + attn_weights += attn_mask + + if key_padding_mask is not None: + # don't attend to padding symbols + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), + float("-inf"), + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if before_softmax: + return attn_weights, v + + attn_weights_float = utils.softmax( + attn_weights, dim=-1, onnx_trace=self.onnx_trace + ) + attn_weights = attn_weights_float.type_as(attn_weights) + attn_probs = self.dropout_module(attn_weights) + + assert v is not None + attn = torch.bmm(attn_probs, v) + assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] + + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) + attn = self.out_proj(attn) + + attn_weights: Optional[Tensor] = None + if need_weights: + attn_weights = attn_weights_float.view( + bsz, self.num_heads, tgt_len, src_len + ).transpose(1, 0) + if not need_head_weights: + # average attention weights over heads + attn_weights = attn_weights.mean(dim=0) + + return attn, attn_weights + + def 
apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int): + return attn_weights + + def rename_state_dict(self, state_dict, prefix): + # prefix = name + "." if name != "" else "" + items_to_add = {} + keys_to_remove = [] + for k in state_dict.keys(): + if k.startswith(prefix): + if k.endswith(prefix + "in_proj_weight"): + # in_proj_weight used to be q + k + v with same dimensions + dim = int(state_dict[k].shape[0] / 3) + items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim] + items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim: 2 * dim] + items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim:] + keys_to_remove.append(k) + + if k.endswith(prefix + "in_proj_bias"): + dim = int(state_dict[k].shape[0] / 3) + items_to_add[prefix + "q_proj.bias"] = state_dict[k][:dim] + items_to_add[prefix + "k_proj.bias"] = state_dict[k][dim: 2 * dim] + items_to_add[prefix + "v_proj.bias"] = state_dict[k][2 * dim:] + keys_to_remove.append(prefix + "in_proj_bias") + + for k in keys_to_remove: + del state_dict[k] + + for key, value in items_to_add.items(): + state_dict[key] = value + + def _load_from_state_dict(self, state_dict, name, *args, **kwargs): + self.rename_state_dict(state_dict, name) + super()._load_from_state_dict(state_dict, name, *args, **kwargs) diff --git a/EdgeCape/models/utils/builder.py b/EdgeCape/models/utils/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..f695813d0a6f83d25568997169870a2801f511d1 --- /dev/null +++ b/EdgeCape/models/utils/builder.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.utils import Registry, build_from_cfg + +TRANSFORMER = Registry('Transformer') +BACKBONES = Registry('BACKBONES') +LINEAR_LAYERS = Registry('linear layers') + + +def build_backbone(cfg, default_args=None): + """Build backbone.""" + return build_from_cfg(cfg, BACKBONES, default_args) + +def build_transformer(cfg, default_args=None): + """Builder for Transformer.""" + return build_from_cfg(cfg, TRANSFORMER, default_args) + + +LINEAR_LAYERS.register_module('Linear', module=nn.Linear) + + +def build_linear_layer(cfg, *args, **kwargs): + """Build linear layer. + Args: + cfg (None or dict): The linear layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate an linear layer. + args (argument list): Arguments passed to the `__init__` + method of the corresponding linear layer. + kwargs (keyword arguments): Keyword arguments passed to the `__init__` + method of the corresponding linear layer. + Returns: + nn.Module: Created linear layer. + """ + if cfg is None: + cfg_ = dict(type='Linear') + else: + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in LINEAR_LAYERS: + raise KeyError(f'Unrecognized linear type {layer_type}') + else: + linear_layer = LINEAR_LAYERS.get(layer_type) + + layer = linear_layer(*args, **kwargs, **cfg_) + + return layer diff --git a/EdgeCape/models/utils/positional_encoding.py b/EdgeCape/models/utils/positional_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..62b6e4dfb57b740e364d78665b63582a9fc66cd4 --- /dev/null +++ b/EdgeCape/models/utils/positional_encoding.py @@ -0,0 +1,219 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
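# --- Editor's note (illustrative usage, not part of the diff) ---
# builder.py above follows the usual mmcv Registry pattern: modules register
# themselves by name and are later instantiated from a config dict via
# build_from_cfg. build_linear_layer falls back to a plain nn.Linear when the
# cfg is None, forwarding positional arguments to the layer's __init__:
import torch.nn as nn
from EdgeCape.models.utils.builder import build_linear_layer

proj = build_linear_layer(None, 256, 128)
assert isinstance(proj, nn.Linear)
# Likewise, a transformer decorated with @TRANSFORMER.register_module() can be
# created with build_transformer(dict(type='<registered name>', ...)).
# --- end of editor's note ---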
+import math + +import torch +import torch.nn as nn +from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING +from mmcv.runner import BaseModule + +#TODO: add an SinePositionalEncoding for coordinates input + +@POSITIONAL_ENCODING.register_module() +class SinePositionalEncoding(BaseModule): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + num_feats, + temperature=10000, + normalize=False, + scale=2 * math.pi, + eps=1e-6, + offset=0., + init_cfg=None): + super(SinePositionalEncoding, self).__init__(init_cfg) + if normalize: + assert isinstance(scale, (float, int)), 'when normalize is set,' \ + 'scale should be provided and in float or int type, ' \ + f'found {type(scale)}' + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + self.offset = offset + + def forward(self, mask): + """Forward function for `SinePositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. 
+ mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + y_embed = not_mask.cumsum(1, dtype=torch.float32) # [bs, h, w], recording the y coordinate ot each pixel + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: # default True + y_embed = (y_embed + self.offset) / \ + (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + pos_x = x_embed[:, :, :, None] / dim_t # [bs, h, w, num_feats] + pos_y = y_embed[:, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + B, H, W = mask.size() + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) # [bs, h, w, num_feats] + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def forward_coordinates(self, coord): + """ + Forward funtion for normalized coordinates input with the shape of [bs, kpt, 2] + return: + pos (Tensor): position embedding with the shape of [bs, kpt, num_feats*2] + """ + x_embed, y_embed = coord[:,:,0], coord[:,:,1] # [bs, kpt] + x_embed = x_embed * self.scale # [bs, kpt] + y_embed = y_embed * self.scale + + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=coord.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + + pos_x = x_embed[:, :, None] / dim_t # [bs, kpt, num_feats] + pos_y = y_embed[:, :, None] / dim_t # [bs, kpt, num_feats] + bs, kpt, _ = pos_x.shape + + pos_x = torch.stack( + (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), + dim=3).view(bs, kpt, -1) # [bs, kpt, num_feats] + pos_y = torch.stack( + (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), + dim=3).view(bs, kpt, -1) # [bs, kpt, num_feats] + pos = torch.cat((pos_y, pos_x), dim=2) # [bs, kpt, num_feats * 2] + + return pos + + def forward_3d_coordinates(self, coord): + """ + Forward funtion for normalized coordinates input with the shape of [bs, 3, kpt] + return: + pos (Tensor): position embedding with the shape of [bs, kpt, num_feats*2] + """ + bs, _, H, W = coord.shape + + x_embed, y_embed, z_embed = coord[:, 0], coord[:, 1], coord[:, 2] # [bs, kpt] + x_embed = x_embed.flatten(1) * self.scale # [bs, kpt] + y_embed = y_embed.flatten(1) * self.scale + z_embed = z_embed.flatten(1) * self.scale + + dim_t = torch.arange(self.num_feats, dtype=torch.float32, device=coord.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats) + + pos_x = x_embed[:, :, None] / dim_t # [bs, HW, num_feats] + pos_y = y_embed[:, :, None] / dim_t # [bs, HW, num_feats] + pos_z = z_embed[:, :, None] / dim_t # [bs, HW, num_feats] + + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()),dim=3).view(bs, H, W, -1) # [bs, H, W, num_feats] + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()),dim=3).view(bs, H, W, -1) # [bs, H, W, num_feats] + pos_z = torch.stack((pos_z[:, :, 0::2].sin(), pos_z[:, :, 1::2].cos()),dim=3).view(bs, H, W, -1) # [bs, H, W, num_feats] + pos = torch.cat((pos_y, pos_x, pos_z), dim=3).permute(0, 3, 1, 2) # [bs, H, W, num_feats * 3] + + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + 
repr_str += f'temperature={self.temperature}, ' + repr_str += f'normalize={self.normalize}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'eps={self.eps})' + return repr_str + + +@POSITIONAL_ENCODING.register_module() +class LearnedPositionalEncoding(BaseModule): + """Position embedding with learnable embedding weights. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. The final returned dimension for + each position is 2 times of this value. + row_num_embed (int, optional): The dictionary size of row embeddings. + Default 50. + col_num_embed (int, optional): The dictionary size of col embeddings. + Default 50. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + num_feats, + row_num_embed=50, + col_num_embed=50, + init_cfg=dict(type='Uniform', layer='Embedding')): + super(LearnedPositionalEncoding, self).__init__(init_cfg) + self.row_embed = nn.Embedding(row_num_embed, num_feats) + self.col_embed = nn.Embedding(col_num_embed, num_feats) + self.num_feats = num_feats + self.row_num_embed = row_num_embed + self.col_num_embed = col_num_embed + + def forward(self, mask): + """Forward function for `LearnedPositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + h, w = mask.shape[-2:] + x = torch.arange(w, device=mask.device) + y = torch.arange(h, device=mask.device) + x_embed = self.col_embed(x) + y_embed = self.row_embed(y) + pos = torch.cat( + (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat( + 1, w, 1)), + dim=-1).permute(2, 0, + 1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'row_num_embed={self.row_num_embed}, ' + repr_str += f'col_num_embed={self.col_num_embed})' + return repr_str diff --git a/EdgeCape/models/utils/post_processing/__init__.py b/EdgeCape/models/utils/post_processing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..47980293fc1bd49ac1f74fbbfa6e54f25d62fcfa --- /dev/null +++ b/EdgeCape/models/utils/post_processing/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
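# --- Editor's sketch (illustrative usage, not part of the diff) ---
# forward_coordinates above is what the decoder uses to re-embed the current
# keypoint proposals after each layer: normalized (x, y) coordinates go in,
# a [bs, K, 2 * num_feats] sine/cosine embedding comes out. Assuming the same
# config the head uses (num_feats=128, normalize=True):
import torch
from EdgeCape.models.utils.positional_encoding import SinePositionalEncoding

pos_enc = SinePositionalEncoding(num_feats=128, normalize=True)
coords = torch.rand(2, 17, 2)             # [bs, K, 2], normalized proposals
emb = pos_enc.forward_coordinates(coords)
assert emb.shape == (2, 17, 256)          # matches the transformer d_model
# --- end of editor's sketch ---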
+ +from .nms import nearby_joints_nms, oks_iou, oks_nms, soft_oks_nms +from .one_euro_filter import OneEuroFilter +from .post_transforms import (affine_transform, flip_back, fliplr_joints, + fliplr_regression, get_affine_transform, + get_warp_matrix, rotate_point, transform_preds, + warp_affine_joints) +from .smoother import Smoother + +__all__ = [ + 'oks_nms', 'soft_oks_nms', 'nearby_joints_nms', 'affine_transform', + 'rotate_point', 'flip_back', 'fliplr_joints', 'fliplr_regression', + 'transform_preds', 'get_affine_transform', 'get_warp_matrix', + 'warp_affine_joints', 'oks_iou', 'OneEuroFilter', 'Smoother' +] \ No newline at end of file diff --git a/EdgeCape/models/utils/post_processing/group.py b/EdgeCape/models/utils/post_processing/group.py new file mode 100644 index 0000000000000000000000000000000000000000..24077c7a6e8da0ad8e503680f79f6301077d1a95 --- /dev/null +++ b/EdgeCape/models/utils/post_processing/group.py @@ -0,0 +1,557 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/princeton-vl/pose-ae-train/ +# Original licence: Copyright (c) 2017, umich-vl, under BSD 3-Clause License. +# ------------------------------------------------------------------------------ + +import numpy as np +import torch +from munkres import Munkres + +from mmpose.core.evaluation import post_dark_udp + + +def _py_max_match(scores): + """Apply munkres algorithm to get the best match. + + Args: + scores(np.ndarray): cost matrix. + + Returns: + np.ndarray: best match. + """ + m = Munkres() + tmp = m.compute(scores) + tmp = np.array(tmp).astype(int) + return tmp + + +def _match_by_tag(inp, params): + """Match joints by tags. Use Munkres algorithm to calculate the best match + for keypoints grouping. + + Note: + number of keypoints: K + max number of people in an image: M (M=30 by default) + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + inp(tuple): + tag_k (np.ndarray[KxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[KxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[KxM]): top k value of the + feature maps per keypoint. + params(Params): class Params(). + + Returns: + np.ndarray: result of pose groups. 
+ """ + assert isinstance(params, _Params), 'params should be class _Params()' + + tag_k, loc_k, val_k = inp + + default_ = np.zeros((params.num_joints, 3 + tag_k.shape[2]), + dtype=np.float32) + + joint_dict = {} + tag_dict = {} + for i in range(params.num_joints): + idx = params.joint_order[i] + + tags = tag_k[idx] + joints = np.concatenate((loc_k[idx], val_k[idx, :, None], tags), 1) + mask = joints[:, 2] > params.detection_threshold + tags = tags[mask] # shape: [M, L] + joints = joints[mask] # shape: [M, 3 + L], 3: x, y, val + + if joints.shape[0] == 0: + continue + + if i == 0 or len(joint_dict) == 0: + for tag, joint in zip(tags, joints): + key = tag[0] + joint_dict.setdefault(key, np.copy(default_))[idx] = joint + tag_dict[key] = [tag] + else: + # shape: [M] + grouped_keys = list(joint_dict.keys()) + if params.ignore_too_much: + grouped_keys = grouped_keys[:params.max_num_people] + # shape: [M, L] + grouped_tags = [np.mean(tag_dict[i], axis=0) for i in grouped_keys] + + # shape: [M, M, L] + diff = joints[:, None, 3:] - np.array(grouped_tags)[None, :, :] + # shape: [M, M] + diff_normed = np.linalg.norm(diff, ord=2, axis=2) + diff_saved = np.copy(diff_normed) + + if params.use_detection_val: + diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3] + + num_added = diff.shape[0] + num_grouped = diff.shape[1] + + if num_added > num_grouped: + diff_normed = np.concatenate( + (diff_normed, + np.zeros((num_added, num_added - num_grouped), + dtype=np.float32) + 1e10), + axis=1) + + pairs = _py_max_match(diff_normed) + for row, col in pairs: + if (row < num_added and col < num_grouped + and diff_saved[row][col] < params.tag_threshold): + key = grouped_keys[col] + joint_dict[key][idx] = joints[row] + tag_dict[key].append(tags[row]) + else: + key = tags[row][0] + joint_dict.setdefault(key, np.copy(default_))[idx] = \ + joints[row] + tag_dict[key] = [tags[row]] + + joint_dict_keys = list(joint_dict.keys()) + if params.ignore_too_much: + # The new person joints beyond the params.max_num_people will be + # ignored, for the dict is in ordered when python > 3.6 version. + joint_dict_keys = joint_dict_keys[:params.max_num_people] + results = np.array([joint_dict[i] + for i in joint_dict_keys]).astype(np.float32) + return results + + +class _Params: + """A class of parameter. + + Args: + cfg(Config): config. + """ + + def __init__(self, cfg): + self.num_joints = cfg['num_joints'] + self.max_num_people = cfg['max_num_people'] + + self.detection_threshold = cfg['detection_threshold'] + self.tag_threshold = cfg['tag_threshold'] + self.use_detection_val = cfg['use_detection_val'] + self.ignore_too_much = cfg['ignore_too_much'] + + if self.num_joints == 17: + self.joint_order = [ + i - 1 for i in + [1, 2, 3, 4, 5, 6, 7, 12, 13, 8, 9, 10, 11, 14, 15, 16, 17] + ] + else: + self.joint_order = list(np.arange(self.num_joints)) + + +class HeatmapParser: + """The heatmap parser for post processing.""" + + def __init__(self, cfg): + self.params = _Params(cfg) + self.tag_per_joint = cfg['tag_per_joint'] + self.pool = torch.nn.MaxPool2d(cfg['nms_kernel'], 1, + cfg['nms_padding']) + self.use_udp = cfg.get('use_udp', False) + self.score_per_joint = cfg.get('score_per_joint', False) + + def nms(self, heatmaps): + """Non-Maximum Suppression for heatmaps. + + Args: + heatmap(torch.Tensor): Heatmaps before nms. + + Returns: + torch.Tensor: Heatmaps after nms. 
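+
+        Only local maxima are kept: responses that do not equal the
+        max-pooled value within the ``nms_kernel`` window are zeroed out.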
+ """ + + maxm = self.pool(heatmaps) + maxm = torch.eq(maxm, heatmaps).float() + heatmaps = heatmaps * maxm + + return heatmaps + + def match(self, tag_k, loc_k, val_k): + """Group keypoints to human poses in a batch. + + Args: + tag_k (np.ndarray[NxKxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[NxKxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[NxKxM]): top k value of the + feature maps per keypoint. + + Returns: + list + """ + + def _match(x): + return _match_by_tag(x, self.params) + + return list(map(_match, zip(tag_k, loc_k, val_k))) + + def top_k(self, heatmaps, tags): + """Find top_k values in an image. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + max number of people: M + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]) + tags (torch.Tensor[NxKxHxWxL]) + + Returns: + dict: A dict containing top_k values. + + - tag_k (np.ndarray[NxKxMxL]): + tag corresponding to the top k values of + feature map per keypoint. + - loc_k (np.ndarray[NxKxMx2]): + top k location of feature map per keypoint. + - val_k (np.ndarray[NxKxM]): + top k value of feature map per keypoint. + """ + heatmaps = self.nms(heatmaps) + N, K, H, W = heatmaps.size() + heatmaps = heatmaps.view(N, K, -1) + val_k, ind = heatmaps.topk(self.params.max_num_people, dim=2) + + tags = tags.view(tags.size(0), tags.size(1), W * H, -1) + if not self.tag_per_joint: + tags = tags.expand(-1, self.params.num_joints, -1, -1) + + tag_k = torch.stack( + [torch.gather(tags[..., i], 2, ind) for i in range(tags.size(3))], + dim=3) + + x = ind % W + y = ind // W + + ind_k = torch.stack((x, y), dim=3) + + results = { + 'tag_k': tag_k.cpu().numpy(), + 'loc_k': ind_k.cpu().numpy(), + 'val_k': val_k.cpu().numpy() + } + + return results + + @staticmethod + def adjust(results, heatmaps): + """Adjust the coordinates for better accuracy. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + + Args: + results (list(np.ndarray)): Keypoint predictions. + heatmaps (torch.Tensor[NxKxHxW]): Heatmaps. + """ + _, _, H, W = heatmaps.shape + for batch_id, people in enumerate(results): + for people_id, people_i in enumerate(people): + for joint_id, joint in enumerate(people_i): + if joint[2] > 0: + x, y = joint[0:2] + xx, yy = int(x), int(y) + tmp = heatmaps[batch_id][joint_id] + if tmp[min(H - 1, yy + 1), xx] > tmp[max(0, yy - 1), + xx]: + y += 0.25 + else: + y -= 0.25 + + if tmp[yy, min(W - 1, xx + 1)] > tmp[yy, + max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + results[batch_id][people_id, joint_id, + 0:2] = (x + 0.5, y + 0.5) + return results + + @staticmethod + def refine(heatmap, tag, keypoints, use_udp=False): + """Given initial keypoint predictions, we identify missing joints. + + Note: + number of keypoints: K + heatmap height: H + heatmap width: W + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmap: np.ndarray(K, H, W). + tag: np.ndarray(K, H, W) | np.ndarray(K, H, W, L) + keypoints: np.ndarray of size (K, 3 + L) + last dim is (x, y, score, tag). + use_udp: bool-unbiased data processing + + Returns: + np.ndarray: The refined keypoints. 
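+
+        A joint that was not detected is recovered by penalising its heatmap
+        with the distance between its tag map and the mean tag of the joints
+        already detected for this person, then taking the penalised maximum.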
+ """ + + K, H, W = heatmap.shape + if len(tag.shape) == 3: + tag = tag[..., None] + + tags = [] + for i in range(K): + if keypoints[i, 2] > 0: + # save tag value of detected keypoint + x, y = keypoints[i][:2].astype(int) + x = np.clip(x, 0, W - 1) + y = np.clip(y, 0, H - 1) + tags.append(tag[i, y, x]) + + # mean tag of current detected people + prev_tag = np.mean(tags, axis=0) + results = [] + + for _heatmap, _tag in zip(heatmap, tag): + # distance of all tag values with mean tag of + # current detected people + distance_tag = (((_tag - + prev_tag[None, None, :])**2).sum(axis=2)**0.5) + norm_heatmap = _heatmap - np.round(distance_tag) + + # find maximum position + y, x = np.unravel_index(np.argmax(norm_heatmap), _heatmap.shape) + xx = x.copy() + yy = y.copy() + # detection score at maximum position + val = _heatmap[y, x] + if not use_udp: + # offset by 0.5 + x += 0.5 + y += 0.5 + + # add a quarter offset + if _heatmap[yy, min(W - 1, xx + 1)] > _heatmap[yy, max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + + if _heatmap[min(H - 1, yy + 1), xx] > _heatmap[max(0, yy - 1), xx]: + y += 0.25 + else: + y -= 0.25 + + results.append((x, y, val)) + results = np.array(results) + + if results is not None: + for i in range(K): + # add keypoint if it is not detected + if results[i, 2] > 0 and keypoints[i, 2] == 0: + keypoints[i, :3] = results[i, :3] + + return keypoints + + def parse(self, heatmaps, tags, adjust=True, refine=True): + """Group keypoints into poses given heatmap and tag. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]): model output heatmaps. + tags (torch.Tensor[NxKxHxWxL]): model output tagmaps. + + Returns: + tuple: A tuple containing keypoint grouping results. + + - results (list(np.ndarray)): Pose results. + - scores (list/list(np.ndarray)): Score of people. + """ + results = self.match(**self.top_k(heatmaps, tags)) + + if adjust: + if self.use_udp: + for i in range(len(results)): + if results[i].shape[0] > 0: + results[i][..., :2] = post_dark_udp( + results[i][..., :2].copy(), heatmaps[i:i + 1, :]) + else: + results = self.adjust(results, heatmaps) + + if self.score_per_joint: + scores = [i[:, 2] for i in results[0]] + else: + scores = [i[:, 2].mean() for i in results[0]] + + if refine: + results = results[0] + # for every detected person + for i in range(len(results)): + heatmap_numpy = heatmaps[0].cpu().numpy() + tag_numpy = tags[0].cpu().numpy() + if not self.tag_per_joint: + tag_numpy = np.tile(tag_numpy, + (self.params.num_joints, 1, 1, 1)) + results[i] = self.refine( + heatmap_numpy, tag_numpy, results[i], use_udp=self.use_udp) + results = [results] + + return results, scores + + +class HeatmapOffsetParser: + """The heatmap&offset parser for post processing.""" + + def __init__(self, cfg): + super(HeatmapOffsetParser, self).__init__() + + self.num_joints = cfg['num_joints'] + self.keypoint_threshold = cfg['keypoint_threshold'] + self.max_num_people = cfg['max_num_people'] + + # init pooling layer + kernel_size = cfg.get('max_pool_kernel', 5) + self.pool = torch.nn.MaxPool2d(kernel_size, 1, kernel_size // 2) + + def _offset_to_pose(self, offsets): + """Convert offset maps to pose maps. + + Note: + batch size: N + number of keypoints: K + offset maps height: H + offset maps width: W + + Args: + offsets (torch.Tensor[NxKxHxW]): model output offset maps. + + Returns: + torch.Tensor[NxKxHxW]: A tensor containing pose for each pixel. 
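+
+        Each spatial location stores the (x, y) coordinates it votes for,
+        obtained by subtracting the predicted offsets from a regular
+        coordinate grid of the same resolution.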
+ """ + h, w = offsets.shape[-2:] + offsets = offsets.view(self.num_joints, -1, h, w) + + # generate regular coordinates + x = torch.arange(0, offsets.shape[-1]).float() + y = torch.arange(0, offsets.shape[-2]).float() + y, x = torch.meshgrid(y, x) + regular_coords = torch.stack((x, y), dim=0).unsqueeze(0) + + posemaps = regular_coords.to(offsets) - offsets + posemaps = posemaps.view(1, -1, h, w) + return posemaps + + def _get_maximum_from_heatmap(self, heatmap): + """Find local maximum of heatmap to localize instances. + + Note: + batch size: N + heatmap height: H + heatmap width: W + + Args: + heatmap (torch.Tensor[Nx1xHxW]): model output center heatmap. + + Returns: + tuple: A tuple containing instances detection results. + + - pos_idx (torch.Tensor): Index of pixels which have detected + instances. + - score (torch.Tensor): Score of detected instances. + """ + assert heatmap.size(0) == 1 and heatmap.size(1) == 1 + max_map = torch.eq(heatmap, self.pool(heatmap)).float() + heatmap = heatmap * max_map + score = heatmap.view(-1) + + score, pos_idx = score.topk(self.max_num_people) + mask = score > self.keypoint_threshold + score = score[mask] + pos_idx = pos_idx[mask] + return pos_idx, score + + def decode(self, heatmaps, offsets): + """Convert center heatmaps and offset maps to poses. + + Note: + batch size: N + number of keypoints: K + offset maps height: H + offset maps width: W + + Args: + heatmaps (torch.Tensor[Nx(1+K)xHxW]): model output heatmaps. + offsets (torch.Tensor[NxKxHxW]): model output offset maps. + + Returns: + torch.Tensor[NxKx4]: A tensor containing predicted pose and + score for each instance. + """ + + posemap = self._offset_to_pose(offsets) + inst_indexes, inst_scores = self._get_maximum_from_heatmap( + heatmaps[:, :1]) + + poses = posemap.view(posemap.size(1), -1)[..., inst_indexes] + poses = poses.view(self.num_joints, 2, -1).permute(2, 0, + 1).contiguous() + inst_scores = inst_scores.unsqueeze(1).unsqueeze(2).expand( + poses.size()) + poses = torch.cat((poses, inst_scores), dim=2) + return poses.clone() + + def refine_score(self, heatmaps, poses): + """Refine instance scores with keypoint heatmaps. + + Note: + batch size: N + number of keypoints: K + offset maps height: H + offset maps width: W + + Args: + heatmaps (torch.Tensor[Nx(1+K)xHxW]): model output heatmaps. + poses (torch.Tensor[NxKx4]): decoded pose and score for each + instance. + + Returns: + torch.Tensor[NxKx4]: poses with refined scores. 
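+
+        Pose coordinates are normalised to [-1, 1] and used to sample the
+        per-keypoint heatmaps with ``grid_sample``; the sampled confidences
+        are written to the last channel and multiplied into the centre-based
+        instance scores.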
+ """ + normed_poses = poses.unsqueeze(0).permute(2, 0, 1, 3).contiguous() + normed_poses = torch.cat(( + normed_poses.narrow(3, 0, 1) / (heatmaps.size(3) - 1) * 2 - 1, + normed_poses.narrow(3, 1, 1) / (heatmaps.size(2) - 1) * 2 - 1, + ), + dim=3) + kpt_scores = torch.nn.functional.grid_sample( + heatmaps[:, 1:].view(self.num_joints, 1, heatmaps.size(2), + heatmaps.size(3)), + normed_poses, + padding_mode='border').view(self.num_joints, -1) + kpt_scores = kpt_scores.transpose(0, 1).contiguous() + + # scores only from keypoint heatmaps + poses[..., 3] = kpt_scores + # combine center and keypoint heatmaps + poses[..., 2] = poses[..., 2] * kpt_scores + + return poses \ No newline at end of file diff --git a/EdgeCape/models/utils/post_processing/nms.py b/EdgeCape/models/utils/post_processing/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..f4db52f95ed7c287ff7663327bb14a9a1955f87d --- /dev/null +++ b/EdgeCape/models/utils/post_processing/nms.py @@ -0,0 +1,279 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import numpy as np + + +def nms(dets, thr): + """Greedily select boxes with high confidence and overlap <= thr. + + Args: + dets: [[x1, y1, x2, y2, score]]. + thr: Retain overlap < thr. + + Returns: + list: Indexes to keep. + """ + if len(dets) == 0: + return [] + + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thr)[0] + order = order[inds + 1] + + return keep + + +def oks_iou(g, d, a_g, a_d, sigmas=None, vis_thr=None): + """Calculate oks ious. + + Args: + g: Ground truth keypoints. + d: Detected keypoints. + a_g: Area of the ground truth object. + a_d: Area of the detected object. + sigmas: standard deviation of keypoint labelling. + vis_thr: threshold of the keypoint visibility. + + Returns: + list: The oks ious. + """ + if sigmas is None: + sigmas = np.array([ + .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, + .87, .87, .89, .89 + ]) / 10.0 + vars = (sigmas * 2)**2 + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + ious = np.zeros(len(d), dtype=np.float32) + for n_d in range(0, len(d)): + xd = d[n_d, 0::3] + yd = d[n_d, 1::3] + vd = d[n_d, 2::3] + dx = xd - xg + dy = yd - yg + e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 + if vis_thr is not None: + ind = list(vg > vis_thr) and list(vd > vis_thr) + e = e[ind] + ious[n_d] = np.sum(np.exp(-e)) / len(e) if len(e) != 0 else 0.0 + return ious + + +def oks_nms(kpts_db, thr, sigmas=None, vis_thr=None, score_per_joint=False): + """OKS NMS implementations. + + Args: + kpts_db: keypoints. + thr: Retain overlap < thr. + sigmas: standard deviation of keypoint labelling. + vis_thr: threshold of the keypoint visibility. 
+ score_per_joint: the input scores (in kpts_db) are per joint scores + + Returns: + np.ndarray: indexes to keep. + """ + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) + areas = np.array([k['area'] for k in kpts_db]) + + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, vis_thr) + + inds = np.where(oks_ovr <= thr)[0] + order = order[inds + 1] + + keep = np.array(keep) + + return keep + + +def _rescore(overlap, scores, thr, type='gaussian'): + """Rescoring mechanism gaussian or linear. + + Args: + overlap: calculated ious + scores: target scores. + thr: retain oks overlap < thr. + type: 'gaussian' or 'linear' + + Returns: + np.ndarray: indexes to keep + """ + assert len(overlap) == len(scores) + assert type in ['gaussian', 'linear'] + + if type == 'linear': + inds = np.where(overlap >= thr)[0] + scores[inds] = scores[inds] * (1 - overlap[inds]) + else: + scores = scores * np.exp(-overlap**2 / thr) + + return scores + + +def soft_oks_nms(kpts_db, + thr, + max_dets=20, + sigmas=None, + vis_thr=None, + score_per_joint=False): + """Soft OKS NMS implementations. + + Args: + kpts_db: keypoints and scores. + thr: retain oks overlap < thr. + max_dets: max number of detections to keep. + sigmas: Keypoint labelling uncertainty. + score_per_joint: the input scores (in kpts_db) are per joint scores + + Returns: + np.ndarray: indexes to keep. + """ + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) + areas = np.array([k['area'] for k in kpts_db]) + + order = scores.argsort()[::-1] + scores = scores[order] + + keep = np.zeros(max_dets, dtype=np.intp) + keep_cnt = 0 + while len(order) > 0 and keep_cnt < max_dets: + i = order[0] + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, vis_thr) + + order = order[1:] + scores = _rescore(oks_ovr, scores[1:], thr) + + tmp = scores.argsort()[::-1] + order = order[tmp] + scores = scores[tmp] + + keep[keep_cnt] = i + keep_cnt += 1 + + keep = keep[:keep_cnt] + + return keep + + +def nearby_joints_nms( + kpts_db, + dist_thr, + num_nearby_joints_thr=None, + score_per_joint=False, + max_dets=-1, +): + """Nearby joints NMS implementations. + + Args: + kpts_db (list[dict]): keypoints and scores. + dist_thr (float): threshold for judging whether two joints are close. + num_nearby_joints_thr (int): threshold for judging whether two + instances are close. + max_dets (int): max number of detections to keep. + score_per_joint (bool): the input scores (in kpts_db) are per joint + scores. + + Returns: + np.ndarray: indexes to keep. + """ + + assert dist_thr > 0, '`dist_thr` must be greater than 0.' 
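+
+    # Overview of the steps below: (1) derive a per-instance distance
+    # threshold from each pose's spatial extent, (2) count how many joints of
+    # every other instance fall within that threshold, and (3) among mutually
+    # close instances keep only the highest-scoring one.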
+ if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'] for k in kpts_db]) + + num_people, num_joints, _ = kpts.shape + if num_nearby_joints_thr is None: + num_nearby_joints_thr = num_joints // 2 + assert num_nearby_joints_thr < num_joints, '`num_nearby_joints_thr` must '\ + 'be less than the number of joints.' + + # compute distance threshold + pose_area = kpts.max(axis=1) - kpts.min(axis=1) + pose_area = np.sqrt(np.power(pose_area, 2).sum(axis=1)) + pose_area = pose_area.reshape(num_people, 1, 1) + pose_area = np.tile(pose_area, (num_people, num_joints)) + close_dist_thr = pose_area * dist_thr + + # count nearby joints between instances + instance_dist = kpts[:, None] - kpts + instance_dist = np.sqrt(np.power(instance_dist, 2).sum(axis=3)) + close_instance_num = (instance_dist < close_dist_thr).sum(2) + close_instance = close_instance_num > num_nearby_joints_thr + + # apply nms + ignored_pose_inds, keep_pose_inds = set(), list() + indexes = np.argsort(scores)[::-1] + for i in indexes: + if i in ignored_pose_inds: + continue + keep_inds = close_instance[i].nonzero()[0] + keep_ind = keep_inds[np.argmax(scores[keep_inds])] + if keep_ind not in ignored_pose_inds: + keep_pose_inds.append(keep_ind) + ignored_pose_inds = ignored_pose_inds.union(set(keep_inds)) + + # limit the number of output instances + if max_dets > 0 and len(keep_pose_inds) > max_dets: + sub_inds = np.argsort(scores[keep_pose_inds])[-1:-max_dets - 1:-1] + keep_pose_inds = [keep_pose_inds[i] for i in sub_inds] + + return keep_pose_inds \ No newline at end of file diff --git a/EdgeCape/models/utils/post_processing/one_euro_filter.py b/EdgeCape/models/utils/post_processing/one_euro_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..4a1a834ab7520b2c6f051ccf5710a9c4fa15dc05 --- /dev/null +++ b/EdgeCape/models/utils/post_processing/one_euro_filter.py @@ -0,0 +1,113 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HoBeom/OneEuroFilter-Numpy +# Original licence: Copyright (c) HoBeom Jeon, under the MIT License. +# ------------------------------------------------------------------------------ +import warnings +from time import time + +import numpy as np + + +def smoothing_factor(t_e, cutoff): + r = 2 * np.pi * cutoff * t_e + return r / (r + 1) + + +def exponential_smoothing(a, x, x_prev): + return a * x + (1 - a) * x_prev + + +class OneEuroFilter: + + def __init__(self, + x0, + dx0=0.0, + min_cutoff=1.7, + beta=0.3, + d_cutoff=30.0, + fps=None): + """One Euro Filter for keypoints smoothing. + + Args: + x0 (np.ndarray[K, 2]): Initialize keypoints value + dx0 (float): 0.0 + min_cutoff (float): parameter for one euro filter + beta (float): parameter for one euro filter + d_cutoff (float): Input data FPS + fps (float): Video FPS for video inference + """ + warnings.warn( + 'OneEuroFilter from ' + '`mmpose/core/post_processing/one_euro_filter.py` will ' + 'be deprecated in the future. Please use Smoother' + '(`mmpose/core/post_processing/smoother.py`) with ' + 'OneEuroFilter (`mmpose/core/post_processing/temporal_' + 'filters/one_euro_filter.py`).', DeprecationWarning) + + # The parameters. + self.data_shape = x0.shape + self.min_cutoff = np.full(x0.shape, min_cutoff) + self.beta = np.full(x0.shape, beta) + self.d_cutoff = np.full(x0.shape, d_cutoff) + # Previous values. 
+ self.x_prev = x0.astype(np.float32) + self.dx_prev = np.full(x0.shape, dx0) + self.mask_prev = np.ma.masked_where(x0 <= 0, x0) + self.realtime = True + if fps is None: + # Using in realtime inference + self.t_e = None + self.skip_frame_factor = d_cutoff + self.fps = d_cutoff + else: + # fps using video inference + self.realtime = False + self.fps = float(fps) + self.d_cutoff = np.full(x0.shape, self.fps) + + self.t_prev = time() + + def __call__(self, x, t_e=1.0): + """Compute the filtered signal. + + Hyper-parameters (cutoff, beta) are from `VNect + `__ . + + Realtime Camera fps (d_cutoff) default 30.0 + + Args: + x (np.ndarray[K, 2]): keypoints results in frame + t_e (Optional): video skip frame count for posetrack + evaluation + """ + assert x.shape == self.data_shape + + t = 0 + if self.realtime: + t = time() + t_e = (t - self.t_prev) * self.skip_frame_factor + t_e = np.full(x.shape, t_e) + + # missing keypoints mask + mask = np.ma.masked_where(x <= 0, x) + + # The filtered derivative of the signal. + a_d = smoothing_factor(t_e / self.fps, self.d_cutoff) + dx = (x - self.x_prev) / t_e + dx_hat = exponential_smoothing(a_d, dx, self.dx_prev) + + # The filtered signal. + cutoff = self.min_cutoff + self.beta * np.abs(dx_hat) + a = smoothing_factor(t_e / self.fps, cutoff) + x_hat = exponential_smoothing(a, x, self.x_prev) + + # missing keypoints remove + np.copyto(x_hat, -10, where=mask.mask) + + # Memorize the previous values. + self.x_prev = x_hat + self.dx_prev = dx_hat + self.t_prev = t + self.mask_prev = mask + + return x_hat \ No newline at end of file diff --git a/EdgeCape/models/utils/post_processing/post_transforms.py b/EdgeCape/models/utils/post_processing/post_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..ca0b22aac2515bb332742642715b9d4de9e45629 --- /dev/null +++ b/EdgeCape/models/utils/post_processing/post_transforms.py @@ -0,0 +1,366 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import math + +import cv2 +import numpy as np +import torch + + +def fliplr_joints(joints_3d, joints_3d_visible, img_width, flip_pairs): + """Flip human joints horizontally. + + Note: + - num_keypoints: K + + Args: + joints_3d (np.ndarray([K, 3])): Coordinates of keypoints. + joints_3d_visible (np.ndarray([K, 1])): Visibility of keypoints. + img_width (int): Image width. + flip_pairs (list[tuple]): Pairs of keypoints which are mirrored + (for example, left ear and right ear). + + Returns: + tuple: Flipped human joints. + + - joints_3d_flipped (np.ndarray([K, 3])): Flipped joints. + - joints_3d_visible_flipped (np.ndarray([K, 1])): Joint visibility. 
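+
+    Example (illustrative values; the flip pair below is hypothetical):
+        >>> import numpy as np
+        >>> joints = np.array([[10., 20., 0.], [30., 40., 0.]])
+        >>> visible = np.ones((2, 1))
+        >>> flipped, flipped_vis = fliplr_joints(
+        >>>     joints, visible, img_width=100, flip_pairs=[(0, 1)])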
+ """ + + assert len(joints_3d) == len(joints_3d_visible) + assert img_width > 0 + + joints_3d_flipped = joints_3d.copy() + joints_3d_visible_flipped = joints_3d_visible.copy() + + # Swap left-right parts + for left, right in flip_pairs: + joints_3d_flipped[left, :] = joints_3d[right, :] + joints_3d_flipped[right, :] = joints_3d[left, :] + + joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :] + joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :] + + # Flip horizontally + joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0] + joints_3d_flipped = joints_3d_flipped * (joints_3d_visible_flipped > 0) + + return joints_3d_flipped, joints_3d_visible_flipped + + +def fliplr_regression(regression, + flip_pairs, + center_mode='static', + center_x=0.5, + center_index=0): + """Flip human joints horizontally. + + Note: + - batch_size: N + - num_keypoint: K + + Args: + regression (np.ndarray([..., K, C])): Coordinates of keypoints, where K + is the joint number and C is the dimension. Example shapes are: + + - [N, K, C]: a batch of keypoints where N is the batch size. + - [N, T, K, C]: a batch of pose sequences, where T is the frame + number. + flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + center_mode (str): The mode to set the center location on the x-axis + to flip around. Options are: + + - static: use a static x value (see center_x also) + - root: use a root joint (see center_index also) + center_x (float): Set the x-axis location of the flip center. Only used + when center_mode=static. + center_index (int): Set the index of the root joint, whose x location + will be used as the flip center. Only used when center_mode=root. + + Returns: + np.ndarray([..., K, C]): Flipped joints. + """ + assert regression.ndim >= 2, f'Invalid pose shape {regression.shape}' + + allowed_center_mode = {'static', 'root'} + assert center_mode in allowed_center_mode, 'Get invalid center_mode ' \ + f'{center_mode}, allowed choices are {allowed_center_mode}' + + if center_mode == 'static': + x_c = center_x + elif center_mode == 'root': + assert regression.shape[-2] > center_index + x_c = regression[..., center_index:center_index + 1, 0] + + regression_flipped = regression.copy() + # Swap left-right parts + for left, right in flip_pairs: + regression_flipped[..., left, :] = regression[..., right, :] + regression_flipped[..., right, :] = regression[..., left, :] + + # Flip horizontally + regression_flipped[..., 0] = x_c * 2 - regression_flipped[..., 0] + return regression_flipped + + +def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'): + """Flip the flipped heatmaps back to the original form. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained + from the flipped images. + flip_pairs (list[tuple()): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + target_type (str): GaussianHeatmap or CombinedTarget + + Returns: + np.ndarray: heatmaps that flipped back to the original image + """ + assert output_flipped.ndim == 4, \ + 'output_flipped should be [batch_size, num_keypoints, height, width]' + shape_ori = output_flipped.shape + channels = 1 + if target_type.lower() == 'CombinedTarget'.lower(): + channels = 3 + output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] 
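+    # Group the channels per keypoint (1 for GaussianHeatmap, 3 for
+    # CombinedTarget) so that left/right keypoints can be swapped along the
+    # keypoint axis.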
+ output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, + shape_ori[2], shape_ori[3]) + output_flipped_back = output_flipped.copy() + + # Swap left-right parts + for left, right in flip_pairs: + output_flipped_back[:, left, ...] = output_flipped[:, right, ...] + output_flipped_back[:, right, ...] = output_flipped[:, left, ...] + output_flipped_back = output_flipped_back.reshape(shape_ori) + # Flip horizontally + output_flipped_back = output_flipped_back[..., ::-1] + return output_flipped_back + + +def transform_preds(coords, center, scale, output_size, use_udp=False): + """Get final keypoint predictions from heatmaps and apply scaling and + translation to map them back to the image. + + Note: + num_keypoints: K + + Args: + coords (np.ndarray[K, ndims]): + + * If ndims=2, corrds are predicted keypoint location. + * If ndims=4, corrds are composed of (x, y, scores, tags) + * If ndims=5, corrds are composed of (x, y, scores, tags, + flipped_tags) + + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + use_udp (bool): Use unbiased data processing + + Returns: + np.ndarray: Predicted coordinates in the images. + """ + assert coords.shape[1] in (2, 4, 5) + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + + # Recover the scale which is normalized by a factor of 200. + scale = scale * 200.0 + + if use_udp: + scale_x = scale[0] / (output_size[0] - 1.0) + scale_y = scale[1] / (output_size[1] - 1.0) + else: + scale_x = scale[0] / output_size[0] + scale_y = scale[1] / output_size[1] + + target_coords = coords.copy() + target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 + target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 + + return target_coords + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. + + Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + + # pixel_std is 200. 
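+    # `scale` is stored normalised by this pixel standard, so multiply it back
+    # to obtain the box size in pixels before constructing the transform.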
+ scale_tmp = scale * 200.0 + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, trans_mat): + """Apply an affine transformation to the points. + + Args: + pt (np.ndarray): a 2 dimensional point to be transformed + trans_mat (np.ndarray): 2x3 matrix of an affine transform + + Returns: + np.ndarray: Transformed points. + """ + assert len(pt) == 2 + new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.]) + + return new_pt + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + + Returns: + np.ndarray: The 3rd point. + """ + assert len(a) == 2 + assert len(b) == 2 + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + + Returns: + list[float]: Rotated point. + """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt + + +def get_warp_matrix(theta, size_input, size_dst, size_target): + """Calculate the transformation matrix under the constraint of unbiased. + Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased + Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + theta (float): Rotation angle in degrees. + size_input (np.ndarray): Size of input image [w, h]. + size_dst (np.ndarray): Size of output image [w, h]. + size_target (np.ndarray): Size of ROI in input plane [w, h]. + + Returns: + np.ndarray: A matrix for transformation. + """ + theta = np.deg2rad(theta) + matrix = np.zeros((2, 3), dtype=np.float32) + scale_x = size_dst[0] / size_target[0] + scale_y = size_dst[1] / size_target[1] + matrix[0, 0] = math.cos(theta) * scale_x + matrix[0, 1] = -math.sin(theta) * scale_x + matrix[0, 2] = scale_x * (-0.5 * size_input[0] * math.cos(theta) + + 0.5 * size_input[1] * math.sin(theta) + + 0.5 * size_target[0]) + matrix[1, 0] = math.sin(theta) * scale_y + matrix[1, 1] = math.cos(theta) * scale_y + matrix[1, 2] = scale_y * (-0.5 * size_input[0] * math.sin(theta) - + 0.5 * size_input[1] * math.cos(theta) + + 0.5 * size_target[1]) + return matrix + + +def warp_affine_joints(joints, mat): + """Apply affine transformation defined by the transform matrix on the + joints. + + Args: + joints (np.ndarray[..., 2]): Origin coordinate of joints. 
+ mat (np.ndarray[3, 2]): The affine matrix. + + Returns: + np.ndarray[..., 2]: Result coordinate of joints. + """ + joints = np.array(joints) + shape = joints.shape + joints = joints.reshape(-1, 2) + return np.dot( + np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1), + mat.T).reshape(shape) + + +def affine_transform_torch(pts, t): + npts = pts.shape[0] + pts_homo = torch.cat([pts, torch.ones(npts, 1, device=pts.device)], dim=1) + out = torch.mm(t, torch.t(pts_homo)) + return torch.t(out[:2, :]) \ No newline at end of file diff --git a/EdgeCape/models/utils/post_processing/smoother.py b/EdgeCape/models/utils/post_processing/smoother.py new file mode 100644 index 0000000000000000000000000000000000000000..d15db14ca98cc052da477db50c4d6e22df3b73d4 --- /dev/null +++ b/EdgeCape/models/utils/post_processing/smoother.py @@ -0,0 +1,227 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings +from typing import Dict, Union + +import numpy as np +from mmcv import Config, is_seq_of + +from mmpose.core.post_processing.temporal_filters import build_filter + + +class Smoother(): + """Smoother to apply temporal smoothing on pose estimation results with a + filter. + + Note: + T: The temporal length of the pose sequence + K: The keypoint number of each target + C: The keypoint coordinate dimension + + Args: + filter_cfg (dict | str): The filter config. See example config files in + `configs/_base_/filters/` for details. Alternatively a config file + path can be accepted and the config will be loaded. + keypoint_dim (int): The keypoint coordinate dimension, which is + also indicated as C. Default: 2 + keypoint_key (str): The dict key of the keypoints in the pose results. + Default: 'keypoints' + Example: + >>> import numpy as np + >>> # Build dummy pose result + >>> results = [] + >>> for t in range(10): + >>> results_t = [] + >>> for track_id in range(2): + >>> result = { + >>> 'track_id': track_id, + >>> 'keypoints': np.random.rand(17, 3) + >>> } + >>> results_t.append(result) + >>> results.append(results_t) + >>> # Example 1: Smooth multi-frame pose results offline. + >>> filter_cfg = dict(type='GaussianFilter', window_size=3) + >>> smoother = Smoother(filter_cfg, keypoint_dim=2) + >>> smoothed_results = smoother.smooth(results) + >>> # Example 2: Smooth pose results online frame-by-frame + >>> filter_cfg = dict(type='GaussianFilter', window_size=3) + >>> smoother = Smoother(filter_cfg, keypoint_dim=2) + >>> for result_t in results: + >>> smoothed_result_t = smoother.smooth(result_t) + """ + + def __init__(self, + filter_cfg: Union[Dict, str], + keypoint_dim: int = 2, + keypoint_key: str = 'keypoints'): + if isinstance(filter_cfg, str): + filter_cfg = Config.fromfile(filter_cfg).filter_cfg + self.filter_cfg = filter_cfg + self._filter = build_filter(filter_cfg) + self.keypoint_dim = keypoint_dim + self.key = keypoint_key + self.padding_size = self._filter.window_size - 1 + self.history = {} + + def _get_filter(self): + fltr = self._filter + if not fltr.shareable: + # If the filter is not shareable, build a new filter for the next + # requires + self._filter = build_filter(self.filter_cfg) + return fltr + + def _collate_pose(self, results): + """Collate the pose results to pose sequences. + + Args: + results (list[list[dict]]): The pose results of multiple frames. 
+ + Returns: + dict[str, np.ndarray]: A dict of collated pose sequences, where + the key is the track_id (in untracked scenario, the target index + will be used as the track_id), and the value is the pose sequence + in an array of shape [T, K, C] + """ + + if self._has_track_id(results): + # If the results have track_id, use it as the target indicator + results = [{res['track_id']: res + for res in results_t} for results_t in results] + track_ids = results[0].keys() + + for t, results_t in enumerate(results[1:]): + if results_t.keys() != track_ids: + raise ValueError(f'Inconsistent track ids in frame {t+1}') + + collated = { + id: np.stack([ + results_t[id][self.key][:, :self.keypoint_dim] + for results_t in results + ]) + for id in track_ids + } + else: + # If the results don't have track_id, use the target index + # as the target indicator + n_target = len(results[0]) + for t, results_t in enumerate(results[1:]): + if len(results_t) != n_target: + raise ValueError( + f'Inconsistent target number in frame {t+1}: ' + f'{len(results_t)} vs {n_target}') + + collated = { + id: np.stack([ + results_t[id][self.key][:, :self.keypoint_dim] + for results_t in results + ]) + for id in range(n_target) + } + + return collated + + def _scatter_pose(self, results, poses): + """Scatter the smoothed pose sequences and use them to update the pose + results. + + Args: + results (list[list[dict]]): The original pose results + poses (dict[str, np.ndarray]): The smoothed pose sequences + + Returns: + list[list[dict]]: The updated pose results + """ + updated_results = [] + for t, results_t in enumerate(results): + updated_results_t = [] + if self._has_track_id(results): + id2result = ((result['track_id'], result) + for result in results_t) + else: + id2result = enumerate(results_t) + + for track_id, result in id2result: + result = copy.deepcopy(result) + result[self.key][:, :self.keypoint_dim] = poses[track_id][t] + updated_results_t.append(result) + + updated_results.append(updated_results_t) + return updated_results + + @staticmethod + def _has_track_id(results): + """Check if the pose results contain track_id.""" + return 'track_id' in results[0][0] + + def smooth(self, results): + """Apply temporal smoothing on pose estimation sequences. + + Args: + results (list[dict] | list[list[dict]]): The pose results of a + single frame (non-nested list) or multiple frames (nested + list). The result of each target is a dict, which should + contains: + + - track_id (optional, Any): The track ID of the target + - keypoints (np.ndarray): The keypoint coordinates in [K, C] + + Returns: + (list[dict] | list[list[dict]]): Temporal smoothed pose results, + which has the same data structure as the input's. 
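+
+        When called frame by frame, a pose history of ``window_size - 1``
+        frames and the associated filter are kept per target between calls,
+        so the temporal context carries over to the next invocation.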
+ """ + + # Check if input is empty + if not (results) or not (results[0]): + warnings.warn('Smoother received empty result.') + return results + + # Check input is single frame or sequence + if is_seq_of(results, dict): + single_frame = True + results = [results] + else: + assert is_seq_of(results, list) + single_frame = False + + # Get temporal length of input + T = len(results) + + # Collate the input results to pose sequences + poses = self._collate_pose(results) + + # Smooth the pose sequence of each target + smoothed_poses = {} + update_history = {} + for track_id, pose in poses.items(): + if track_id in self.history: + # For tracked target, get its filter and pose history + pose_history, pose_filter = self.history[track_id] + if self.padding_size > 0: + # Pad the pose sequence with pose history + pose = np.concatenate((pose_history, pose), axis=0) + else: + # For new target, build a new filter + pose_filter = self._get_filter() + + # Update the history information + if self.padding_size > 0: + pose_history = pose[-self.padding_size:].copy() + else: + pose_history = None + update_history[track_id] = (pose_history, pose_filter) + + # Smooth the pose sequence with the filter + smoothed_pose = pose_filter(pose) + smoothed_poses[track_id] = smoothed_pose[-T:] + + self.history = update_history + + # Scatter the pose sequences back to the format of results + smoothed_results = self._scatter_pose(results, smoothed_poses) + + # If the input is single frame, remove the nested list to keep the + # output structure consistent with the input's + if single_frame: + smoothed_results = smoothed_results[0] + return smoothed_results \ No newline at end of file diff --git a/EdgeCape/models/utils/transformer.py b/EdgeCape/models/utils/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..de56b404646363fefd47ac495ba01503c394d5e8 --- /dev/null +++ b/EdgeCape/models/utils/transformer.py @@ -0,0 +1,329 @@ +import torch +import torch.nn as nn +from mmcv.cnn import (build_activation_layer, build_conv_layer, + build_norm_layer, xavier_init) +from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, + TransformerLayerSequence, + build_transformer_layer_sequence) +from mmcv.runner.base_module import BaseModule +from EdgeCape.models.utils.builder import TRANSFORMER + +@TRANSFORMER.register_module() +class Transformer(BaseModule): + """Implements the DETR transformer. + Following the official DETR implementation, this module copy-paste + from torch.nn.Transformer with modifications: + * positional encodings are passed in MultiheadAttention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers + See `paper: End-to-End Object Detection with Transformers + `_ for details. + Args: + encoder (`mmcv.ConfigDict` | Dict): Config of + TransformerEncoder. Defaults to None. + decoder ((`mmcv.ConfigDict` | Dict)): Config of + TransformerDecoder. Defaults to None + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Defaults to None. 
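+
+    Example (illustrative config sketch; the layer hyper-parameters below are
+    assumptions for demonstration, not values taken from EdgeCape configs):
+        >>> encoder_cfg = dict(
+        >>>     type='DetrTransformerEncoder',
+        >>>     num_layers=3,
+        >>>     transformerlayers=dict(
+        >>>         type='BaseTransformerLayer',
+        >>>         attn_cfgs=dict(
+        >>>             type='MultiheadAttention', embed_dims=256, num_heads=8),
+        >>>         feedforward_channels=1024,
+        >>>         operation_order=('self_attn', 'norm', 'ffn', 'norm')))
+        >>> decoder_cfg = dict(
+        >>>     type='DetrTransformerDecoder',
+        >>>     num_layers=3,
+        >>>     return_intermediate=True,
+        >>>     transformerlayers=dict(
+        >>>         type='DetrTransformerDecoderLayer',
+        >>>         attn_cfgs=dict(
+        >>>             type='MultiheadAttention', embed_dims=256, num_heads=8),
+        >>>         feedforward_channels=1024,
+        >>>         operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+        >>>                          'ffn', 'norm')))
+        >>> transformer = Transformer(encoder=encoder_cfg, decoder=decoder_cfg)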
+ """ + + def __init__(self, encoder=None, decoder=None, init_cfg=None): + super(Transformer, self).__init__(init_cfg=init_cfg) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = self.encoder.embed_dims + + def init_weights(self): + # follow the official DETR to init parameters + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + self._is_init = True + + def forward(self, x, mask, query_embed, pos_embed, mask_query): + """Forward function for `Transformer`. + Args: + x (Tensor): Input query with shape [bs, c, h, w] where + c = embed_dims. + mask (Tensor): The key_padding_mask used for encoder and decoder, + with shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, with shape + [num_query, c]. + pos_embed (Tensor): The positional encoding for encoder and + decoder, with the same shape as `x`. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - out_dec: Output from decoder. If return_intermediate_dec \ + is True output has shape [num_dec_layers, bs, + num_query, embed_dims], else has shape [1, bs, \ + num_query, embed_dims]. + - memory: Output results from encoder, with shape \ + [bs, embed_dims, h, w]. + + Notes: + x: query image features with shape [bs, c, h, w] + mask: mask for x with shape [bs, h, w] + pos_embed: positional embedding for x with shape [bs, c, h, w] + query_embed: sample keypoint features with shape [bs, num_query, c] + mask_query: mask for query_embed with shape [bs, num_query] + Outputs: + out_dec: [num_layers, bs, num_query, c] + memory: [bs, c, h, w] + + """ + bs, c, h, w = x.shape + # use `view` instead of `flatten` for dynamically exporting to ONNX + x = x.view(bs, c, -1).permute(2, 0, 1) # [bs, c, h, w] -> [h*w, bs, c] + mask = mask.view(bs, -1) # [bs, h, w] -> [bs, h*w] Note: this mask should be filled with False, since all images are with the same shape. + pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1) # positional embeding for memory, i.e., the query. + memory = self.encoder( + query=x, + key=None, + value=None, + query_pos=pos_embed, + query_key_padding_mask=mask) # output memory: [hw, bs, c] + + query_embed = query_embed.permute(1, 0, 2) # [bs, num_query, c] -> [num_query, bs, c] + # target = torch.zeros_like(query_embed) + # out_dec: [num_layers, num_query, bs, c] + out_dec = self.decoder( + query=query_embed, + key=memory, + value=memory, + key_pos=pos_embed, + # query_pos=query_embed, + query_key_padding_mask=mask_query, + key_padding_mask=mask) + out_dec = out_dec.transpose(1, 2) # [decoder_layer, bs, num_query, c] + memory = memory.permute(1, 2, 0).reshape(bs, c, h, w) + return out_dec, memory + +@TRANSFORMER_LAYER.register_module() +class DetrTransformerDecoderLayer(BaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. 
Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. + """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(DetrTransformerDecoderLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + # assert len(operation_order) == 6 + # assert set(operation_order) == set( + # ['self_attn', 'norm', 'cross_attn', 'ffn']) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetrTransformerEncoder(TransformerLayerSequence): + """TransformerEncoder of DETR. + Args: + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. Only used when `self.pre_norm` is `True` + """ + + def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs): + super(DetrTransformerEncoder, self).__init__(*args, **kwargs) + if post_norm_cfg is not None: + self.post_norm = build_norm_layer( + post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None + else: + # assert not self.pre_norm, f'Use prenorm in ' \ + # f'{self.__class__.__name__},' \ + # f'Please specify post_norm_cfg' + self.post_norm = None + + def forward(self, *args, **kwargs): + """Forward function for `TransformerCoder`. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + x = super(DetrTransformerEncoder, self).forward(*args, **kwargs) + if self.post_norm is not None: + x = self.post_norm(x) + return x + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetrTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, + *args, + post_norm_cfg=dict(type='LN'), + return_intermediate=False, + **kwargs): + + super(DetrTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + if post_norm_cfg is not None: + self.post_norm = build_norm_layer(post_norm_cfg, + self.embed_dims)[1] + else: + self.post_norm = None + + def forward(self, query, *args, **kwargs): + """Forward function for `TransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + if not self.return_intermediate: + x = super().forward(query, *args, **kwargs) + if self.post_norm: + x = self.post_norm(x)[None] + return x + + intermediate = [] + for layer in self.layers: + query = layer(query, *args, **kwargs) + if self.return_intermediate: + if self.post_norm is not None: + intermediate.append(self.post_norm(query)) + else: + intermediate.append(query) + return torch.stack(intermediate) + + +@TRANSFORMER.register_module() +class DynamicConv(BaseModule): + """Implements Dynamic Convolution. + This module generate parameters for each sample and + use bmm to implement 1*1 convolution. Code is modified + from the `official github repo `_ . + Args: + in_channels (int): The input feature channel. + Defaults to 256. 
+ feat_channels (int): The inner feature channel. + Defaults to 64. + out_channels (int, optional): The output feature channel. + When not specified, it will be set to `in_channels` + by default + input_feat_shape (int): The shape of input feature. + Defaults to 7. + with_proj (bool): Project two-dimentional feature to + one-dimentional feature. Default to True. + act_cfg (dict): The activation config for DynamicConv. + norm_cfg (dict): Config dict for normalization layer. Default + layer normalization. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + in_channels=256, + feat_channels=64, + out_channels=None, + input_feat_shape=7, + with_proj=True, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + init_cfg=None): + super(DynamicConv, self).__init__(init_cfg) + self.in_channels = in_channels + self.feat_channels = feat_channels + self.out_channels_raw = out_channels + self.input_feat_shape = input_feat_shape + self.with_proj = with_proj + self.act_cfg = act_cfg + self.norm_cfg = norm_cfg + self.out_channels = out_channels if out_channels else in_channels + + self.num_params_in = self.in_channels * self.feat_channels + self.num_params_out = self.out_channels * self.feat_channels + self.dynamic_layer = nn.Linear( + self.in_channels, self.num_params_in + self.num_params_out) + + self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1] + self.norm_out = build_norm_layer(norm_cfg, self.out_channels)[1] + + self.activation = build_activation_layer(act_cfg) + + num_output = self.out_channels * input_feat_shape**2 + if self.with_proj: + self.fc_layer = nn.Linear(num_output, self.out_channels) + self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1] + + def forward(self, param_feature, input_feature): + """Forward function for `DynamicConv`. + Args: + param_feature (Tensor): The feature can be used + to generate the parameter, has shape + (num_all_proposals, in_channels). + input_feature (Tensor): Feature that + interact with parameters, has shape + (num_all_proposals, in_channels, H, W). + Returns: + Tensor: The output feature has shape + (num_all_proposals, out_channels). 
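+
+        The parameters predicted from ``param_feature`` are split into an
+        input projection (``in_channels -> feat_channels``) and an output
+        projection (``feat_channels -> out_channels``); both are applied to
+        the flattened spatial features with batched matrix multiplication.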
+ """ + input_feature = input_feature.flatten(2).permute(2, 0, 1) + + input_feature = input_feature.permute(1, 0, 2) + parameters = self.dynamic_layer(param_feature) + + param_in = parameters[:, :self.num_params_in].view( + -1, self.in_channels, self.feat_channels) + param_out = parameters[:, -self.num_params_out:].view( + -1, self.feat_channels, self.out_channels) + + # input_feature has shape (num_all_proposals, H*W, in_channels) + # param_in has shape (num_all_proposals, in_channels, feat_channels) + # feature has shape (num_all_proposals, H*W, feat_channels) + features = torch.bmm(input_feature, param_in) + features = self.norm_in(features) + features = self.activation(features) + + # param_out has shape (batch_size, feat_channels, out_channels) + features = torch.bmm(features, param_out) + features = self.norm_out(features) + features = self.activation(features) + + if self.with_proj: + features = features.flatten(1) + features = self.fc_layer(features) + features = self.fc_norm(features) + features = self.activation(features) + + return features \ No newline at end of file diff --git a/EdgeCape/models/utils/visualization.py b/EdgeCape/models/utils/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..a400051e0b2fdb413d6ef0ebd44049071bc927fb --- /dev/null +++ b/EdgeCape/models/utils/visualization.py @@ -0,0 +1,629 @@ +import collections +import os +import random + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import torch +import numpy as np +import torch.nn.functional as F +import uuid + +from matplotlib.colors import BoundaryNorm +import matplotlib.patheffects as mpe +from itertools import cycle + +colors = [ + [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], + [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255], + [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255], + [255, 0, 255], [255, 0, 170], [255, 0, 85]] + + +def plot_heatmap(support_img, query_img, heatmaps, support_kp, support_w, query_kp, query_w, skeleton, + initial_proposals, prediction, radius=6, n_heatmaps=5): + h, w, c = support_img.shape + fig, axes = plt.subplots(n_heatmaps + 1, 4, gridspec_kw={'wspace': 0, 'hspace': 0}) + fig.set_size_inches(40, 10 * (n_heatmaps - 1), forward=True) + [axi.set_axis_off() for axi in axes.ravel()] + plt.subplots_adjust(wspace=0, hspace=0) + # Plot Skeleton + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img)) + axes[0, 0].imshow(support_img) + axes[0, 1].imshow(query_img) + axes[0, 2].imshow(support_img) + axes[0, 3].imshow(query_img) + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = support_kp[k, :2] + c = (1, 0, 0, 0.75) if support_w[k] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius, color=c) + axes[0, 0].add_patch(patch) + axes[0, 0].text(kp[0], kp[1], k) + kp = query_kp[k, :2] + c = (1, 0, 0, 0.75) if query_kp[k, 2] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius, color=c) + axes[0, 1].add_patch(patch) + axes[0, 1].text(kp[0], kp[1], k) + plt.draw() + for l, limb in enumerate(skeleton): + if l > len(colors) - 1: + c = [x / 255 for x in random.sample(range(0, 255), 3)] + else: + c = [x / 255 for x in colors[l]] + if support_w[limb[0]] > 0 and support_w[limb[1]] > 0 and query_w[limb[0]] > 0 and query_w[limb[1]] > 0: + patch = plt.Line2D([support_kp[limb[0], 0], support_kp[limb[1], 0]], + 
[support_kp[limb[0], 1], support_kp[limb[1], 1]], + linewidth=2, color=c, alpha=0.5) + axes[0, 2].add_artist(patch) + patch = plt.Line2D([query_kp[limb[0], 0], query_kp[limb[1], 0]], + [query_kp[limb[0], 1], query_kp[limb[1], 1]], + linewidth=2, color=c, alpha=0.5) + axes[0, 3].add_artist(patch) + # Plot heatmap + prediction = prediction[-1] * h + initial_proposals = initial_proposals[0] * h + # similarity_map = F.interpolate(heatmaps[:, None], size=(h, w), mode='bilinear').squeeze() + similarity_map = heatmaps + # similarity_map_shape = similarity_map.shape + # similarity_map = similarity_map.reshape(*similarity_map_shape[:2], -1) + # similarity_map = (similarity_map - torch.min( + # similarity_map, dim=2)[0].unsqueeze(2)) / ( + # torch.max(similarity_map, dim=2)[0].unsqueeze(2) - + # torch.min(similarity_map, dim=2)[0].unsqueeze(2) + 1e-10) + j = 0 + for i in range(n_heatmaps): + if support_w[j] > 0 and query_w[j] > 0: + if i > len(colors) - 1: + c = [x / 255 for x in random.sample(range(0, 255), 3)] + else: + c = [x / 255 for x in colors[i]] + kp = support_kp[j, :2] + patch = plt.Circle(kp, radius, color=c, alpha=0.6) + axes[i + 1, 0].add_patch(patch) + axes[i + 1, 0].text(kp[0], kp[1], j) + axes[i + 1, 0].imshow(support_img) + axes[i + 1, 1].imshow(similarity_map[j].cpu().numpy(), alpha=0.6, cmap='jet') + axes[i + 1, 2].imshow(query_img) + patch = plt.Circle(initial_proposals[j], 0.2 * h, color=c, alpha=0.6) + axes[i + 1, 2].add_patch(patch) + patch = plt.Circle(query_kp[j], radius, color=(1, 0, 0), alpha=0.8) + axes[i + 1, 2].add_patch(patch) + axes[i + 1, 2].text(initial_proposals[j][0], initial_proposals[j][1], j) + axes[i + 1, 3].imshow(query_img) + patch = plt.Circle(prediction[j], 0.2 * h, color=c, alpha=0.6) + axes[i + 1, 3].add_patch(patch) + patch = plt.Circle(query_kp[j], radius, color=(1, 0, 0), alpha=0.8) + axes[i + 1, 3].add_patch(patch) + axes[i + 1, 3].text(initial_proposals[j][0], initial_proposals[j][1], j) + j += 1 + if j > 99: + break + img_names = [img.split(".")[0] for img in os.listdir('./heatmaps') if img.endswith('.png')] + if len(img_names) > 0: + name_idx = max([int(img_name) for img_name in img_names]) + 1 + else: + name_idx = 0 + plt.savefig(f'./heatmaps/{str(name_idx)}.png') + plt.clf() + + +def plot_attn(support_img, query_img, similarity_map, support_kp, support_w, query_kp, query_w, skeleton, + attn_map, adjs, prediction, radius=14, n_heatmaps=1): + h, w, c = support_img.shape + plt.rc('xtick', labelsize=18) + plt.rc('ytick', labelsize=18) + fig, axes = plt.subplots(4, 4, gridspec_kw={'wspace': 0.2, 'hspace': 0.2}) + fig.set_size_inches(50, 50, forward=True) + axes[0, 0].set_axis_off() + axes[0, 1].set_axis_off() + axes[0, 2].set_axis_off() + axes[0, 3].set_axis_off() + plt.subplots_adjust(wspace=0.2, hspace=0.2) + # Plot Skeleton + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img)) + axes[0, 0].imshow(support_img) + axes[0, 1].imshow(query_img) + axes[0, 2].imshow(support_img) + axes[0, 3].imshow(query_img) + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = support_kp[k, :2] + c = (1, 0, 0, 0.75) if support_w[k] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius, color=c) + axes[0, 0].add_patch(patch) + axes[0, 0].text(kp[0], kp[1], k) + kp = query_kp[k, :2] + c = (1, 0, 0, 0.75) if query_kp[k, 2] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius, color=c) + axes[0, 1].add_patch(patch) + 
axes[0, 1].text(kp[0], kp[1], k) + plt.draw() + for l, limb in enumerate(skeleton): + if l > len(colors) - 1: + c = [x / 255 for x in random.sample(range(0, 255), 3)] + else: + c = [x / 255 for x in colors[l]] + if support_w[limb[0]] > 0 and support_w[limb[1]]: + patch = plt.Line2D([support_kp[limb[0], 0], support_kp[limb[1], 0]], + [support_kp[limb[0], 1], support_kp[limb[1], 1]], + linewidth=8, color=c, alpha=0.5) + axes[0, 2].add_artist(patch) + if query_w[limb[0]] > 0 and query_w[limb[1]]: + patch = plt.Line2D([query_kp[limb[0], 0], query_kp[limb[1], 0]], + [query_kp[limb[0], 1], query_kp[limb[1], 1]], + linewidth=8, color=c, alpha=0.5) + axes[0, 3].add_artist(patch) + # Plot heatmap + axes[1, 0].set_title("GT") + axes[1, 1].set_title("L1") + axes[1, 2].set_title("L2") + axes[1, 3].set_title("L3") + min_kp_pos = np.argmax(np.cumsum(query_w)) + 1 + mask = torch.from_numpy(query_w).bool()[None] + gt_A = adj_mx_from_edges(num_pts=100, skeleton=[skeleton], device=mask.device).cpu().numpy() + gt_A = gt_A[:min_kp_pos, :min_kp_pos] + axes[1, 0].imshow(gt_A, alpha=0.6, cmap='Reds') + for i in range(min_kp_pos): + for j in range(min_kp_pos): + text = axes[1, 0].text(j, i, np.round(gt_A[i, j], 2), ha="center", va="center") + np.fill_diagonal(gt_A, 0) + axes[2, 0].imshow(gt_A, alpha=0.6, cmap='Reds') + for i in range(min_kp_pos): + for j in range(min_kp_pos): + text = axes[2, 0].text(j, i, np.round(gt_A[i, j], 2), ha="center", va="center") + for col, attn in enumerate(attn_map): + heatmap = attn[:, :min_kp_pos, :min_kp_pos].squeeze().cpu().numpy() + axes[1, col+1].imshow(heatmap, alpha=0.6, cmap='Reds') + for i in range(min_kp_pos): + for j in range(min_kp_pos): + text = axes[1, col+1].text(j, i, np.round(heatmap[i, j], 2), ha="center", va="center") + # np.fill_diagonal(heatmap, 0) + # heatmap = heatmap / heatmap.sum(1, keepdims=True) + axes[2, col+1].imshow(heatmap, alpha=0.6, cmap='Reds') + for i in range(min_kp_pos): + for j in range(min_kp_pos): + text = axes[2, col+1].text(j, i, np.round(heatmap[i, j], 2), ha="center", va="center") + + # Plot self-attention on image + self_attention_skeleton = [] + for i in range(min_kp_pos): + topk = np.argsort(heatmap[i])[::-1] + for m in range(5): + self_attention_skeleton.append([i, topk[m], heatmap[i, topk[m]]]) + axes[3, col+1].imshow(query_img) + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = query_kp[k, :2] + c = (1, 0, 0, 0.75) if query_kp[k, 2] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius//2, color=c) + axes[3, col+1].add_patch(patch) + axes[3, col+1].text(kp[0], kp[1], k, fontsize=12) + plt.draw() + for l, limb in enumerate(self_attention_skeleton): + if query_w[limb[0]] > 0 and query_w[limb[1]]: + patch = plt.Line2D( + [query_kp[limb[0], 0], query_kp[limb[1], 0]], + [query_kp[limb[0], 1], query_kp[limb[1], 1]], + linewidth=30*limb[2], color='red', alpha=limb[2]) + axes[3, col+1].add_artist(patch) + # cur_adj = torch.nn.functional.sigmoid(adjs[col])[:, :min_kp_pos, :min_kp_pos].squeeze().cpu().numpy() + # axes[3, col + 1].imshow(cur_adj, alpha=0.6, cmap='Reds') + # for i in range(min_kp_pos): + # for j in range(min_kp_pos): + # text = axes[3, col + 1].text(j, i, np.round(cur_adj[i, j], 2), ha="center", va="center") + img_names = [img.split(".")[0] for img in os.listdir('./heatmaps') if str_is_int(img.split(".")[0])] + if len(img_names) > 0: + name_idx = max([int(img_name) for img_name in img_names]) + 1 + else: + name_idx = 0 + # crete dir + # if not os.path.isdir(f'./heatmaps/{str(name_idx)}'): + # 
os.mkdir(f'./heatmaps/{str(name_idx)}') + plt.savefig(f'./heatmaps/{str(name_idx)}.png') + extent = axes[3,3].get_window_extent().transformed( + fig.dpi_scale_trans.inverted()) + fig.savefig(f'./heatmaps/layer_{str(name_idx)}.png', bbox_inches=extent) + # for k, row in enumerate(axes): + # for i, ax in enumerate(row): + # extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted()) + # plt.savefig(f'./heatmaps/{str(name_idx)}/{str(k)}_{str(i)}.png', bbox_inches=extent) + + plt.clf() + + +def plot_results(support_img, query_img, support_kp, support_w, query_kp, query_w, + skeleton=None, initial_proposals=None, prediction=None, + radius=6, out_dir='./heatmaps', file_name=None, in_color=None, + original_skeleton=None, img_alpha=0.6, target_keypoints=None): + img_names = [img.split("_")[0] for img in os.listdir(out_dir) if str_is_int(img.split("_")[0])] + if file_name is None: + if len(img_names) > 0: + name_idx = str(max([int(img_name) for img_name in img_names]) + 1) + else: + name_idx = '0' + else: + name_idx = file_name + # crete dir + # if not os.path.isdir(f'./heatmaps/{str(name_idx)}'): + # os.mkdir(f'./heatmaps/{str(name_idx)}') + + h, w, c = support_img.shape + prediction = prediction[-1] * h + if isinstance(prediction, torch.Tensor): + prediction = prediction.cpu().numpy() + if isinstance(skeleton, list): + skeleton = adj_mx_from_edges(num_pts=100, skeleton=[skeleton]).cpu().numpy()[0] + original_skeleton = skeleton + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img)) + error_mask = None + for id, (img, w, keypoint, adj) in enumerate(zip([support_img, support_img, query_img], + [support_w, support_w, query_w], + # [support_kp, query_kp])): + [support_kp, support_kp, prediction], + [original_skeleton, skeleton, skeleton])): + color = in_color + f, axes = plt.subplots() + plt.imshow(img, alpha=img_alpha) + + # On qeury image plot + if id == 2 and target_keypoints is not None: + error = np.linalg.norm(keypoint - target_keypoints, axis=-1) + error_mask = error > (256 * 0.05) + + for k in range(keypoint.shape[0]): + if w[k] > 0: + kp = keypoint[k, :2] + c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6) + if error_mask is not None and error_mask[k]: + c = (1, 1, 0, 0.75) + patch = plt.Circle(kp, + radius, + color=c, + path_effects=[mpe.withStroke(linewidth=8, foreground='black'), + mpe.withStroke(linewidth=4, foreground='white'), + mpe.withStroke(linewidth=2, foreground='black'), + ], + zorder=260) + axes.add_patch(patch) + axes.text(kp[0], kp[1], k, fontsize=10, color='black', ha="center", va="center", zorder=320,) + else: + patch = plt.Circle(kp, + radius, + color=c, + path_effects=[mpe.withStroke(linewidth=2, foreground='black')], + zorder=200) + axes.add_patch(patch) + axes.text(kp[0], kp[1], k, fontsize=(radius+4), color='white', ha="center", va="center", zorder=300, + path_effects=[mpe.withStroke(linewidth=max(1, int((radius+4)/5)), foreground='black')]) + # axes.text(kp[0], kp[1], k) + plt.draw() + # Create keypoint pairs index list + # color_hack = { + # (0, 1): '3000ff', + # (1, 2): 'ff008a', + # (2, 3): 'ff00de', + # (3, 4): 'd200ff', + # (4, 5): '8400ff', + # (5, 0): '003cff', + # } + # reverse_key_color_hack = {(k[1], k[0]): v for k, v in color_hack.items()} + # color_hack = {**color_hack, **reverse_key_color_hack} + if adj is not None: + # Make max value 6 + draw_skeleton = adj ** 1 + max_skel_val = np.max(draw_skeleton) + 
draw_skeleton = draw_skeleton / max_skel_val * 6 + for i in range(1, keypoint.shape[0]): + for j in range(0, i): + # if c_index > len(colors) - 1: + # c = [x / 255 for x in random.sample(range(0, 255), 3)] + # else: + # c = [x / 255 for x in colors[c_index]] + # if (i, j) in color_hack: + # c = color_hack[(i, j)] + # c = [int(c[i:i + 2], 16) / 255 for i in (0, 2, 4)] + # c_index -= 1 + if w[i] > 0 and w[j] > 0 and original_skeleton[i][j] > 0: + if color is None: + num_colors = int((skeleton > 0.05).sum() / 2) + color = iter(plt.cm.rainbow(np.linspace(0, 1, num_colors+1))) + c = next(color) + elif isinstance(color, str): + c = color + elif isinstance(color, collections.Iterable): + c = next(color) + else: + raise ValueError("Color must be a string or an iterable") + if w[i] > 0 and w[j] > 0 and skeleton[i][j] > 0: + width = draw_skeleton[i][j] + stroke_width = width + (width / 3) + patch = plt.Line2D([keypoint[i, 0], keypoint[j, 0]], + [keypoint[i, 1], keypoint[j, 1]], + linewidth=width, color=c, alpha=0.6, + path_effects=[mpe.withStroke(linewidth=stroke_width, foreground='black')], zorder=1) + axes.add_artist(patch) + + plt.axis('off') # command for hiding the axis. + plt.savefig(f'./{out_dir}/{str(name_idx)}_{str(id)}.png', bbox_inches='tight', pad_inches=0) + plt.clf() + # plt.close('all') + + +def old_plot_results(support_img, query_img, support_kp, support_w, query_kp, query_w, skeleton, + initial_proposals, prediction, radius=6, out_dir='./heatmaps', + file_name=None): + img_names = [img.split("_")[0] for img in os.listdir(out_dir) if str_is_int(img.split("_")[0])] + if file_name is None: + if len(img_names) > 0: + name_idx = str(max([int(img_name) for img_name in img_names]) + 1) + else: + name_idx = '0' + else: + name_idx = file_name + # crete dir + # if not os.path.isdir(f'./heatmaps/{str(name_idx)}'): + # os.mkdir(f'./heatmaps/{str(name_idx)}') + + h, w, c = support_img.shape + prediction = prediction[-1].cpu().numpy() * h + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img)) + + for id, (img, w, keypoint) in enumerate(zip([support_img, query_img], + [support_w, query_w], + # [support_kp, query_kp])): + [support_kp, prediction])): + f, axes = plt.subplots() + plt.imshow(img) + for k in range(keypoint.shape[0]): + if w[k] > 0: + kp = keypoint[k, :2] + c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius, color=c) + axes.add_patch(patch) + axes.text(kp[0], kp[1], k, fontsize=20) + # axes.text(kp[0], kp[1], k) + plt.draw() + # Create keypoint pairs index list + # color_hack = { + # (0, 1): '3000ff', + # (1, 2): 'ff008a', + # (2, 3): 'ff00de', + # (3, 4): 'd200ff', + # (4, 5): '8400ff', + # (5, 0): '003cff', + # } + # reverse_key_color_hack = {(k[1], k[0]): v for k, v in color_hack.items()} + # color_hack = {**color_hack, **reverse_key_color_hack} + # c_index = 0 + # for i in range(1, keypoint.shape[0]): + # for j in range(0, i): + # if c_index > len(colors) - 1: + # c = [x / 255 for x in random.sample(range(0, 255), 3)] + # else: + # c = [x / 255 for x in colors[c_index]] + # if (i, j) in color_hack: + # c = color_hack[(i, j)] + # c = [int(c[i:i + 2], 16) / 255 for i in (0, 2, 4)] + # c_index -= 1 + # if w[i] > 0 and w[j] > 0 and skeleton[i][j] > 0: + # patch = plt.Line2D([keypoint[i, 0], keypoint[j, 0]], + # [keypoint[i, 1], keypoint[j, 1]], + # # linewidth=skeleton[i][j]*20, color=c, alpha=0.6) + # linewidth=5, color=c, 
alpha=0.6) + # axes.add_artist(patch) + # c_index += 1 + + for l, limb in enumerate(skeleton): + kp = keypoint[:, :2] + if l > len(colors) - 1: + c = [x / 255 for x in random.sample(range(0, 255), 3)] + else: + c = [x / 255 for x in colors[l]] + if w[limb[0]] > 0 and w[limb[1]] > 0: + patch = plt.Line2D([kp[limb[0], 0], kp[limb[1], 0]], + [kp[limb[0], 1], kp[limb[1], 1]], + linewidth=6, color=c, alpha=0.6) + axes.add_artist(patch) + plt.axis('off') # command for hiding the axis. + plt.savefig(f'./{out_dir}/{str(name_idx)}_{str(id)}.png', bbox_inches='tight', pad_inches=0) + plt.clf() + +def str_is_int(s): + try: + int(s) + return True + except ValueError: + return False + + +def adj_mx_from_edges(num_pts, skeleton, device='cuda', normalization_fix=True): + adj_mx = torch.empty(0, device=device) + batch_size = len(skeleton) + for b in range(batch_size): + edges = torch.tensor(skeleton[b]) + adj = torch.zeros(num_pts, num_pts, device=device) + adj[edges[:, 0], edges[:, 1]] = 1 + adj_mx = torch.concatenate((adj_mx, adj.unsqueeze(0)), dim=0) + trans_adj_mx = torch.transpose(adj_mx, 1, 2) + cond = (trans_adj_mx > adj_mx).float() + adj = adj_mx + trans_adj_mx * cond - adj_mx * cond + # if normalization_fix: + # adj = adj * ~mask[..., None] * ~mask[:, None] + # adj = torch.nan_to_num(adj / adj.sum(dim=-1, keepdim=True)) + # else: + # adj = torch.nan_to_num(adj / adj.sum(dim=2, keepdim=True)) * ~mask[..., None] * ~mask[:, None] + # adj = torch.stack((torch.diag_embed(~mask), adj), dim=1) + return adj + +def vis_skeleton(support_img, support_kp, support_w, a_pred, a_gt, file_name=None, radius=3, line_width=6, alpha=0.8): + h, w, c = support_img.shape + # Normalize the support image + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + # Create figure + fig, axes = plt.subplots(2, 2, figsize=(20, 20), gridspec_kw={'height_ratios': [1, 1]}) + axes[0, 0].imshow(support_img, alpha=0.6) + axes[0, 1].imshow(support_img, alpha=0.6) + axes[1, 0].imshow(support_img, alpha=0.6) + axes[1, 1].axis('off') # Hide the unused subplot + + a_pred = (a_pred + a_pred.T) / 2 + scaled_a_pred = a_pred ** 1 * line_width + max_val = np.max(scaled_a_pred) + for i in range(a_pred.shape[0]): + for j in range(i + 1, a_pred.shape[1]): + if support_w[i] > 0 and support_w[j] > 0 and a_pred[i, j] > 0: + kp1 = support_kp[i, :2] + kp2 = support_kp[j, :2] + width = scaled_a_pred[i, j] + stroke_width = width + (width / 3) + outline = mpe.withStroke(linewidth=stroke_width, foreground='black') + patch = plt.Line2D([kp1[0], kp2[0]], [kp1[1], kp2[1]], + path_effects=[outline], + linewidth=width, + color='blue', + alpha=alpha) + axes[0, 0].add_artist(patch) + + # Plot keypoints and skeleton for predicted adjacency matrix + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = support_kp[k, :2] + outline = mpe.withStroke(linewidth=2, foreground='black') + patch = plt.Circle(kp, radius, color=(1, 0, 0, 1), path_effects=[outline], zorder=200) + axes[0, 0].add_patch(patch) + + a_gt = (a_gt + a_gt.T) / 2 + for i in range(a_gt.shape[0]): + for j in range(i + 1, a_gt.shape[1]): + if support_w[i] > 0 and support_w[j] > 0 and a_gt[i, j] > 0: + kp1 = support_kp[i, :2] + kp2 = support_kp[j, :2] + width = a_gt[i, j] * max_val + outline = mpe.withStroke(linewidth=width+2, foreground='black') + patch = plt.Line2D([kp1[0], kp2[0]], [kp1[1], kp2[1]], + path_effects=[outline], + linewidth=width, + color='green', + alpha=alpha) + axes[0, 1].add_artist(patch) + + # Plot keypoints and skeleton for 
predicted adjacency matrix + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = support_kp[k, :2] + outline = mpe.withStroke(linewidth=3, foreground='black') + patch = plt.Circle(kp, radius, color=(1, 0, 0, 1), path_effects=[outline], zorder=200) + axes[0, 1].add_patch(patch) + # axes[0, 0].text(kp[0], kp[1], + # k, + # path_effects=[mpe.Stroke(linewidth=2, foreground='black'), mpe.Normal()], + # fontsize=12, + # color='white', + # ha="center", + # va="center", + # zorder=300) + + # Calculate the difference and plot the skeleton with color based on the difference + diff = (a_pred - a_gt) / (a_gt + 1e-10) + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = support_kp[k, :2] + patch = plt.Circle(kp, radius, color=(1, 0, 0, 0.75)) + axes[1, 0].add_patch(patch) + axes[1, 0].text(kp[0], kp[1], k, fontsize=8) + + cmap = shiftedColorMap(plt.cm.Spectral, midpoint=0.34) + norm = plt.Normalize(vmin=-1., vmax=2.) + + for i in range(diff.shape[0]): + for j in range(i + 1, diff.shape[1]): + if support_w[i] > 0 and support_w[j] > 0 and diff[i, j] != 0: + kp1 = support_kp[i, :2] + kp2 = support_kp[j, :2] + color = cmap(norm(diff[i, j])) + patch = plt.Line2D([kp1[0], kp2[0]], [kp1[1], kp2[1]], + linewidth=line_width/2, + color=color, + alpha=alpha) + axes[1, 0].add_artist(patch) + + # axes[0, 0].set_title('Predicted Adjacency Matrix') + # axes[0, 1].set_title('Ground-Truth Adjacency Matrix') + # axes[1, 0].set_title(r'$\frac{(a_{pred} - a_{gt})}{a_{gt}}$') + + for ax in axes[0, :]: + ax.axis('off') + for ax in axes[1, :]: + ax.axis('off') + + cbar = fig.colorbar(plt.cm.ScalarMappable(cmap=cmap, norm=norm), ax=axes[1, 0], orientation='vertical') + cbar.set_label('Difference') + + if file_name: + path = f'./heatmaps/{file_name}' + plt.savefig(f'{path}_pred.png', bbox_inches='tight', pad_inches=0) + extent = axes[0, 0].get_window_extent().transformed(fig.dpi_scale_trans.inverted()) + fig.savefig(f'{path}_prediction.png', bbox_inches=extent) + extent = axes[0, 1].get_window_extent().transformed(fig.dpi_scale_trans.inverted()) + fig.savefig(f'{path}_gt.png', bbox_inches=extent) + extent = axes[1, 0].get_window_extent().transformed(fig.dpi_scale_trans.inverted()) + fig.savefig(f'{path}_diff.png', bbox_inches=extent.expanded(1.6, 1.3)) + plt.cla() + + +def shiftedColorMap(cmap, start=0, midpoint=0.5, stop=1.0, name='shiftedcmap'): + ''' + Function to offset the "center" of a colormap. Useful for + data with a negative min and positive max and you want the + middle of the colormap's dynamic range to be at zero. + + Input + ----- + cmap : The matplotlib colormap to be altered + start : Offset from lowest point in the colormap's range. + Defaults to 0.0 (no lower offset). Should be between + 0.0 and `midpoint`. + midpoint : The new center of the colormap. Defaults to + 0.5 (no shift). Should be between 0.0 and 1.0. In + general, this should be 1 - vmax / (vmax + abs(vmin)) + For example if your data range from -15.0 to +5.0 and + you want the center of the colormap at 0.0, `midpoint` + should be set to 1 - 5/(5 + 15)) or 0.75 + stop : Offset from highest point in the colormap's range. + Defaults to 1.0 (no upper offset). Should be between + `midpoint` and 1.0. 
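+
+    Example (an illustrative sketch; ``data`` stands in for any 2-D array
+    you want to display)::
+
+        new_cmap = shiftedColorMap(matplotlib.cm.RdBu, midpoint=0.75)
+        plt.imshow(data, cmap=new_cmap)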
+ ''' + cdict = { + 'red': [], + 'green': [], + 'blue': [], + 'alpha': [] + } + + # regular index to compute the colors + reg_index = np.linspace(start, stop, 257) + + # shifted index to match the data + shift_index = np.hstack([ + np.linspace(0.0, midpoint, 128, endpoint=False), + np.linspace(midpoint, 1.0, 129, endpoint=True) + ]) + + for ri, si in zip(reg_index, shift_index): + r, g, b, a = cmap(ri) + + cdict['red'].append((si, r, r)) + cdict['green'].append((si, g, g)) + cdict['blue'].append((si, b, b)) + cdict['alpha'].append((si, a, a)) + + newcmap = matplotlib.colors.LinearSegmentedColormap(name, cdict) + # plt.register_cmap(cmap=newcmap) + + return newcmap \ No newline at end of file diff --git a/EdgeCape/version.py b/EdgeCape/version.py new file mode 100644 index 0000000000000000000000000000000000000000..11e2ca150513d566dca35e045548b914b1b6bfa7 --- /dev/null +++ b/EdgeCape/version.py @@ -0,0 +1,5 @@ +# GENERATED VERSION FILE +# TIME: Wed May 31 16:07:32 2023 +__version__ = '0.2.0+818517e' +short_version = '0.2.0' +version_info = (0, 2, 0) diff --git a/README.md b/README.md index f39d3b782cd5df9a0bbc7cdd5a0e0ec662fb53fa..a214aa3e6322693b09c7989cfa7dc8a400b8e080 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,107 @@ ---- -title: EdgeCape -emoji: 🏆 -colorFrom: yellow -colorTo: blue -sdk: gradio -sdk_version: 5.7.1 -app_file: app.py -pinned: false ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# Edge Weight Prediction For Category-Agnostic Pose Estimation + + + + + + +By [Or Hirschorn](https://scholar.google.co.il/citations?user=GgFuT_QAAAAJ&hl=iw&oi=ao) and [Shai Avidan](https://scholar.google.co.il/citations?hl=iw&user=hpItE1QAAAAJ) + +This repo is the official implementation of "[Edge Weight Prediction For Category-Agnostic Pose Estimation +](https://arxiv.org/abs/2411.16665)". + +# Hugging Face Demo Coming Soon! +### Stay tuned for the upcoming demo release! + + +## 🔔 News +- **`25 November 2024`** Initial Code Release + + +## Introduction +Given only one example image and skeleton, our method refines the skeleton to enhance pose estimation on unseen categories. + +Using our method, given a support image and skeleton we can refine the structure for better pose estimation on images from unseen categories. + +## Citation +Please consider citing our paper and GraphCape if you found our work useful: +```bibtex +@misc{hirschorn2024edgeweightpredictioncategoryagnostic, + title={Edge Weight Prediction For Category-Agnostic Pose Estimation}, + author={Or Hirschorn and Shai Avidan}, + year={2024}, + eprint={2411.16665}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2411.16665}, +} + +@misc{hirschorn2023pose, + title={A Graph-Based Approach for Category-Agnostic Pose Estimation}, + author={Or Hirschorn and Shai Avidan}, + year={2024}, + eprint={2311.17891}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2311.17891}, +} +``` + +## Getting Started + +### Docker [Recommended] +We provide a docker image for easy use. +You can simply pull the docker image from docker hub, containing all the required libraries and packages: + +``` +docker pull orhir/edgecape +docker run --name edgecape -v {DATA_DIR}:/workspace/EdgeCape/EdgeCape/data/mp100 -it orhir/edgecape /bin/bash +``` +### Conda Environment +We train and evaluate our model on Python 3.8 and Pytorch 2.0.1 with CUDA 12.1. + +Please first install pytorch and torchvision following official documentation Pytorch. 
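+For reference, a minimal environment setup might look like the sketch below (illustrative only; the environment name is arbitrary, and the exact CUDA-specific install command should come from the official PyTorch installation selector):
+```
+conda create -n edgecape python=3.8 -y
+conda activate edgecape
+pip install torch==2.0.1 torchvision==0.15.2
+```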
+Then, follow [MMPose](https://mmpose.readthedocs.io/en/latest/installation.html) to install the following packages: +``` +mmcv-full=1.7.2 +mmpose=0.29.0 +``` +Having installed these packages, run: +``` +python setup.py develop +``` + +## MP-100 Dataset +Please follow the [official guide](https://github.com/orhir/PoseAnything) to prepare the MP-100 dataset for training and evaluation, and organize the data structure properly. + +## Training + +### Training +To train the model, run: +``` +python run.py --config [path_to_config_file] --work_dir [path_to_work_dir] +``` + +## Evaluation and Pretrained Models + +### Evaluation +The evaluation on a single GPU will take approximately 30 min. + +To evaluate the pretrained model, run: +``` +python test.py [path_to_config_file] [path_to_pretrained_ckpt] +``` + +### Pretrained Models + +You can download the pretrained models from following [link](https://drive.google.com/drive/folders/1gbeeVQ-Y8Dj2FrsDatf5ZLWpzv5u8HyL?usp=sharing). + +## Acknowledgement + +Our code is based on code from: + - [MMPose](https://github.com/open-mmlab/mmpose) + - [PoseAnything](https://github.com/orhir/PoseAnything) + + +## License +This project is released under the Apache 2.0 license. \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..e0827e8b712b895edd76ceb408ce337b060fc56b --- /dev/null +++ b/app.py @@ -0,0 +1,148 @@ +import argparse +from pathlib import Path + +import gradio as gr +import matplotlib + +from gradio_utils.utils import (process_img, get_select_coords, select_skeleton, + reset_skeleton, reset_kp, process) + +LENGTH = 480 # Length of the square area displaying/editing images + +matplotlib.use('agg') +model_dir = Path('./checkpoints') +parser = argparse.ArgumentParser(description='EdgeCape Demo') +parser.add_argument('--checkpoint', + help='checkpoint path', + default='ckpt/1shot_split1.pth') +args = parser.parse_args() +checkpoint_path = args.checkpoint +device = 'cuda' +TIMEOUT = 80 + +with gr.Blocks() as demo: + gr.Markdown(''' + # We introduce EdgeCape, a novel framework that overcomes these limitations by predicting the graph's edge weights which optimizes localization. + To further leverage structural priors, we propose integrating Markovian Structural Bias, which modulates the self-attention interaction between nodes based on the number of hops between them. + We show that this improves the model’s ability to capture global spatial dependencies. + Evaluated on the MP-100 benchmark, which includes 100 categories and over 20K images, + EdgeCape achieves state-of-the-art results in the 1-shot setting and leads among similar-sized methods in the 5-shot setting, significantly improving keypoint localization accuracy. + ### [Paper](https://arxiv.org/pdf/2411.16665) | [Project Page](https://orhir.github.io/edge_cape/) + ## Instructions + 1. Upload an image of the object you want to pose. + 2. Mark keypoints on the image. + 3. Mark limbs on the image. + 4. Upload an image of the object you want to pose to the query image (**bottom**). + 5. Click **Evaluate** to pose the query image. + ''') + + global_state = gr.State({ + "images": {}, + "points": [], + "skeleton": [], + "prev_point": None, + "curr_type_point": "start", + }) + with gr.Row(): + # Upload & Preprocess Image Column + with gr.Column(): + gr.Markdown( + """

+                Upload & Preprocess Image

""" + ) + support_image = gr.Image( + height=LENGTH, + width=LENGTH, + type="pil", + image_mode="RGB", + label="Preprocess Image", + show_label=True, + interactive=True, + ) + + # Click Points Column + with gr.Column(): + gr.Markdown( + """

+                Click Points

""" + ) + kp_support_image = gr.Image( + type="pil", + label="Keypoints Image", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + show_fullscreen_button=False, + ) + with gr.Row(): + confirm_kp_button = gr.Button("Confirm Clicked Points", scale=3) + with gr.Row(): + undo_kp_button = gr.Button("Undo Clicked Points", scale=3) + + # Editing Results Column + with gr.Column(): + gr.Markdown( + """

+                Click Skeleton

""" + ) + skel_support_image = gr.Image( + type="pil", + label="Skeleton Image", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + show_fullscreen_button=False, + ) + with gr.Row(): + pass + with gr.Row(): + undo_skel_button = gr.Button("Undo Skeleton") + + with gr.Row(): + with gr.Column(): + gr.Markdown( + """

+                Query Image

""" + ) + query_image = gr.Image( + type="pil", + image_mode="RGB", + label="Query Image", + show_label=True, + interactive=True, + ) + with gr.Column(): + gr.Markdown( + """

+                Output

""" + ) + output_img = gr.Plot(label="Output Image",) + with gr.Row(): + eval_btn = gr.Button(value="Evaluate") + with gr.Row(): + gr.Markdown("## Examples") + + support_image.change(process_img, + inputs=[support_image, global_state], + outputs=[kp_support_image, global_state]) + kp_support_image.select(get_select_coords, + [global_state], + [global_state, kp_support_image], + queue=False,) + confirm_kp_button.click(reset_skeleton, + inputs=global_state, + outputs=skel_support_image) + undo_kp_button.click(reset_kp, + inputs=global_state, + outputs=[kp_support_image, skel_support_image]) + undo_skel_button.click(reset_skeleton, + inputs=global_state, + outputs=skel_support_image) + skel_support_image.select(select_skeleton, + inputs=[global_state], + outputs=[global_state, skel_support_image]) + eval_btn.click(fn=process, + inputs=[query_image, global_state], + outputs=[output_img, global_state]) + +if __name__ == "__main__": + print("Start app", parser.parse_args()) + gr.close_all() + demo.launch(show_api=False) diff --git a/ckpt/1shot_split1.pth b/ckpt/1shot_split1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d28320cd9a5366a9ed809ec4b267341572f20be3 --- /dev/null +++ b/ckpt/1shot_split1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99691d78a2145d0b4892c8d6cf53505929fedfd569a488f171de37f56971829 +size 480718083 diff --git a/ckpt/testing_log.txt b/ckpt/testing_log.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7489b7377d43f951c66af4754417161a5aa6732 --- /dev/null +++ b/ckpt/testing_log.txt @@ -0,0 +1,11 @@ +** config_file: configs/test/1shot_split1.py checkpoint: ckpt/1shot_split1.pth + AUC: 0.9088144381031391 + EPE: 31.381722289753604 + NME: 0.06428522136244384 + PCK@0.05: 0.6425443164685964 + PCK@0.1: 0.8391598907887715 + PCK@0.15: 0.9041310635767583 + PCK@0.2: 0.9369288485314081 + PCK@0.25: 0.9571147947802892 + mPCK: 0.8559757828291646 +******************************************************************** diff --git a/configs/test/1shot_split1.py b/configs/test/1shot_split1.py new file mode 100644 index 0000000000000000000000000000000000000000..6d77ba0ed8248a4486bfd806edffb82e6b0bbf67 --- /dev/null +++ b/configs/test/1shot_split1.py @@ -0,0 +1,254 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict(), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, 
+ num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), 
+ valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/1shot_split2.py b/configs/test/1shot_split2.py new file mode 100644 index 0000000000000000000000000000000000000000..dc05bdaf0e6fe6c65bbcc0c513100d31a52d82b6 --- /dev/null +++ b/configs/test/1shot_split2.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + 
skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + 
num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/1shot_split3.py b/configs/test/1shot_split3.py new file mode 100644 index 0000000000000000000000000000000000000000..150d2f80af1b2d1997fcc375468f4f46c843a9c6 --- /dev/null +++ b/configs/test/1shot_split3.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + 
type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=[ + 
dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/1shot_split4.py b/configs/test/1shot_split4.py new file mode 100644 index 0000000000000000000000000000000000000000..1ad63c1843f4edb5e62b9e11a2e6a23bf1fca98e --- /dev/null +++ b/configs/test/1shot_split4.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, 
normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + 
dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/1shot_split5.py b/configs/test/1shot_split5.py new file mode 100644 index 0000000000000000000000000000000000000000..d460c1847a9f210fde4c172feb0c08b10e9e10e1 --- /dev/null +++ b/configs/test/1shot_split5.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', 
learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + 
mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/5shot_split1.py b/configs/test/5shot_split1.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b870fa5bc198a0316abec38a63258126d8914f --- /dev/null +++ b/configs/test/5shot_split1.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + 
masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + 
dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/5shot_split2.py b/configs/test/5shot_split2.py new file mode 100644 index 0000000000000000000000000000000000000000..91671ff0ef7c9a0209678e167d1112a7ee9e593a --- /dev/null +++ b/configs/test/5shot_split2.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + 
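Note on the test settings above: these test configs evaluate episodically, and with num_shots=5, num_queries=15 and num_episodes=200 a split appears to be scored on 200 x 15 = 3000 query predictions, reported as PCK at thresholds 0.05-0.25 of the bounding-box size. The helper below is only a generic PCK sketch for reference; the exact normalisation used by TestPoseDataset may differ, and pred, gt, visible and bbox_size are illustrative names, not identifiers from this repository.

import numpy as np

def pck(pred, gt, visible, bbox_size, thr=0.2):
    """Fraction of visible keypoints predicted within thr * bbox_size of the
    ground truth. pred/gt: (K, 2) arrays, visible: (K,) bool, bbox_size: scalar.
    Generic sketch only -- not the repository's evaluation code."""
    dist = np.linalg.norm(pred - gt, axis=-1)
    return float((dist[visible] <= thr * bbox_size).mean())

# Episode budget implied by the settings above:
# 200 episodes x 15 queries = 3000 scored query images per split,
# at thresholds [0.05, 0.1, 0.15, 0.2, 0.25].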
train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + 
type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/5shot_split3.py b/configs/test/5shot_split3.py new file mode 100644 index 0000000000000000000000000000000000000000..07b4fc76c6a3e625ff1176b86abbe43e0feb6ade --- /dev/null +++ b/configs/test/5shot_split3.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + 
post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + 
meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/5shot_split4.py b/configs/test/5shot_split4.py new file mode 100644 index 0000000000000000000000000000000000000000..c43ebcbc1b11bf9ed57c60473dbd25a76c9546b1 --- /dev/null +++ b/configs/test/5shot_split4.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + 
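Note on the encoder/head dimensions used above: they are mutually consistent under the usual Swin channel-doubling rule. The snippet below is only a sanity check of that arithmetic, assuming standard SwinTransformerV2 behaviour (channels double at each of the three downsampling stages, overall stride 32); it is not code from this repository.

# Assumed standard Swin behaviour; illustrative check only.
embed_dim, depths = 96, [2, 2, 6, 2]
img_size, stride = 256, 32                      # stride 32 after four stages
out_channels = embed_dim * 2 ** (len(depths) - 1)
feat_hw = img_size // stride
print(out_channels)  # 768 -> matches keypoint_head.in_channels
print(feat_hw)       # 8   -> 8x8 feature tokens fed to the transformer
# SinePositionalEncoding with num_feats=128 yields 2 * 128 = 256 dims = d_model.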
freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 
+ 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/5shot_split5.py b/configs/test/5shot_split5.py new file mode 100644 index 0000000000000000000000000000000000000000..ffefae18e2d1a36bed143e3aba42051c94b44b60 --- /dev/null +++ b/configs/test/5shot_split5.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + 
heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 
'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/train/1shot_split1.py b/configs/train/1shot_split1.py new file mode 100644 index 0000000000000000000000000000000000000000..c110649abffc39685e4b97d6868f4bd50eebfef3 --- /dev/null +++ b/configs/train/1shot_split1.py @@ -0,0 +1,180 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + encoder_config=dict(), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict(type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor') + ), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + 
type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/1shot_split2.py b/configs/train/1shot_split2.py new file mode 100644 index 0000000000000000000000000000000000000000..255d380e3de2a42bd55265e3d04aa114dab24e20 --- /dev/null +++ b/configs/train/1shot_split2.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + 
type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + 
dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/1shot_split3.py b/configs/train/1shot_split3.py new file mode 100644 index 0000000000000000000000000000000000000000..dfbe95f01fc1e2c0418e8e355a9ee05b804d25d6 --- /dev/null +++ b/configs/train/1shot_split3.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 
'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/1shot_split4.py b/configs/train/1shot_split4.py new file mode 100644 index 0000000000000000000000000000000000000000..385f13d689d1277d2ead696cd197cfd1c80e720b --- /dev/null +++ b/configs/train/1shot_split4.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + 
shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/1shot_split5.py b/configs/train/1shot_split5.py new file mode 100644 index 0000000000000000000000000000000000000000..bc76e509ac3c26bd40eaf702a8e4d902de01a7b3 --- /dev/null +++ b/configs/train/1shot_split5.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) 
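# Illustrative sketch (not part of the config): the step milestones above
# (160, 180) lie beyond total_epochs = 100, so under the usual MMCV step
# policy (assumed StepLrUpdaterHook semantics, with gamma defaulting to 0.1)
# the learning rate never decays after the linear warmup and stays at the
# base value of 1e-5 for the whole run.
def _sketch_step_lr(epoch, iters_done, base_lr=1e-5, milestones=(160, 180),
                    gamma=0.1, warmup_iters=1000, warmup_ratio=0.001):
    # Linear warmup over the first `warmup_iters` iterations.
    if iters_done < warmup_iters:
        k = (1 - iters_done / warmup_iters) * (1 - warmup_ratio)
        return base_lr * (1 - k)
    # Step decay: multiply by gamma once per milestone already passed.
    passed = sum(epoch >= m for m in milestones)
    return base_lr * gamma ** passed
# _sketch_step_lr(epoch=99, iters_done=10_000) -> 1e-05 (no decay within 100 epochs)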
+total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split5_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split5_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + 
ann_file=f'{data_root}/annotations/mp100_split5_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/5shot_split1.py b/configs/train/5shot_split1.py new file mode 100644 index 0000000000000000000000000000000000000000..6ffa78c699c5294b5e8c556f7d3ab09796bfbca5 --- /dev/null +++ b/configs/train/5shot_split1.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + 
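# Illustrative sketch (not part of the config): TopDownGenerateTargetFewShot
# above renders each keypoint as a Gaussian heatmap at heatmap_size (64x64
# here) with the configured sigma. The helper below is only an assumed,
# minimal MSRA-style version of that target for a single keypoint.
def _sketch_gaussian_target(kpt_xy, image_size=(256, 256),
                            heatmap_size=(64, 64), sigma=1.0):
    import numpy as np
    w, h = heatmap_size
    stride = image_size[0] / w                 # 4 for 256 -> 64
    mu_x, mu_y = kpt_xy[0] / stride, kpt_xy[1] / stride
    xs = np.arange(w)[None, :]                 # (1, W)
    ys = np.arange(h)[:, None]                 # (H, 1)
    # Peak of ~1.0 at the keypoint, falling off with the configured sigma.
    return np.exp(-((xs - mu_x) ** 2 + (ys - mu_y) ** 2) / (2 * sigma ** 2))
# _sketch_gaussian_target((128.0, 64.0)).shape -> (64, 64)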
+valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/5shot_split2.py b/configs/train/5shot_split2.py new file mode 100644 index 0000000000000000000000000000000000000000..e4801b3e4330d8cb66badddd6c302a440cb152ae --- /dev/null +++ b/configs/train/5shot_split2.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + 
share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/5shot_split3.py b/configs/train/5shot_split3.py new file mode 100644 index 0000000000000000000000000000000000000000..31c8032d77b8c9b48e871521504bbfecd8535dc5 --- /dev/null +++ b/configs/train/5shot_split3.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] 
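# Illustrative sketch (not part of the config): the evaluation settings below
# report PCK (alongside NME, AUC and EPE), and the test sets use the PCK
# thresholds 0.05-0.25 listed in pck_threshold_list. Assumed semantics: a
# keypoint counts as correct when its error is below threshold * bbox size.
def _sketch_pck(pred, gt, bbox_size, thr=0.2, visible=None):
    import numpy as np
    dist = np.linalg.norm(np.asarray(pred, float) - np.asarray(gt, float), axis=-1)
    if visible is None:
        visible = np.ones(len(dist), dtype=bool)
    return float((dist[visible] < thr * bbox_size).mean())
# _sketch_pck([[10, 10], [50, 40]], [[12, 11], [65, 40]], bbox_size=100) -> 1.0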
+checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + pipeline=train_pipeline), + val=dict( + 
type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/5shot_split4.py b/configs/train/5shot_split4.py new file mode 100644 index 0000000000000000000000000000000000000000..fa9edfadfad061d840268791555657b64e351d8b --- /dev/null +++ b/configs/train/5shot_split4.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + 
type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/5shot_split5.py b/configs/train/5shot_split5.py new file mode 100644 index 0000000000000000000000000000000000000000..9956481e5db3b7cce9447e5593ad5720dc29e0f4 --- /dev/null +++ b/configs/train/5shot_split5.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + 
keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split5_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split5_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split5_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff 
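The eight training configs in configs/train/ are identical apart from the MP-100 split index (1-5) and the number of support shots (1 or 5); validation and testing sample num_episodes episodes of num_shots support images plus num_queries query images per category. A small helper along these lines (illustrative only, but matching the file layout above and the train.py entry point used by run.py in this repo) picks the right config and launches training for one setting:

import subprocess

def train_config_path(num_shots: int, split: int) -> str:
    # Mirrors the naming convention of the config files above.
    assert num_shots in (1, 5) and 1 <= split <= 5
    return f'configs/train/{num_shots}shot_split{split}.py'

def launch_training(num_shots: int, split: int, work_root: str = 'work_dirs'):
    cfg = train_config_path(num_shots, split)
    work_dir = f'{work_root}/{num_shots}shot_split{split}'
    # train.py is the training entry point used elsewhere in this repo (see run.py).
    subprocess.run(['python', 'train.py', '--config', cfg,
                    '--work-dir', work_dir], check=True)

# Example: launch_training(5, 3) trains configs/train/5shot_split3.py.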
--git a/demo.py b/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..1a58cfbef254808f6a166bcf338beed52dd29d0a --- /dev/null +++ b/demo.py @@ -0,0 +1,273 @@ +import argparse +import copy +import pickle +import random +import cv2 +import numpy as np +import torch +from mmcv import Config, DictAction +from mmcv.cnn import fuse_conv_bn +from mmcv.runner import load_checkpoint +from mmpose.core import wrap_fp16_model +from mmpose.models import build_posenet +from torchvision import transforms +from EdgeCape import * # noqa +import torchvision.transforms.functional as F +from EdgeCape.models.utils.visualization import old_plot_results, plot_results + +COLORS = [ + [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], + [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255], + [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255], + [255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]] + + +class Resize_Pad: + def __init__(self, w=256, h=256): + self.w = w + self.h = h + + def __call__(self, image): + _, w_1, h_1 = image.shape + ratio_1 = w_1 / h_1 + # check if the original and final aspect ratios are the same within a margin + if round(ratio_1, 2) != 1: + # padding to preserve aspect ratio + if ratio_1 > 1: # Make the image higher + hp = int(w_1 - h_1) + hp = hp // 2 + image = F.pad(image, (hp, 0, hp, 0), 0, "constant") + return F.resize(image, [self.h, self.w]) + else: + wp = int(h_1 - w_1) + wp = wp // 2 + image = F.pad(image, (0, wp, 0, wp), 0, "constant") + return F.resize(image, [self.h, self.w]) + else: + return F.resize(image, [self.h, self.w]) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Pose Anything Demo') + parser.add_argument('--support', help='Image file') + parser.add_argument('--query', help='Image file') + parser.add_argument('--config', default=None, help='test config file path') + parser.add_argument('--checkpoint', default=None, help='checkpoint file') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + return args + + +def merge_configs(cfg1, cfg2): + # Merge cfg2 into cfg1 + # Overwrite cfg1 if repeated, ignore if value is None. 
+ cfg1 = {} if cfg1 is None else cfg1.copy() + cfg2 = {} if cfg2 is None else cfg2 + for k, v in cfg2.items(): + if v: + cfg1[k] = v + return cfg1 + + +def main(): + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + + args = parse_args() + cfg = Config.fromfile(args.config) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.data.test.test_mode = True + + # Load data + support_img = cv2.imread(args.support) + query_img = cv2.imread(args.query) + if support_img is None or query_img is None: + raise ValueError('Fail to read images') + + preprocess = transforms.Compose([ + transforms.ToTensor(), + Resize_Pad(cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size)]) + + # frame = copy.deepcopy(support_img) + padded_support_img = preprocess(support_img).cpu().numpy().transpose(1, 2, 0) * 255 + frame = copy.deepcopy(padded_support_img.astype(np.uint8).copy()) + kp_src = [] + skeleton = [] + count = 0 + prev_pt = None + prev_pt_idx = None + color_idx = 0 + + def selectKP(event, x, y, flags, param): + nonlocal kp_src, frame + # if we are in points selection mode, the mouse was clicked, + # list of points with the (x, y) location of the click + # and draw the circle + + if event == cv2.EVENT_LBUTTONDOWN: + kp_src.append((x, y)) + cv2.circle(frame, (x, y), 2, (0, 0, 255), 1) + cv2.imshow("Source", frame) + + if event == cv2.EVENT_RBUTTONDOWN: + kp_src = [] + frame = copy.deepcopy(support_img) + cv2.imshow("Source", frame) + + def draw_line(event, x, y, flags, param): + nonlocal skeleton, kp_src, frame, count, prev_pt, prev_pt_idx, marked_frame, color_idx + if event == cv2.EVENT_LBUTTONDOWN: + closest_point = min(kp_src, key=lambda p: (p[0] - x) ** 2 + (p[1] - y) ** 2) + closest_point_index = kp_src.index(closest_point) + if color_idx < len(COLORS): + c = COLORS[color_idx] + else: + c = random.choices(range(256), k=3) + color = color_idx + cv2.circle(frame, closest_point, 2, c, 1) + if count == 0: + prev_pt = closest_point + prev_pt_idx = closest_point_index + count = count + 1 + cv2.imshow("Source", frame) + else: + cv2.line(frame, prev_pt, closest_point, c, 2) + cv2.imshow("Source", frame) + count = 0 + skeleton.append((prev_pt_idx, closest_point_index)) + color_idx = color_idx + 1 + elif event == cv2.EVENT_RBUTTONDOWN: + frame = copy.deepcopy(marked_frame) + cv2.imshow("Source", frame) + count = 0 + color_idx = 0 + skeleton = [] + prev_pt = None + + + cv2.namedWindow("Source", cv2.WINDOW_NORMAL) + cv2.resizeWindow('Source', 800, 600) + cv2.setMouseCallback("Source", selectKP) + cv2.imshow("Source", frame) + + # keep looping until points have been selected + while len(kp_src) < 1: + print('Press any key when finished marking the points!! ') + cv2.waitKey(0) + + marked_frame = copy.deepcopy(frame) + cv2.setMouseCallback("Source", draw_line) + print('Press any key when finished creating skeleton!! 
') + while True: + if cv2.waitKey(1) > 0: + break + + kp_src = torch.tensor(kp_src).float() + + preprocess = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + Resize_Pad(cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size)]) + + + support_img = preprocess(support_img).flip(0)[None] + query_img = preprocess(query_img).flip(0)[None] + # Create heatmap from keypoints + genHeatMap = TopDownGenerateTargetFewShot() + data_cfg = cfg.data_cfg + data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size]) + data_cfg['joint_weights'] = None + data_cfg['use_different_joint_weights'] = False + kp_src_3d = torch.concatenate((kp_src, torch.zeros(kp_src.shape[0], 1)), dim=-1) + kp_src_3d_weight = torch.concatenate((torch.ones_like(kp_src), torch.zeros(kp_src.shape[0], 1)), dim=-1) + target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg, kp_src_3d, kp_src_3d_weight, sigma=2) + target_s = torch.tensor(target_s).float()[None] + target_weight_s = torch.tensor(target_weight_s).float()[None] + + original_support_img = support_img.clone() + + data = { + 'img_s': [support_img.cuda()], + 'img_q': query_img.cuda(), + 'target_s': [target_s.cuda()], + 'target_weight_s': [target_weight_s.cuda()], + 'target_q': None, + 'target_weight_q': None, + 'return_loss': False, + 'img_metas': [{'sample_skeleton': [skeleton], + 'query_skeleton': skeleton, + 'sample_joints_3d': [kp_src_3d.cuda()], + 'query_joints_3d': kp_src_3d.cuda(), + 'sample_center': [kp_src.mean(dim=0)], + 'query_center': kp_src.mean(dim=0), + 'sample_scale': [kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0]], + 'query_scale': kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0], + 'sample_rotation': [0], + 'query_rotation': 0, + 'sample_bbox_score': [1], + 'query_bbox_score': 1, + 'query_image_file': '', + 'sample_image_file': [''], + }] + } + + # Load model + model = build_posenet(cfg.model).cuda() + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + load_checkpoint(model, args.checkpoint, map_location='cpu') + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + model.eval() + + with torch.no_grad(): + outputs = model(**data) + + # visualize results + vis_s_weight = target_weight_s[0] + vis_q_weight = target_weight_s[0] + vis_s_image = support_img[0].detach().cpu().numpy().transpose(1, 2, 0) + vis_q_image = query_img[0].detach().cpu().numpy().transpose(1, 2, 0) + support_kp = kp_src_3d + _, original_skeleton = model.keypoint_head.skeleton_head.adj_mx_from_edges(num_pts=outputs['points'].shape[2], + skeleton=[skeleton], + mask=target_weight_s.squeeze(-1).bool(), + device=target_weight_s.device) + skeleton = outputs['skeleton'] + plot_results(vis_s_image, + vis_q_image, + support_kp, + vis_s_weight, + None, + vis_s_weight, + skeleton, + None, + torch.tensor(outputs['points']).squeeze(), + out_dir='demo', + original_skeleton=original_skeleton[0].cpu().numpy(), + img_alpha=1.0, + radius=3, + ) + + +if __name__ == '__main__': + main() diff --git a/gradio_utils/__pycache__/utils.cpython-39.pyc b/gradio_utils/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32c19762d57d865f88668d45c6a38e45b75db866 Binary files /dev/null and b/gradio_utils/__pycache__/utils.cpython-39.pyc differ diff --git a/gradio_utils/utils.py b/gradio_utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2c906fb26ed3041907d49aa6d4e034d447760e67 
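demo.py's Resize_Pad transform first zero-pads the shorter side so the tensor becomes square (preserving aspect ratio) and only then resizes to the encoder's input resolution, so support and query images are never stretched. A short usage sketch, assuming the repo root is on PYTHONPATH so that demo.py and its dependencies import cleanly:

import numpy as np
from torchvision import transforms
from demo import Resize_Pad  # defined in demo.py above

preprocess = transforms.Compose([
    transforms.ToTensor(),   # HWC uint8 -> CHW float in [0, 1]
    Resize_Pad(256, 256),    # pad to square, then resize to 256x256
])

# A dummy 3-channel image with a 1:2 aspect ratio.
dummy = (np.random.rand(100, 200, 3) * 255).astype(np.uint8)
out = preprocess(dummy)
print(out.shape)  # torch.Size([3, 256, 256])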
--- /dev/null +++ b/gradio_utils/utils.py @@ -0,0 +1,407 @@ +import random +import collections +import gradio as gr +import numpy as np +import psutil +import torch +from PIL import ImageDraw, Image, ImageEnhance +from matplotlib import pyplot as plt +from mmcv import Config +from mmcv.runner import load_checkpoint +from mmpose.core import wrap_fp16_model +from mmpose.models import build_posenet +from torchvision import transforms +import matplotlib.patheffects as mpe +from demo import Resize_Pad +from EdgeCape.models import * + + +def process_img(support_image, global_state): + global_state['images']['image_orig'] = support_image + global_state['images']['image_kp'] = support_image + reset_kp(global_state) + return support_image, global_state + + +def adj_mx_from_edges(num_pts, skeleton, device='cuda', normalization_fix=True): + adj_mx = torch.empty(0, device=device) + batch_size = len(skeleton) + for b in range(batch_size): + edges = torch.tensor(skeleton[b]) + adj = torch.zeros(num_pts, num_pts, device=device) + adj[edges[:, 0], edges[:, 1]] = 1 + adj_mx = torch.concatenate((adj_mx, adj.unsqueeze(0)), dim=0) + trans_adj_mx = torch.transpose(adj_mx, 1, 2) + cond = (trans_adj_mx > adj_mx).float() + adj = adj_mx + trans_adj_mx * cond - adj_mx * cond + return adj + + +def plot_results(support_img, query_img, support_kp, support_w, query_kp, query_w, + skeleton=None, prediction=None, radius=6, in_color=None, + original_skeleton=None, img_alpha=0.6, target_keypoints=None): + h, w, c = support_img.shape + prediction = prediction[-1] * h + if isinstance(prediction, torch.Tensor): + prediction = prediction.cpu().numpy() + if isinstance(skeleton, list): + skeleton = adj_mx_from_edges(num_pts=100, skeleton=[skeleton]).cpu().numpy()[0] + original_skeleton = skeleton + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img)) + error_mask = None + for id, (img, w, keypoint, adj) in enumerate(zip([support_img, support_img, query_img], + [support_w, support_w, query_w], + # [support_kp, query_kp])): + [support_kp, support_kp, prediction], + [original_skeleton, skeleton, skeleton])): + color = in_color + f, axes = plt.subplots() + plt.imshow(img, alpha=img_alpha) + + # On qeury image plot + if id == 2 and target_keypoints is not None: + error = np.linalg.norm(keypoint - target_keypoints, axis=-1) + error_mask = error > (256 * 0.05) + + for k in range(keypoint.shape[0]): + if w[k] > 0: + kp = keypoint[k, :2] + c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6) + if error_mask is not None and error_mask[k]: + c = (1, 1, 0, 0.75) + patch = plt.Circle(kp, + radius, + color=c, + path_effects=[mpe.withStroke(linewidth=8, foreground='black'), + mpe.withStroke(linewidth=4, foreground='white'), + mpe.withStroke(linewidth=2, foreground='black'), + ], + zorder=260) + axes.add_patch(patch) + axes.text(kp[0], kp[1], k, fontsize=10, color='black', ha="center", va="center", zorder=320, ) + else: + patch = plt.Circle(kp, + radius, + color=c, + path_effects=[mpe.withStroke(linewidth=2, foreground='black')], + zorder=200) + axes.add_patch(patch) + axes.text(kp[0], kp[1], k, fontsize=(radius + 4), color='white', ha="center", va="center", + zorder=300, + path_effects=[ + mpe.withStroke(linewidth=max(1, int((radius + 4) / 5)), foreground='black')]) + # axes.text(kp[0], kp[1], k) + plt.draw() + + if adj is not None: + # Make max value 6 + draw_skeleton = adj ** 1 + max_skel_val = 
np.max(draw_skeleton) + draw_skeleton = draw_skeleton / max_skel_val * 6 + for i in range(1, keypoint.shape[0]): + for j in range(0, i): + if w[i] > 0 and w[j] > 0 and original_skeleton[i][j] > 0: + if color is None: + num_colors = int((skeleton > 0.05).sum() / 2) + color = iter(plt.cm.rainbow(np.linspace(0, 1, num_colors + 1))) + c = next(color) + elif isinstance(color, str): + c = color + elif isinstance(color, collections.Iterable): + c = next(color) + else: + raise ValueError("Color must be a string or an iterable") + if w[i] > 0 and w[j] > 0 and skeleton[i][j] > 0: + width = draw_skeleton[i][j] + stroke_width = width + (width / 3) + patch = plt.Line2D([keypoint[i, 0], keypoint[j, 0]], + [keypoint[i, 1], keypoint[j, 1]], + linewidth=width, color=c, alpha=0.6, + path_effects=[mpe.withStroke(linewidth=stroke_width, foreground='black')], + zorder=1) + axes.add_artist(patch) + + plt.axis('off') # command for hiding the axis. + plt.subplots_adjust(0, 0, 1, 1, 0, 0) + return plt + + +def process(query_img, state, + cfg_path='configs/test/1shot_split1.py', + checkpoint_path='ckpt/1shot_split1.pth'): + cfg = Config.fromfile(cfg_path) + width, height, _ = state['original_support_image'].shape + kp_src_np = np.array(state['kp_src']).copy().astype(np.float32) + kp_src_np[:, 0] = kp_src_np[:, 0] / (width // 4) * cfg.model.encoder_config.img_size + kp_src_np[:, 1] = kp_src_np[:, 1] / (height // 4) * cfg.model.encoder_config.img_size + kp_src_np = np.flip(kp_src_np, 1).copy() + kp_src_tensor = torch.tensor(kp_src_np).float() + preprocess = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + Resize_Pad(cfg.model.encoder_config.img_size, + cfg.model.encoder_config.img_size)]) + + if len(state['skeleton']) == 0: + state['skeleton'] = [(0, 0)] + + support_img = preprocess(state['images']['image_orig']).flip(0)[None] + np_query = np.array(query_img)[:, :, ::-1].copy() + q_img = preprocess(np_query).flip(0)[None] + # Create heatmap from keypoints + genHeatMap = TopDownGenerateTargetFewShot() + data_cfg = cfg.data_cfg + data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size, + cfg.model.encoder_config.img_size]) + data_cfg['joint_weights'] = None + data_cfg['use_different_joint_weights'] = False + kp_src_3d = torch.cat( + (kp_src_tensor, torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1) + kp_src_3d_weight = torch.cat( + (torch.ones_like(kp_src_tensor), + torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1) + target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg, + kp_src_3d, + kp_src_3d_weight, + sigma=1) + target_s = torch.tensor(target_s).float()[None] + target_weight_s = torch.ones_like( + torch.tensor(target_weight_s).float()[None]) + + data = { + 'img_s': [support_img], + 'img_q': q_img, + 'target_s': [target_s], + 'target_weight_s': [target_weight_s], + 'target_q': None, + 'target_weight_q': None, + 'return_loss': False, + 'img_metas': [{'sample_skeleton': [state['skeleton']], + 'query_skeleton': state['skeleton'], + 'sample_joints_3d': [kp_src_3d], + 'query_joints_3d': kp_src_3d, + 'sample_center': [kp_src_tensor.mean(dim=0)], + 'query_center': kp_src_tensor.mean(dim=0), + 'sample_scale': [ + kp_src_tensor.max(dim=0)[0] - + kp_src_tensor.min(dim=0)[0]], + 'query_scale': kp_src_tensor.max(dim=0)[0] - + kp_src_tensor.min(dim=0)[0], + 'sample_rotation': [0], + 'query_rotation': 0, + 'sample_bbox_score': [1], + 'query_bbox_score': 1, + 'query_image_file': '', + 'sample_image_file': [''], + }] + } + # Load model + 
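# Model loading (descriptive notes, assuming standard mmpose/mmcv semantics):
# - build_posenet instantiates the EdgeCape architecture from cfg.model via
#   the mmpose model registry; weights are still random at this point.
# - wrap_fp16_model is only applied when the config defines an `fp16` section,
#   patching the model for mixed-precision inference.
# - load_checkpoint then loads the trained weights (map_location='cpu' keeps
#   the tensors on CPU until the model is moved), and eval() switches off
#   dropout and batch-norm updates before the forward pass.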
model = build_posenet(cfg.model) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + load_checkpoint(model, checkpoint_path, map_location='cpu') + model.eval() + with torch.no_grad(): + outputs = model(**data) + # visualize results + vis_s_weight = target_weight_s[0] + vis_s_image = support_img[0].detach().cpu().numpy().transpose(1, 2, 0) + vis_q_image = q_img[0].detach().cpu().numpy().transpose(1, 2, 0) + support_kp = kp_src_3d + out = plot_results(vis_s_image, + vis_q_image, + support_kp, + vis_s_weight, + None, + vis_s_weight, + outputs['skeleton'], + torch.tensor(outputs['points']).squeeze(), + original_skeleton=state['skeleton'], + img_alpha=1.0, + ) + return out, state + + +def update_examples(support_img, posed_support, query_img, state, r=0.015, width=0.02): + state['color_idx'] = 0 + state['original_support_image'] = np.array(support_img)[:, :, ::-1].copy() + support_img, posed_support, _ = set_query(support_img, state, example=True) + w, h = support_img.size + draw_pose = ImageDraw.Draw(support_img) + draw_limb = ImageDraw.Draw(posed_support) + r = int(r * w) + width = int(width * w) + for pixel in state['kp_src']: + leftUpPoint = (pixel[1] - r, pixel[0] - r) + rightDownPoint = (pixel[1] + r, pixel[0] + r) + twoPointList = [leftUpPoint, rightDownPoint] + draw_pose.ellipse(twoPointList, fill=(255, 0, 0, 255)) + draw_limb.ellipse(twoPointList, fill=(255, 0, 0, 255)) + for limb in state['skeleton']: + point_a = state['kp_src'][limb[0]][::-1] + point_b = state['kp_src'][limb[1]][::-1] + if state['color_idx'] < len(COLORS): + c = COLORS[state['color_idx']] + state['color_idx'] += 1 + else: + c = random.choices(range(256), k=3) + draw_limb.line([point_a, point_b], fill=tuple(c), width=width) + return support_img, posed_support, query_img, state + + +def get_select_coords(global_state, + evt: gr.SelectData + ): + """This function only support click for point selection + """ + xy = evt.index + global_state["points"].append(xy) + # point_idx = get_latest_points_pair(points) + # if point_idx is None: + # points[0] = {'start': xy, 'target': None} + # print(f'Click Image - Start - {xy}') + # elif points[point_idx].get('target', None) is None: + # points[point_idx]['target'] = xy + # print(f'Click Image - Target - {xy}') + # else: + # points[point_idx + 1] = {'start': xy, 'target': None} + # print(f'Click Image - Start - {xy}') + + image_raw = global_state['images']['image_kp'] + image_draw = update_image_draw( + image_raw, + xy, + global_state + ) + global_state['images']['image_kp'] = image_draw + return global_state, image_draw + +def get_closest_point_idx(pts_list, xy): + x, y = xy + closest_point = min(pts_list, key=lambda p: (p[0] - x) ** 2 + (p[1] - y) ** 2) + closest_point_index = pts_list.index(closest_point) + return closest_point_index + + +def reset_skeleton(global_state): + image = global_state["images"]["image_kp"] + global_state["images"]["image_skel"] = image + global_state["skeleton"] = [] + global_state["curr_type_point"] = "start" + global_state["prev_point"] = None + return image + + +def reset_kp(global_state): + image = global_state["images"]["image_orig"] + global_state["images"]["image_kp"] = image + global_state["images"]["image_skel"] = image + global_state["skeleton"] = [] + global_state["points"] = [] + global_state["curr_type_point"] = "start" + global_state["prev_point"] = None + return image, image + + +def select_skeleton(global_state, + evt: gr.SelectData, + ): + xy = evt.index + pts_list = global_state["points"] + 
closest_point_idx = get_closest_point_idx(pts_list, xy) + image_raw = global_state['images']['image_skel'] + if global_state["curr_type_point"] == "end": + prev_point_idx = global_state["prev_point_idx"] + prev_point = pts_list[prev_point_idx] + points = [prev_point, xy] + image_draw = draw_limbs_on_image(image_raw, + points + ) + global_state['images']['image_skel'] = image_draw + global_state['skeleton'].append([prev_point_idx, closest_point_idx]) + global_state["curr_type_point"] = "start" + global_state["prev_point_idx"] = None + else: + global_state["prev_point_idx"] = closest_point_idx + global_state["curr_type_point"] = "end" + return global_state, global_state['images']['image_skel'] + + +def reverse_point_pairs(points): + new_points = [] + for p in points: + new_points.append([p[1], p[0]]) + return new_points + + +def update_image_draw(image, points, global_state): + if len(global_state["points"]) < 2: + alpha = 0.5 + else: + alpha = 1.0 + image_draw = draw_points_on_image(image, points, alpha=alpha) + return image_draw + + +def print_memory_usage(): + # Print system memory usage + print(f"System memory usage: {psutil.virtual_memory().percent}%") + + # Print GPU memory usage + if torch.cuda.is_available(): + device = torch.device("cuda") + print(f"GPU memory usage: {torch.cuda.memory_allocated() / 1e9} GB") + print( + f"Max GPU memory usage: {torch.cuda.max_memory_allocated() / 1e9} GB") + device_properties = torch.cuda.get_device_properties(device) + available_memory = device_properties.total_memory - \ + torch.cuda.max_memory_allocated() + print(f"Available GPU memory: {available_memory / 1e9} GB") + else: + print("No GPU available") + +def draw_limbs_on_image(image, + points,): + color = tuple(random.choices(range(256), k=3)) + overlay_rgba = Image.new("RGBA", image.size, 0) + overlay_draw = ImageDraw.Draw(overlay_rgba) + p_start, p_target = points + if p_start is not None and p_target is not None: + p_draw = int(p_start[0]), int(p_start[1]) + t_draw = int(p_target[0]), int(p_target[1]) + overlay_draw.line( + (p_draw[0], p_draw[1], t_draw[0], t_draw[1]), + fill=color, + width=10, + ) + + return Image.alpha_composite(image.convert("RGBA"), + overlay_rgba).convert("RGB") + + +def draw_points_on_image(image, + points, + radius_scale=0.01, + alpha=1.): + if alpha < 1: + enhancer = ImageEnhance.Brightness(image) + image = enhancer.enhance(1.1) + overlay_rgba = Image.new("RGBA", image.size, 0) + overlay_draw = ImageDraw.Draw(overlay_rgba) + p_color = (255, 0, 0) + rad_draw = int(image.size[0] * radius_scale) + if points is not None: + p_draw = int(points[0]), int(points[1]) + overlay_draw.ellipse( + ( + p_draw[0] - rad_draw, + p_draw[1] - rad_draw, + p_draw[0] + rad_draw, + p_draw[1] + rad_draw, + ), + fill=p_color, + ) + + return Image.alpha_composite(image.convert("RGBA"), overlay_rgba).convert("RGB") diff --git a/rename_ckpt.py b/rename_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..68ee79b5fc5e55e9e4d02d8b4c3fba69da9725cf --- /dev/null +++ b/rename_ckpt.py @@ -0,0 +1,29 @@ +import os +import sys + +import torch + +def load_state_dicts(folder_path): + state_dicts = {} + for filename in os.listdir(folder_path): + if filename.endswith(".pth"): + print('Processing {}'.format(filename)) + file_path = os.path.join(folder_path, filename) + state_dict = torch.load(file_path) + new_state_dict = {"state_dict": {}, + "optimizer": state_dict['optimizer'], + "meta": state_dict['meta'], + } + for key in state_dict['state_dict'].keys(): + if 'spatial_pos_encoder' 
in key or 'skeleton_head.MLP' in key or 'skeleton_head.adj_output_mlp' in key: + continue + new_key = key.replace("keypoint_head.", "keypoint_head_module.").replace('bias_function_prior_weight', 'markov_structural_mlp') + new_state_dict['state_dict'][new_key] = state_dict['state_dict'][key] + new_file_path = os.path.join(folder_path, f'{filename}') + print(f'Saving to {new_file_path}') + torch.save(new_state_dict, new_file_path) + return state_dicts + +if __name__ == "__main__": + folder_path = sys.argv[1] + load_state_dicts(folder_path) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1879427fbda711317e9a10993320639d3104d72f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +json_tricks +numpy +opencv-python +pillow +xtcocotools +scipy diff --git a/run.py b/run.py new file mode 100644 index 0000000000000000000000000000000000000000..63011c2542113137879fcb39bab45b6d04be0ed3 --- /dev/null +++ b/run.py @@ -0,0 +1,108 @@ +import re +import subprocess +import os +import argparse +from mmcv import Config, DictAction + + +def init_parser(): + # Get config and work_dir from user + parser = argparse.ArgumentParser(description='Run the pipeline') + parser.add_argument('--config', help='config file', required=True) + parser.add_argument('--work_dir', help='work directory', required=True) + parser.add_argument('--best', action='store_true', help='work directory') + parser.add_argument('--supervision', type=str, default='decoder', help='adj supervision') + parser.add_argument('--ft_epochs', type=int, default=100, help='work directory') + parser.add_argument('--masking_ratio', type=float, default=0.5, help='work directory') + parser.add_argument('--lamda_masking', type=float, default=1.0, help='work directory') + args = parser.parse_args() + return args + + +def get_best_model(work_dir): + if os.path.exists(work_dir): + file_names = [filename for filename in os.listdir(work_dir) if filename.startswith("best_")] + if len(file_names) > 0: + file_name = file_names[0] + ckpt_path = f'{work_dir}/{file_name}' + else: + ckpt_path = f'{work_dir}/latest.pth' + return ckpt_path + + +def main(): + args = init_parser() + config = args.config + work_dir = args.work_dir + if args.best: + work_dir = f'{work_dir}_best_ckpt' + + if not os.path.exists(work_dir): + os.makedirs(work_dir) + subprocess.run(['cp', config, work_dir]) + + # -----------------------------BASE MODEL TRAINING-------------------------------- + base_workdir = f'{work_dir}/base' + cfg = Config.fromfile(args.config) + num_epochs = cfg.total_epochs + final_epoch_path = f'{base_workdir}/epoch_{num_epochs}.pth' + if not os.path.exists(final_epoch_path): + + print("Running Base Model Training") + subprocess.run(['python', 'train.py', '--config', config, '--work-dir', base_workdir]) + + # -----------------------------SKELETON MODEL TRAINING-------------------------------- + skeleton_work_dir = f'{work_dir}/base_skeleton' + skeleton_final_epoch_path = f'{skeleton_work_dir}/epoch_{args.ft_epochs}.pth' + + if args.best: + best_ckpt = get_best_model(base_workdir) + load_from = best_ckpt + else: + load_from = final_epoch_path + + new_cfg = Config.fromfile(args.config) + new_cfg.load_from = load_from + new_cfg.total_epochs = args.ft_epochs + new_cfg.model.freeze_backbone = True + new_cfg.model.keypoint_head.skeleton_head['learn_skeleton'] = True + new_cfg.model.keypoint_head.learn_skeleton = True + new_cfg.model.keypoint_head.masked_supervision = True + 
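# Skeleton-stage settings (descriptive notes): this second stage reloads the
# base checkpoint, freezes the backbone, and fine-tunes the skeleton predictor
# with masked supervision; the masking ratio and the weight of the skeleton
# (masking) loss are taken from --masking_ratio and --lamda_masking below.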
+    new_cfg.model.keypoint_head.masking_ratio = args.masking_ratio
+    new_cfg.model.keypoint_head.skeleton_loss_weight = args.lamda_masking
+    Config.dump(new_cfg, f'{work_dir}/skeleton_config.py')
+
+    if not os.path.exists(skeleton_final_epoch_path):
+        print("Running Skeleton Model Training")
+        subprocess.run(
+            ['python', 'train.py', '--config', f'{work_dir}/skeleton_config.py', '--work-dir', skeleton_work_dir])
+
+    # -----------------------------BIAS MODEL TRAINING--------------------------------
+    bias_work_dir = f'{work_dir}/base_skeleton_bias'
+    bias_final_epoch_path = f'{bias_work_dir}/epoch_{args.ft_epochs}.pth'
+    if args.best:
+        best_ckpt = get_best_model(skeleton_work_dir)
+        load_from = best_ckpt
+    else:
+        load_from = skeleton_final_epoch_path
+
+    new_cfg.load_from = load_from
+    new_cfg.model.keypoint_head.transformer.use_bias_attn_module = True
+    new_cfg.model.keypoint_head.transformer.attn_bias = True
+    new_cfg.model.keypoint_head.transformer.max_hops = 4
+    new_cfg.model.keypoint_head.model_freeze = 'skeleton'
+    Config.dump(new_cfg, f'{work_dir}/bias_config.py')
+
+    if not os.path.exists(bias_final_epoch_path):
+        print("Running Bias Model Training")
+        subprocess.run(
+            ['python', 'train.py', '--config', f'{work_dir}/bias_config.py', '--work-dir', bias_work_dir])
+
+    # -----------------------------EVALUATION--------------------------------
+    # Evaluate both the last and the best checkpoint of the final (bias) stage.
+    best_ckpt = get_best_model(bias_work_dir)
+    subprocess.run(['python', 'test.py', f'{work_dir}/bias_config.py', f'{bias_work_dir}/latest.pth'])
+    subprocess.run(['python', 'test.py', f'{work_dir}/bias_config.py', best_ckpt])
+
+
+if __name__ == '__main__':
+    main()
diff --git a/runai.py b/runai.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aa0f0594e34f926e12a11ba20312ec4aee710a7
--- /dev/null
+++ b/runai.py
@@ -0,0 +1,371 @@
+# Description: Script to run multiple experiments on runai
+import re
+import subprocess
+import os
+import argparse
+import time
+from prettytable import PrettyTable
+
+class Bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+
+def pretty_table(dct):
+    table = PrettyTable(['Job', 'Status'])
+    for c in sorted(dct.keys()):
+        table.add_row([c, dct[c]])
+    print(table)
+
+
+def init_parser():
+    parser = argparse.ArgumentParser(prog="RUNAI SCRIPT")
+    parser.add_argument('action', type=str, default=None, help='train, test, or run the full pipeline', choices=['train', 'test', 'run'])
+    parser.add_argument('--config_folder', type=str, default=None, help='Run all configs in folder')
+    parser.add_argument('--config', type=str, default=None, help='Run a single config file')
+    parser.add_argument('--name', type=str, default=None, help='Job name prefix')
+    parser.add_argument('--delete', action='store_true', help='Delete job')
+    parser.add_argument('--delete_fail', action='store_true', help='Delete failed jobs')
+    parser.add_argument('--delete_pending', action='store_true', help='Delete pending jobs')
+    parser.add_argument('--log', action='store_true', help='Show logs')
+    parser.add_argument('--delete_folder', action='store_true', help='Delete local workdir folder')
+    parser.add_argument('--permute_keypoints', action='store_true', help='Randomly permute keypoints at test time')
+    parser.add_argument('--dist', action='store_true', help='Distributed Training')
+    parser.add_argument('--find_best', action='store_true', help='Find best according to val')
+    parser.add_argument('--results', action='store_true', help='Show Results')
+    parser.add_argument('--no_base', action='store_true', help='Skip base models')
+    parser.add_argument('--show_cmd', action='store_true', help='Show command')
+    parser.add_argument('--large', action='store_true', help='Use large node')
+    parser.add_argument('--eval_three', action='store_true', help='Evaluate on 3 ckpts')
+    parser.add_argument('--pck', type=float, default=0.2, help='PCK threshold')
+    parser.add_argument('--auc', action='store_true', help='Evaluate AUC')
+    parser.add_argument('--mpck', action='store_true', help='Evaluate mPCK')
+    parser.add_argument('--check_logs', action='store_true', help='check runai logs instead of workdir')
+    parser.add_argument('--stat', action='store_true', help='check runai status')
+    parser.add_argument('--CVPR24', action='store_true', help='run on CVPR24 legacy folder')
+    parser.add_argument('--run_best_ckpt', action='store_true', help='pass --best to run.py (use best checkpoints between stages)')
+    parser.add_argument('--num_samples', type=int, default=32, help='number of test samples per GPU')
+    parser.add_argument('--ft_epochs', type=int, default=None, help='Num of FT epochs')
+    parser.add_argument('--masking', type=float, default=None, help='keypoint masking ratio passed to run.py')
+    parser.add_argument('--masking_lamda', type=float, default=None, help='masking loss weight passed to run.py')
+
+    return parser.parse_args()
+
+
+def check_status(job_name):
+    status = None
+    status_command = f'runai describe job {job_name}'
+    log = subprocess.run(status_command, shell=True, capture_output=True)
+    log = log.stdout.decode('utf-8')
+    pattern = r"Status:\s+(\w+)"
+    match = re.search(pattern, log)
+    if match:
+        status = match.group(1)
+    return status
+
+
+def train_is_running(job_name, status=('Running', 'Pending', 'Failed')):
+    run_status = check_status(job_name)
+    for stat in status:
+        if run_status == stat:
+            print(f'{Bcolors.FAIL}{job_name} is {stat}{Bcolors.ENDC}')
+            return True
+    return False
+
+
+def get_best_run(workdir_path, config, find_best):
+    file_name = None
+    ckpt_path = f'{workdir_path}/latest.pth'
+    if find_best == 'best':
+        local_path = f'work_dir_runai/{config.split(".")[0]}'
+        if os.path.exists(local_path):
+            file_names = [filename for filename in os.listdir(local_path) if filename.startswith("best_")]
+            if len(file_names) > 0:
+                file_name = file_names[0]
+                ckpt_path = f'{workdir_path}/{file_name}'
+    elif find_best == 'epoch_100':
+        local_path = f'work_dir_runai/{config.split(".")[0]}'
+        if os.path.exists(local_path):
+            file_name = 'epoch_100.pth'
+            ckpt_path = f'{workdir_path}/{file_name}'
+    return ckpt_path, file_name
+
+
+def check_runai_logs(job_name):
+    os_command = f'runai logs {job_name}'
+    status = subprocess.run(os_command, shell=True, capture_output=True, text=True)
+    status = status.stdout
+    return status
+
+
+def get_run_name(config, args, run):
+    run = run.replace('_', '-')
+    lwr_config = config.lower()
+    train_job_name = f'or-{lwr_config.split(".")[0].replace("_", "-")}'
+    if len(train_job_name) > 60:
+        renamed_config = name_abriviator(lwr_config)
+        train_job_name = f'or-{renamed_config.split(".")[0].replace("_", "-")}'[:60]
+    test_job_name = f'ev-{run}-{lwr_config.split(".")[0].replace("_", "-")}'
+    if len(test_job_name) > 40:
+        renamed_config = name_abriviator(lwr_config)
+        test_job_name = f'ev-{run}-{renamed_config.split(".")[0].replace("_", "-")}'[:58]
+    job_names = [train_job_name, test_job_name]
+    for i in range(len(job_names)):
+        if job_names[i].endswith('-'):
job_names[i] = job_names[i][:-1] + if args.name is not None: + job_names[i] = f'{args.name}-{job_names[i]}' + return job_names + + +def name_abriviator(name): + replace_dict = { + 'encoder': 'enc', + 'decoder': 'dec', + 'look_twice': 'lt', + 'cross_category': 'cc', + 'max_hops': 'hops', + 'lamda': 'l', + 'symmetric': 'sym', + 'auxiliary': 'aux', + 'batch_size': 'bs', + } + for key, value in replace_dict.items(): + name = name.replace(key, value) + return name + + +def check_skip(lwr_config, args): + if args.no_base and 'base' in lwr_config: + print(f'Skipping {Bcolors.OKCYAN}{lwr_config}{Bcolors.ENDC} - base model') + return True + # if not args.action == "train" and ('cross_category' in lwr_config or 'cross_cat' in lwr_config): + # print( + # f'Skipping {Bcolors.OKCYAN}{lwr_config}{Bcolors.ENDC} - test on cross_caregory, validation is the same as test') + # return True + return False + + +def print_results(results): + print(f'\n\n\n{Bcolors.OKGREEN}Scores{Bcolors.ENDC}') + config_length = max(15, max(len(key) for key in results.keys())) + config_column_width = config_length + 2 + print(f'| {"Config":<{config_column_width}} | {"Max Value":<11} | {"Latest Value":<13} | {"Best Value":<10} | {"Best Epoch":<10} |') + print(f'|{"-" * (config_column_width + 2)}|{"-" * 13}|{"-" * 15}|{"-" * 13}|{"-" * 11}|') + for config, val_dict in sorted(results.items()): + config_print = config.split('/')[-1].replace('.py', '') + other_results = val_dict.copy() + del other_results['latest'] + best_key = max(other_results, key=other_results.get) + latest_val = parse_result(val_dict['latest'], Bcolors.OKBLUE) + best_val = parse_result(val_dict[best_key], Bcolors.HEADER) + if val_dict['latest'] is None and val_dict[best_key] is None: + max_val = f'{Bcolors.WARNING}No results{Bcolors.ENDC}' + elif val_dict['latest'] is None: + max_val = best_val + elif val_dict[best_key] is None: + max_val = latest_val + else: + max_val = latest_val if val_dict['latest'] > val_dict[best_key] else best_val + # print as a table: config, max_val, latest_val, best_val + print(f'| {config_print:<{config_column_width}} | {max_val:<20} | {latest_val:<22} | {best_val:<20} |{best_key:<10} |') + + # print(f'{config_print}: {round(max_val * 100, 2)} ' + # f'Latest: {latest_val} {best_key}: {best_val}') + + +def parse_result(value, color): + if value is None: + return f'{Bcolors.WARNING}No results{Bcolors.ENDC}' + else: + return f'{color}{round(value * 100, 2)}{Bcolors.ENDC}' + + +def main(): + delay = 1 + args = init_parser() + scores = {} + stat = {} + best_run = None + if args.config_folder: + configs = [] + # list all py files in folder and subfolders + if '*' in args.config_folder: + config_folder = args.config_folder.strip("'") + parent_folder = os.path.relpath(os.path.join(config_folder, os.pardir)) + configs = [os.path.join(parent_folder, f) for f in os.listdir(parent_folder) if config_folder.split('*')[0] in os.path.join(parent_folder, f)] + else: + matched_folders = [args.config_folder] + for matched_folder in matched_folders: + for root, dirs, files in os.walk(matched_folder): + for file in files: + if file.endswith(".py"): + configs.append(os.path.join(root, file)) + else: + configs = [args.config] + print(f"{Bcolors.OKGREEN}Running {args.action} on {len(configs)} configs{Bcolors.ENDC}") + if args.action == "test" and not args.eval_three and not args.find_best: + runs = ['latest', 'best'] + elif args.eval_three: + runs = ['latest', 'best', 'epoch_100'] + elif args.find_best: + runs = ['best'] + else: + runs = ['latest'] + for 
config_path in sorted(configs): + for run in runs: + config = config_path.split("/")[-2] + "_" + config_path.split("/")[-1].replace('_config', '') + if args.CVPR24: + workdir_path = f'/storage/orhir/capeformer_legacy/{config.split(".")[0]}' + else: + workdir_path = f'/storage/orhir/capeformer/{config.split(".")[0]}' + local_workdir_path = f'work_dir_runai/{config.split(".")[0]}' + lwr_config = config.lower() + if check_skip(lwr_config, args): + continue + if args.action == "train" or args.action == "run": + gpu = 4 if args.dist else 1 + resource = f' -g {gpu}' + else: + # resource = f' --gpu-memory 4G --cpu 2 --memory 4G' + resource = f' -g 0.3' + if args.large: + resource += f' --node-pools blaufer' + if args.stat: + train_job_name, job_name = get_run_name(config, args, run) + if args.action == "train" or args.action == "run": + job_name = train_job_name + print(f'{"-" * 30 + Bcolors.OKCYAN + job_name + Bcolors.ENDC + "-" * 30}') + status = check_status(job_name) + stat[job_name] = status + continue + # else: + # resource += f' --node-pools faculty' + if args.action == "train": + job_name, _ = get_run_name(config, args, run) + if args.dist: + py_command = (f'python -m torch.distributed.launch ' + f'--nproc_per_node={gpu} --master_port=29500 ' + f'train.py --gpus {gpu} --config {config_path} ' + f'--work-dir {workdir_path} --autoscale-lr ' + f'--launcher pytorch') + else: + py_command = (f'python train.py ' + f' --config {config_path}' + f' --work-dir {workdir_path}') + elif args.action == "run": + job_name, _ = get_run_name(config, args, run) + if args.masking is not None: + masking_precent = int(args.masking * 100) + workdir_path = f'/storage/orhir/capeformer/CVPR25_ablation_mask_{masking_precent}' + job_name += f'-{masking_precent}' + if args.masking_lamda: + workdir_path = f'/storage/orhir/capeformer/CVPR25_ablation_mask_lamda_{int(args.masking_lamda)}' + job_name += f'-lamda-{int(args.masking_lamda)}' + py_command = (f'python run.py ' + f' --config {config_path}' + f' --work_dir {workdir_path}') + if args.run_best_ckpt: + py_command += ' --best' + job_name += '-best' + if args.ft_epochs: + py_command += f' --ft_epochs {args.ft_epochs}' + if args.masking: + py_command += f' --masking_ratio {args.masking}' + if args.masking_lamda: + py_command += f' --lamda_masking {args.masking_lamda}' + else: + train_job_name, job_name = get_run_name(config, args, run) + ckpt_path, best_run = get_best_run(workdir_path, config, run) + py_command = f'python test.py {config_path} {ckpt_path} --num_samples {args.num_samples}' + if args.permute_keypoints: + py_command += ' --permute_keypoints' + job_name = (job_name + '-permute-keypoints')[:60] + print(f'{"-" * 30 + Bcolors.OKCYAN + job_name + Bcolors.ENDC + "-" * 30}') + if args.log: + os_command = f'runai logs {job_name}' + elif args.delete_fail: + if not train_is_running(job_name, ['Failed', 'Error']): + print("Job not failed, skipping...") + continue + os_command = f'runai delete job {job_name}' + elif args.delete_pending: + if not train_is_running(job_name, ['Pending']): + continue + os_command = f'runai delete job {job_name}' + elif args.delete: + os_command = f'runai delete job {job_name}' + elif args.results: + if args.check_logs: + # First check if the job is completed + status = check_runai_logs(job_name) + else: + if args.action == 'run': + log_file = os.path.join(f'work_dir_runai/{config.split(".")[0]}', + 'base_skeleton_bias', + 'testing_log.txt') + else: + log_file = os.path.join(f'work_dir_runai/{config.split(".")[0]}', + 'testing_log.txt') 
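+                    # testing_log.txt is appended by test.py after every evaluation; extract the metric block for this run's checkpoint.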
+ if os.path.exists(log_file): + with open(log_file, 'r') as f: + status = f.read() + # Parse config: + match = re.search(f'\*\*[\s\S]*?checkpoint:\s*.*?{run}[\s\S]*?(AUC:[\s\S]*?mPCK:\s*[\d.]+)', status) + if match: + status = match.group(1) + else: + status = '' + delay = 0 + else: + status = check_runai_logs(job_name) + if args.auc and 'AUC' in status: + score = float(status.split('AUC: ')[1].split('\n')[0]) + elif args.mpck and 'mPCK' in status: + score = float(status.split('mPCK: ')[1].split('\n')[0]) + elif f'PCK@{args.pck}:' in status: + score = float(status.split(f'PCK@{args.pck}: ')[1].split('\n')[0]) + else: + score = None + best_run = best_run.replace('best_PCK_', '').strip('.pth') if best_run else "No Best" + key = 'latest' if run == 'latest' else best_run + if config in scores: + scores[config][key] = score + else: + scores[config] = {key: score} + continue + else: + if args.action == 'test': + if not train_is_running(train_job_name, ['Completed', 'Succeeded']): + print('Train not completed') + continue + os_command = (f'runai submit --pvc=storage:/storage -i orhir/capeformer ' + f' --name {job_name} {resource} --large-shm ' + f' --command -- {py_command}') + # print(os_command) + if args.show_cmd: + print(f'{Bcolors.OKGREEN}{os_command}{Bcolors.ENDC}') + subprocess.run(os_command, shell=True) + if args.delete_folder: + if os.path.exists(local_workdir_path): + subprocess.run(f'rm -rf {local_workdir_path}', shell=True) + else: + subprocess.run(f'echo {Bcolors.WARNING}No workdir folder to delete{Bcolors.ENDC}', shell=True) + # print(f'\n{"-" * 150}') + time.sleep(delay) + if args.results: + print_results(scores) + if args.stat: + pretty_table(stat) + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..e641c8d2f5f61b7b69b1554fe494fc6ea26b481f --- /dev/null +++ b/setup.cfg @@ -0,0 +1,22 @@ +[bdist_wheel] +universal=1 + +[aliases] +test=pytest + +[tool:pytest] +addopts=tests/ + +[yapf] +based_on_style = pep8 +blank_line_before_nested_class_or_def = true +split_before_expression_after_opening_paren = true + +[isort] +line_length = 79 +multi_line_output = 0 +known_standard_library = pkg_resources,setuptools +known_first_party = mmpose +known_third_party = cv2,json_tricks,mmcv,mmdet,munkres,numpy,xtcocotools,torch +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f7a432c91de128e7f6fcfd0f7ef109bbffbc47 --- /dev/null +++ b/setup.py @@ -0,0 +1,111 @@ +import os +import subprocess +import time +from setuptools import find_packages, setup + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + +version_file = 'EdgeCape/version.py' + + +def get_git_hash(): + + def _minimal_ext_cmd(cmd): + # construct minimal environment + env = {} + for k in ['SYSTEMROOT', 'PATH', 'HOME']: + v = os.environ.get(k) + if v is not None: + env[k] = v + # LANGUAGE is used on win32 + env['LANGUAGE'] = 'C' + env['LANG'] = 'C' + env['LC_ALL'] = 'C' + out = subprocess.Popen( + cmd, stdout=subprocess.PIPE, env=env).communicate()[0] + return out + + try: + out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) + sha = out.strip().decode('ascii') + except OSError: + sha = 'unknown' + + return sha + + +def get_hash(): + if os.path.exists('.git'): + sha = get_git_hash()[:7] + elif os.path.exists(version_file): + try: + from EdgeCape.version import __version__ + 
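+            # version.py stores "<short_version>+<git sha>" (see write_version_py below), so take the part after '+'.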
sha = __version__.split('+')[-1] + except ImportError: + raise ImportError('Unable to get git version') + else: + sha = 'unknown' + + return sha + + +def write_version_py(): + content = """# GENERATED VERSION FILE +# TIME: {} +__version__ = '{}' +short_version = '{}' +version_info = ({}) +""" + sha = get_hash() + with open('EdgeCape/VERSION', 'r') as f: + SHORT_VERSION = f.read().strip() + VERSION_INFO = ', '.join(SHORT_VERSION.split('.')) + VERSION = SHORT_VERSION + '+' + sha + + version_file_str = content.format(time.asctime(), VERSION, SHORT_VERSION, + VERSION_INFO) + with open(version_file, 'w') as f: + f.write(version_file_str) + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def get_requirements(filename='requirements.txt'): + here = os.path.dirname(os.path.realpath(__file__)) + with open(os.path.join(here, filename), 'r') as f: + requires = [line.replace('\n', '') for line in f.readlines()] + return requires + + +if __name__ == '__main__': + write_version_py() + setup( + name='edgecape', + version=get_version(), + description='A template for pytorch projects.', + long_description=readme(), + packages=find_packages(exclude=('configs', 'tools', 'demo')), + package_data={'edgecape.ops': ['*/*.so']}, + classifiers=[ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + ], + license='Apache License 2.0', + setup_requires=['pytest-runner', 'cython', 'numpy'], + tests_require=['pytest', 'xdoctest'], + install_requires=get_requirements(), + zip_safe=False) diff --git a/test.py b/test.py new file mode 100644 index 0000000000000000000000000000000000000000..9c1afcdb1f1321d0b9bbd6877af424acfc3977b0 --- /dev/null +++ b/test.py @@ -0,0 +1,164 @@ +import argparse +import os +import os.path as osp +import random +import uuid + +import mmcv +import numpy as np +import torch +from mmcv import Config, DictAction +from mmcv.cnn import fuse_conv_bn +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import get_dist_info, init_dist, load_checkpoint +from EdgeCape import * # noqa +from EdgeCape.datasets import build_dataset +from EdgeCape.apis.test import multi_gpu_test, single_gpu_test +from mmpose.core import wrap_fp16_model +from mmpose.datasets import build_dataloader +from mmpose.models import build_posenet + + +def parse_args(): + parser = argparse.ArgumentParser(description='mmpose test model') + parser.add_argument('config', default=None, help='test config file path') + parser.add_argument('checkpoint', default=None, help='checkpoint file') + parser.add_argument('--out', help='output result file') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase the inference speed') + parser.add_argument( + '--eval', + default=None, + nargs='+', + help='evaluation metric, which depends on the dataset,' + ' e.g., "mAP" for MSCOCO') + parser.add_argument( + '--permute_keypoints', + action='store_true', + help='whether to randomly permute keypoints') + parser.add_argument( + '--gpu_collect', + action='store_true', + help='whether to use gpu to collect results') + parser.add_argument('--tmpdir', help='tmp dir for writing some results') + parser.add_argument( + 
'--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--num_samples', type=int, default=1) + + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def merge_configs(cfg1, cfg2): + # Merge cfg2 into cfg1 + # Overwrite cfg1 if repeated, ignore if value is None. + cfg1 = {} if cfg1 is None else cfg1.copy() + cfg2 = {} if cfg2 is None else cfg2 + for k, v in cfg2.items(): + if v: + cfg1[k] = v + return cfg1 + + +def main(): + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + uuid.UUID(int=0) + + args = parse_args() + + cfg = Config.fromfile(args.config) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + # cfg.model.pretrained = None + cfg.data.test.test_mode = True + + args.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + mmcv.mkdir_or_exist(osp.abspath(args.work_dir)) + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # build the dataloader + dataset = build_dataset(cfg.data.test, dict(test_mode=True)) + dataloader_setting = dict( + samples_per_gpu=args.num_samples, + workers_per_gpu=cfg.data.get('workers_per_gpu', 12), + dist=distributed, + shuffle=False, + drop_last=False) + dataloader_setting = dict(dataloader_setting, + **cfg.data.get('test_dataloader', {})) + data_loader = build_dataloader(dataset, **dataloader_setting) + # build the model and load checkpoint + model = build_posenet(cfg.model) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + load_checkpoint(model, args.checkpoint, map_location='cpu') + + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + + if not distributed: + model = MMDataParallel(model, device_ids=[0]) + outputs = single_gpu_test(model, data_loader) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) + + rank, _ = get_dist_info() + eval_config = cfg.get('evaluation', {}) + eval_config = merge_configs(eval_config, dict(metric=args.eval)) + + if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + mmcv.dump(outputs, args.out) + + results = dataset.evaluate(outputs, **eval_config) + print('\n') + for k, v in sorted(results.items()): + print(f'{k}: {v}') + + # save testing log + test_log_path = osp.dirname(args.checkpoint) + test_log_file = "testing_log.txt" + test_log = osp.join(test_log_path, test_log_file) + with open(test_log, 'a') as f: + f.write("** config_file: " + args.config + "\t checkpoint: " + args.checkpoint + "\t \n") + for k, v in sorted(results.items()): + f.write(f'\t {k}: {v}'+'\n') + f.write("********************************************************************\n") + +if __name__ == '__main__': + main() diff 
--git a/train.py b/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f64fff608890a9b8e0915ecb8f22c90e17eb7c9
--- /dev/null
+++ b/train.py
@@ -0,0 +1,201 @@
+import argparse
+import copy
+import os
+import os.path as osp
+import time
+import random
+import uuid
+import numpy as np
+import mmcv
+import torch
+from mmcv import Config, DictAction
+from mmcv.runner import get_dist_info, init_dist, set_random_seed
+from mmcv.utils import get_git_hash
+
+from EdgeCape import * # noqa
+from EdgeCape.apis import train_model
+from EdgeCape.datasets import build_dataset
+
+from mmpose import __version__
+from mmpose.models import build_posenet
+from mmpose.utils import collect_env, get_root_logger
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a pose model')
+    parser.add_argument('--config', default=None, help='train config file path')
+    parser.add_argument('--work-dir', default=None, help='the dir to save logs and models')
+    parser.add_argument(
+        '--resume-from', help='the checkpoint file to resume from')
+    parser.add_argument('--load-from', help='the checkpoint file to load from')
+    parser.add_argument(
+        '--auto-resume', type=bool, default=True, help='automatically detect the latest checkpoint in the work dir and resume from it.')
+    parser.add_argument(
+        '--no-validate',
+        action='store_true',
+        help='whether not to evaluate the checkpoint during training')
+    group_gpus = parser.add_mutually_exclusive_group()
+    group_gpus.add_argument(
+        '--gpus',
+        type=int,
+        help='number of gpus to use '
+        '(only applicable to non-distributed training)')
+    group_gpus.add_argument(
+        '--gpu-ids',
+        type=int,
+        nargs='+',
+        help='ids of gpus to use '
+        '(only applicable to non-distributed training)')
+    parser.add_argument('--seed', type=int, default=None, help='random seed')
+    parser.add_argument(
+        '--deterministic',
+        action='store_true',
+        help='whether to set deterministic options for CUDNN backend.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        default={},
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. For example, '
+        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local-rank', type=int, default=0)
+    parser.add_argument(
+        '--autoscale-lr',
+        action='store_true',
+        help='automatically scale lr with the number of gpus')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='whether to display the prediction results in a window.')
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    return args
+
+
+def main():
+    args = parse_args()
+    # torch.autograd.set_detect_anomaly(True)
+    cfg = Config.fromfile(args.config)
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    torch.backends.cudnn.benchmark = True
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    # work_dir is determined in this priority: CLI
+    # > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+    # auto resume
+    if args.auto_resume:
+        # use cfg.work_dir so auto-resume also works when --work-dir is not given
+        checkpoint = os.path.join(cfg.work_dir, 'latest.pth')
+        if os.path.exists(checkpoint):
+            cfg.resume_from = checkpoint
+    if args.load_from is not None:
+        cfg.load_from = args.load_from
+    if args.resume_from is not None:
+        cfg.resume_from = args.resume_from
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids
+    else:
+        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+
+    if args.autoscale_lr:
+        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
+        cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
+
+    # init distributed env first, since logger depends on the dist info.
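+    # (The NCCL env vars set in the distributed branch relax the blocking-wait timeout and disable P2P transfers,
+    # presumably to avoid hangs on some multi-GPU nodes.)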
+ if args.launcher == 'none': + distributed = False + else: + distributed = True + os.environ['NCCL_BLOCKING_WAIT'] = '0' # not to enforce timeout + os.environ['NCCL_P2P_DISABLE'] = '1' + init_dist(args.launcher, **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + args.seed = 1 + args.deterministic = True + if args.seed is not None: + logger.info(f'Set random seed to {args.seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + uuid.UUID(int=args.seed) + + model = build_posenet(cfg.model) + train_datasets = [build_dataset(cfg.data.train)] + + # if len(cfg.workflow) == 2: + # val_dataset = copy.deepcopy(cfg.data.val) + # val_dataset.pipeline = cfg.data.train.pipeline + # datasets.append(build_dataset(val_dataset)) + + val_dataset = copy.deepcopy(cfg.data.val) + val_dataset = build_dataset(val_dataset, dict(test_mode=True)) + + if cfg.checkpoint_config is not None: + # save mmpose version, config file content + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmpose_version=__version__ + get_git_hash(digits=7), + config=cfg.pretty_text, + ) + train_model( + model, + train_datasets, + val_dataset, + cfg, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main()
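+
+# Typical single-GPU usage (this is how run.py invokes the script):
+#   python train.py --config <config.py> --work-dir <work_dir>
+# runai.py launches the distributed variant via torch.distributed.launch with --launcher pytorch.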