diff --git a/EdgeCape/VERSION b/EdgeCape/VERSION new file mode 100644 index 0000000000000000000000000000000000000000..0ea3a944b399d25f7e1b8fe684d754eb8da9fe7f --- /dev/null +++ b/EdgeCape/VERSION @@ -0,0 +1 @@ +0.2.0 diff --git a/EdgeCape/__init__.py b/EdgeCape/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d12119c91b4a54a136b3a13a5a695bfa90d27ea8 --- /dev/null +++ b/EdgeCape/__init__.py @@ -0,0 +1,3 @@ +from .core import * # noqa +from .datasets import * # noqa +from .models import * # noqa diff --git a/EdgeCape/__pycache__/__init__.cpython-39.pyc b/EdgeCape/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbaf1a61d0188e43097b7cd4adf0d9542a9f3178 Binary files /dev/null and b/EdgeCape/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/apis/__init__.py b/EdgeCape/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..500c844f99bf7725c185e94c289ccf5613d09da5 --- /dev/null +++ b/EdgeCape/apis/__init__.py @@ -0,0 +1,5 @@ +from .train import train_model + +__all__ = [ + 'train_model' +] diff --git a/EdgeCape/apis/__pycache__/__init__.cpython-39.pyc b/EdgeCape/apis/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a92d25ced6e51a27dc7c78a1609980cff0b45372 Binary files /dev/null and b/EdgeCape/apis/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/apis/__pycache__/test.cpython-39.pyc b/EdgeCape/apis/__pycache__/test.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce1b2e7a94659701418f4c76e640a0f79f64b183 Binary files /dev/null and b/EdgeCape/apis/__pycache__/test.cpython-39.pyc differ diff --git a/EdgeCape/apis/__pycache__/train.cpython-39.pyc b/EdgeCape/apis/__pycache__/train.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24b3ed6df4a3eb358d92637846f83dc9787493ca Binary files /dev/null and b/EdgeCape/apis/__pycache__/train.cpython-39.pyc differ diff --git a/EdgeCape/apis/test.py b/EdgeCape/apis/test.py new file mode 100644 index 0000000000000000000000000000000000000000..a28324ffe4c154bf5af4a5d1226d01c13d527124 --- /dev/null +++ b/EdgeCape/apis/test.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import pickle +import shutil +import tempfile + +import mmcv +import numpy as np +import torch +import torch.distributed as dist +from mmcv.runner import get_dist_info + + +def single_gpu_test(model, data_loader): + """Test model with a single gpu. + + This method tests model with a single gpu and displays test progress bar. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + + + Returns: + list: The prediction results. 
+ """ + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for data in data_loader: + with torch.no_grad(): + result = model(return_loss=False, **data) + batch_size = len(next(iter(data.values()))[0]) + # results.append(result) + if 'preds' in result: + for i in range(batch_size): + results.append({ + 'preds': result['preds'][i][None], + 'boxes': result['boxes'][i][None], + 'bbox_ids': [result['bbox_ids'][i]], + 'image_paths': [result['image_paths'][i]], + }) + # use the first key as main key to calculate the batch size + # for _ in range(batch_size): + prog_bar.update(batch_size) + return results + + +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + for data in data_loader: + with torch.no_grad(): + result = model(return_loss=False, **data) + results.append(result) + + if rank == 0: + # use the first key as main key to calculate the batch size + batch_size = len(next(iter(data.values()))) + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + return results + + +def collect_results_cpu(result_part, size, tmpdir=None): + """Collect results in cpu mode. + + It saves the results on different gpus to 'tmpdir' and collects + them by the rank 0 worker. + + Args: + result_part (list): Results to be collected + size (int): Result size. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. Default: None + + Returns: + list: Ordered results. 
+ """ + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # synchronizes all processes to make sure tmpdir exist + dist.barrier() + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + # synchronizes all processes for loading pickle file + dist.barrier() + # collect all parts + if rank != 0: + return None + + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + """Collect results in gpu mode. + + It encodes results to gpu tensors and use gpu communication for results + collection. + + Args: + result_part (list): Results to be collected + size (int): Result size. + + Returns: + list: Ordered results. + """ + + rank, world_size = get_dist_info() + # dump result part to tensor with pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device='cuda') + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') + part_send[:shape_tensor[0]] = part_tensor + part_recv_list = [ + part_tensor.new_zeros(shape_max) for _ in range(world_size) + ] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_list.append( + pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results + return None diff --git a/EdgeCape/apis/train.py b/EdgeCape/apis/train.py new file mode 100644 index 0000000000000000000000000000000000000000..b3113e274e96890b32b5a6aaba31532d2258a743 --- /dev/null +++ b/EdgeCape/apis/train.py @@ -0,0 +1,124 @@ +import os +import torch +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook, + build_optimizer) + +from mmpose.core import DistEvalHook, EvalHook, Fp16OptimizerHook +from mmpose.datasets import build_dataloader +from mmpose.utils import get_root_logger +from EdgeCape.core.custom_hooks.shuffle_hooks import ShufflePairedSamplesHook + +def train_model(model, + dataset, + val_dataset, + cfg, + distributed=False, + validate=False, + 
timestamp=None, + meta=None): + """Train model entry function. + + Args: + model (nn.Module): The model to be trained. + dataset (Dataset): Train dataset. + cfg (dict): The config dict for training. + distributed (bool): Whether to use distributed training. + Default: False. + validate (bool): Whether to do evaluation. Default: False. + timestamp (str | None): Local time for runner. Default: None. + meta (dict | None): Meta dict to record some important information. + Default: None + """ + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + dataloader_setting = dict( + samples_per_gpu=cfg.data.get('samples_per_gpu', {}), + workers_per_gpu=cfg.data.get('workers_per_gpu', {}), + # cfg.gpus will be ignored if distributed + num_gpus=len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + pin_memory=False, + ) + dataloader_setting = dict(dataloader_setting, + **cfg.data.get('train_dataloader', {})) + + data_loaders = [ + build_dataloader(ds, **dataloader_setting) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', True) # NOTE: True has been modified to False for faster training. + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + runner = EpochBasedRunner( + model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta) + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + if distributed: + runner.register_hook(DistSamplerSeedHook()) + + shuffle_cfg = cfg.get('shuffle_cfg', None) + if shuffle_cfg is not None: + for data_loader in data_loaders: + runner.register_hook(ShufflePairedSamplesHook(data_loader, **shuffle_cfg)) + + # register eval hooks + if validate: + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['res_folder'] = os.path.join(cfg.work_dir, eval_cfg['res_folder']) + dataloader_setting = dict( + # samples_per_gpu=cfg.data.get('samples_per_gpu', {}), + samples_per_gpu=1, + workers_per_gpu=cfg.data.get('workers_per_gpu', {}), + # cfg.gpus will be ignored if distributed + num_gpus=len(cfg.gpu_ids), + dist=distributed, + shuffle=False, + pin_memory=False, + ) + dataloader_setting = dict(dataloader_setting, + **cfg.data.get('val_dataloader', {})) + val_dataloader = build_dataloader(val_dataset, **dataloader_setting) + eval_hook = DistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, 
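The dataloader configuration in `train_model` is built by layering dict defaults: base settings derived from `cfg.data`, then overridden by an optional `cfg.data.train_dataloader` block. A small sketch of that merge pattern with a hypothetical config dict:

```python
# Hypothetical cfg.data-style dict; keys mirror those used in train_model.
data_cfg = {
    'samples_per_gpu': 16,
    'workers_per_gpu': 2,
    'train_dataloader': {'samples_per_gpu': 8},  # per-loader override
}

dataloader_setting = dict(
    samples_per_gpu=data_cfg.get('samples_per_gpu', {}),
    workers_per_gpu=data_cfg.get('workers_per_gpu', {}),
    dist=False,
    shuffle=True,
)
# Later keys win, so the train_dataloader block overrides the defaults.
dataloader_setting = dict(dataloader_setting,
                          **data_cfg.get('train_dataloader', {}))
assert dataloader_setting['samples_per_gpu'] == 8
```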
cfg.workflow, cfg.total_epochs) diff --git a/EdgeCape/core/__init__.py b/EdgeCape/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/EdgeCape/core/__init__.py @@ -0,0 +1 @@ + diff --git a/EdgeCape/core/__pycache__/__init__.cpython-39.pyc b/EdgeCape/core/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..621d6daae565649aeb11af5f270aaa93cfb7c936 Binary files /dev/null and b/EdgeCape/core/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/core/custom_hooks/__pycache__/shuffle_hooks.cpython-39.pyc b/EdgeCape/core/custom_hooks/__pycache__/shuffle_hooks.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16d7f9a3d09ef946f8d3b4872569f0efb0a61a51 Binary files /dev/null and b/EdgeCape/core/custom_hooks/__pycache__/shuffle_hooks.cpython-39.pyc differ diff --git a/EdgeCape/core/custom_hooks/shuffle_hooks.py b/EdgeCape/core/custom_hooks/shuffle_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..b4fa43b816ad2c8fb3f93a587d25ac4e02e4e18b --- /dev/null +++ b/EdgeCape/core/custom_hooks/shuffle_hooks.py @@ -0,0 +1,28 @@ +from mmcv.runner import Hook +from torch.utils.data import DataLoader +from mmpose.utils import get_root_logger + +class ShufflePairedSamplesHook(Hook): + """Non-Distributed ShufflePairedSamples. + After each training epoch, run FewShotKeypointDataset.random_paired_samples() + """ + + def __init__(self, + dataloader, + interval=1): + if not isinstance(dataloader, DataLoader): + raise TypeError(f'dataloader must be a pytorch DataLoader, ' + f'but got {type(dataloader)}') + + self.dataloader = dataloader + self.interval = interval + self.logger = get_root_logger() + + def after_train_epoch(self, runner): + """Called after every training epoch to evaluate the results.""" + if not self.every_n_epochs(runner, self.interval): + return + # self.logger.info("Run random_paired_samples()") + # self.logger.info(f"Before: {self.dataloader.dataset.paired_samples[0]}") + self.dataloader.dataset.random_paired_samples() + # self.logger.info(f"After: {self.dataloader.dataset.paired_samples[0]}") diff --git a/EdgeCape/datasets/__init__.py b/EdgeCape/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..25529624cf32c145ca0bf686af2899bb386d5d28 --- /dev/null +++ b/EdgeCape/datasets/__init__.py @@ -0,0 +1,3 @@ +from .builder import * # noqa +from .datasets import * # noqa +from .pipelines import * # noqa diff --git a/EdgeCape/datasets/__pycache__/__init__.cpython-39.pyc b/EdgeCape/datasets/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4de4c2a0d6795b20c1e4f4c84dd001473552f0bb Binary files /dev/null and b/EdgeCape/datasets/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/datasets/__pycache__/builder.cpython-39.pyc b/EdgeCape/datasets/__pycache__/builder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..30750e1d1d6a20d048432ded43899d35cc9af8a3 Binary files /dev/null and b/EdgeCape/datasets/__pycache__/builder.cpython-39.pyc differ diff --git a/EdgeCape/datasets/builder.py b/EdgeCape/datasets/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..3468bf9ea271a54a04666699c01a585bd4c387d7 --- /dev/null +++ b/EdgeCape/datasets/builder.py @@ -0,0 +1,55 @@ +from mmcv.utils import build_from_cfg +from torch.utils.data.dataset import 
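`ShufflePairedSamplesHook` simply calls the training dataset's `random_paired_samples()` at the end of every `interval`-th epoch, so support/query pairs are re-drawn between epochs. A minimal behavioural sketch using stub runner/dataset classes (both hypothetical) instead of a real mmcv training loop; the modulo condition mirrors the `every_n_epochs` check:

```python
class FakeDataset:
    """Stand-in for a dataset exposing random_paired_samples()."""
    def __init__(self):
        self.reshuffles = 0
    def random_paired_samples(self):
        self.reshuffles += 1

class FakeRunner:
    def __init__(self, epoch):
        self.epoch = epoch

def after_train_epoch(runner, dataset, interval=1):
    # Fire only every `interval` epochs, as the hook does.
    if (runner.epoch + 1) % interval != 0:
        return
    dataset.random_paired_samples()

ds = FakeDataset()
for epoch in range(4):
    after_train_epoch(FakeRunner(epoch), ds, interval=2)
assert ds.reshuffles == 2  # re-shuffled after the 2nd and 4th epoch
```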
ConcatDataset + +from mmpose.datasets.dataset_wrappers import RepeatDataset +from mmpose.datasets.builder import DATASETS + + +def _concat_cfg(cfg): + replace = ['ann_file', 'img_prefix'] + channels = ['num_joints', 'dataset_channel'] + concat_cfg = [] + for i in range(len(cfg['type'])): + cfg_tmp = cfg.deepcopy() + cfg_tmp['type'] = cfg['type'][i] + for item in replace: + assert item in cfg_tmp + assert len(cfg['type']) == len(cfg[item]), (cfg[item]) + cfg_tmp[item] = cfg[item][i] + for item in channels: + assert item in cfg_tmp['data_cfg'] + assert len(cfg['type']) == len(cfg['data_cfg'][item]) + cfg_tmp['data_cfg'][item] = cfg['data_cfg'][item][i] + concat_cfg.append(cfg_tmp) + return concat_cfg + + +def _check_vaild(cfg): + replace = ['num_joints', 'dataset_channel'] + if isinstance(cfg['data_cfg'][replace[0]], (list, tuple)): + for item in replace: + cfg['data_cfg'][item] = cfg['data_cfg'][item][0] + return cfg + + +def build_dataset(cfg, default_args=None): + """Build a dataset from config dict. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + default_args (dict, optional): Default initialization arguments. + Default: None. + + Returns: + Dataset: The constructed dataset. + """ + if isinstance(cfg['type'], (list, tuple)): # In training, type=TransformerPoseDataset + dataset = ConcatDataset( + [build_dataset(c, default_args) for c in _concat_cfg(cfg)]) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + build_dataset(cfg['dataset'], default_args), cfg['times']) + else: + cfg = _check_vaild(cfg) + dataset = build_from_cfg(cfg, DATASETS, default_args) + return dataset diff --git a/EdgeCape/datasets/datasets/__init__.py b/EdgeCape/datasets/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..21977d38975ec150e01f01e4ee44ffbfc6ded25e --- /dev/null +++ b/EdgeCape/datasets/datasets/__init__.py @@ -0,0 +1,6 @@ +from .mp100 import (FewShotKeypointDataset, FewShotBaseDataset, + TransformerBaseDataset, TransformerPoseDataset,) + +__all__ = ['FewShotBaseDataset', 'FewShotKeypointDataset', + 'TransformerBaseDataset', 'TransformerPoseDataset', + ] diff --git a/EdgeCape/datasets/datasets/__pycache__/__init__.cpython-39.pyc b/EdgeCape/datasets/datasets/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fc96fe82b2013fb373442481f9c23ac0cf483de Binary files /dev/null and b/EdgeCape/datasets/datasets/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__init__.py b/EdgeCape/datasets/datasets/mp100/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..17c22428d55a700d541e27d8f2d5b0c168e9c693 --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/__init__.py @@ -0,0 +1,13 @@ +from .fewshot_dataset import FewShotKeypointDataset +from .fewshot_base_dataset import FewShotBaseDataset +from .transformer_dataset import TransformerPoseDataset +from .transformer_base_dataset import TransformerBaseDataset +from .test_base_dataset import TestBaseDataset +from .test_dataset import TestPoseDataset +from .custom_test_dataset import CustomTestPoseDataset + +__all__ = [ + 'FewShotKeypointDataset', 'FewShotBaseDataset', + 'TransformerPoseDataset', 'TransformerBaseDataset', + 'TestBaseDataset', 'TestPoseDataset', 'CustomTestPoseDataset' +] diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/__init__.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 
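When `cfg['type']` is a list, `_concat_cfg` fans a single config out into one config per dataset by indexing its list-valued fields, and `build_dataset` wraps the results in a `ConcatDataset`. A simplified sketch of that fan-out using plain dicts (the real code works on an mmcv `Config` and also splits the per-dataset `data_cfg` channels):

```python
import copy

def split_concat_cfg(cfg, list_keys=('ann_file', 'img_prefix')):
    """Turn one config whose 'type' is a list into per-dataset configs."""
    per_dataset = []
    for i, dtype in enumerate(cfg['type']):
        sub = copy.deepcopy(cfg)
        sub['type'] = dtype
        for key in list_keys:
            assert len(cfg[key]) == len(cfg['type'])
            sub[key] = cfg[key][i]
        per_dataset.append(sub)
    return per_dataset

cfg = {
    'type': ['TransformerPoseDataset', 'TransformerPoseDataset'],
    'ann_file': ['train_a.json', 'train_b.json'],
    'img_prefix': ['images_a/', 'images_b/'],
}
subs = split_concat_cfg(cfg)
assert [s['ann_file'] for s in subs] == ['train_a.json', 'train_b.json']
```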
0000000000000000000000000000000000000000..8d43ce586c0ed7efed0387fda6ccf3a7bbfeb129 Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/custom_test_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/custom_test_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e67a35f10822a0f8ea06d82fcb81cf6bec33d7ce Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/custom_test_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_base_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_base_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4877d87ccbe185bcd8cbf1c7f635bc46c55dbc5e Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_base_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3d1c9df69ad6d912446ce6223d32d8257c388d22 Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/fewshot_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/test_base_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/test_base_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d1d9119202ff6747398d9023409080d9e63a940 Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/test_base_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/test_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/test_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dac9750321961c677dc87af5d1de840eb9efefef Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/test_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_base_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_base_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f52dcb1165551f763dcb19ff3c84c2993247972c Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_base_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_dataset.cpython-39.pyc b/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_dataset.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dd23318bc903674f986ab7063222b0f373f79e76 Binary files /dev/null and b/EdgeCape/datasets/datasets/mp100/__pycache__/transformer_dataset.cpython-39.pyc differ diff --git a/EdgeCape/datasets/datasets/mp100/custom_test_dataset.py b/EdgeCape/datasets/datasets/mp100/custom_test_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..798b8d98ad501b7dcc793fc01d7a0781ff1bca88 --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/custom_test_dataset.py @@ -0,0 +1,355 @@ +from mmpose.datasets import DATASETS +import random +import numpy as np +import os +from collections import OrderedDict +from xtcocotools.coco import COCO +from .test_base_dataset import TestBaseDataset + +@DATASETS.register_module() +class 
CustomTestPoseDataset(TestBaseDataset): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + valid_class_ids, + max_kpt_num=None, + num_shots=1, + num_queries=100, + num_episodes=1, + pck_threshold_list=[0.05, 0.1, 0.15, 0.20, 0.25], + test_mode=True): + super().__init__( + ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode, PCK_threshold_list=pck_threshold_list) + + self.ann_info['flip_pairs'] = [] + + self.ann_info['upper_body_ids'] = [] + self.ann_info['lower_body_ids'] = [] + + self.ann_info['use_different_joint_weights'] = False + self.ann_info['joint_weights'] = np.array([1.,], + dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) + + self.coco = COCO(ann_file) + + self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) + self.img_ids = self.coco.getImgIds() + + cat = None + relevant_names = [ + '000000052046', + '000000052152' + + # '000000027059', + # '000000030361' + # '000000027936', + # 'Pileated_Woodpecker_0004_180307', 'American_Three_Toed_Woodpecker_0019_179870' + # '000000016379', '000000008869' + # 'commonwarthog_115', + # 'commonwarthog_78' + # '000000027059', '000000030361', '000000027936' + # 'klipspringer_66', '000000008333', '000000026814', '000000047543', '000000052080', 'Common_Tern_0050_148928' + ] + if len(relevant_names) > 0: + if cat is not None: + relevant_names = [os.path.join(cat, name) for name in relevant_names] + self.img_ids = [img_id for img_id in self.img_ids if self.id2name[img_id] in relevant_names] + else: + new_ids = [] + for relevant_name in relevant_names: + new_ids += [img_id for img_id in self.img_ids if relevant_name in self.id2name[img_id]] + self.img_ids = new_ids + else: + self.img_ids = [img_id for img_id in self.img_ids if cat == self.id2name[img_id].split('/')[0]] + + self.classes = [ + cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + + self.num_classes = len(self.classes) + self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds())) + self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes)) + + if valid_class_ids is not None: # None by default + self.valid_class_ids = valid_class_ids + else: + self.valid_class_ids = self.coco.getCatIds() + + self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids] + self.cats = self.coco.cats + self.max_kpt_num = max_kpt_num + + # Also update self.cat2obj + self.db = self._get_db() + + self.num_shots = num_shots + + if not test_mode: + # Update every training epoch + self.random_paired_samples() + else: + self.num_queries = num_queries + self.num_episodes = num_episodes + self.make_paired_samples() + + + def random_paired_samples(self): + num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes] + + # balance the dataset + max_num_data = max(num_datas) + + all_samples = [] + for cls in self.valid_class_ids: + for i in range(max_num_data): + shot = random.sample(self.cat2obj[cls], self.num_shots + 1) + all_samples.append(shot) + + self.paired_samples = np.array(all_samples) + np.random.shuffle(self.paired_samples) + + def make_paired_samples(self): + random.seed(1) + np.random.seed(0) + all_samples = [] + self.num_episodes = 1000 + for cls in self.valid_class_ids: + for _ in range(self.num_episodes): + if self.cat2obj[cls] == []: + continue + self.num_queries = 1 + self.num_shots = 1 + if len(self.cat2obj[cls]) < self.num_shots + self.num_queries: + shots = random.choices(self.cat2obj[cls], k=self.num_shots + self.num_queries) + else: + shots = 
random.sample(self.cat2obj[cls], self.num_shots + self.num_queries) + sample_ids = shots[:self.num_shots] + query_ids = shots[self.num_shots:] + for query_id in query_ids: + all_samples.append(sample_ids + [query_id]) + all_samples.append([query_id] + [query_id]) + + self.paired_samples = np.array(list(set(tuple(x) for x in all_samples))) + + def _select_kpt(self, obj, kpt_id): + obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id+1] + obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id+1] + obj['kpt_id'] = kpt_id + + return obj + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _get_db(self): + """Ground truth bbox and keypoints.""" + self.obj_id = 0 + + self.cat2obj = {} + for i in self.coco.getCatIds(): + self.cat2obj.update({i: []}) + + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + + category_id = obj['category_id'] + # the number of keypoint for this specific category + cat_kpt_num = int(len(obj['keypoints']) / 3) + if self.max_kpt_num is None: + kpt_num = cat_kpt_num + else: + kpt_num = self.max_kpt_num + + joints_3d = np.zeros((kpt_num, 3), dtype=np.float32) + joints_3d_visible = np.zeros((kpt_num, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:cat_kpt_num, :2] = keypoints[:, :2] + joints_3d_visible[:cat_kpt_num, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + + self.cat2obj[category_id].append(self.obj_id) + + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': obj['clean_bbox'][:4], + 'bbox_score': 1, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'category_id': category_id, + 'cat_kpt_num': cat_kpt_num, + 'bbox_id': self.obj_id, + 'skeleton': self.coco.cats[obj['category_id']]['skeleton'], + }) + bbox_id = bbox_id + 1 + self.obj_id += 1 + + return rec + + def _xywh2cs(self, x, y, w, h): + """This encodes bbox(x,y,w,w) into (center, scale) + + Args: + x, y, w, h + + Returns: + 
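Each test episode built above is a list of `num_shots` support ids followed by one query id, drawn per category, with replacement only when a category has too few objects. A compact sketch of that episode construction over a toy `cat2obj` mapping (the real `make_paired_samples` additionally appends query self-pairs and deduplicates):

```python
import random

def make_episodes(cat2obj, num_shots=1, num_queries=1, num_episodes=3, seed=1):
    """Build [support_ids..., query_id] lists per category."""
    random.seed(seed)
    episodes = []
    for cls, obj_ids in cat2obj.items():
        if not obj_ids:
            continue
        for _ in range(num_episodes):
            k = num_shots + num_queries
            # Sample with replacement only when the category is too small.
            shots = (random.choices(obj_ids, k=k) if len(obj_ids) < k
                     else random.sample(obj_ids, k))
            supports, queries = shots[:num_shots], shots[num_shots:]
            for q in queries:
                episodes.append(supports + [q])
    return episodes

eps = make_episodes({'dog': [0, 1, 2], 'bird': [3]})
assert all(len(e) == 2 for e in eps)  # 1-shot support + 1 query
```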
tuple: A tuple containing center and scale. + + - center (np.ndarray[float32](2,)): center of the bbox (x, y). + - scale (np.ndarray[float32](2,)): scale of the bbox w & h. + """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + # + # if (not self.test_mode) and np.random.rand() < 0.3: + # center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * 1.25 + + return center, scale + + def evaluate(self, outputs, res_folder, metric='PCK', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + outputs (list(preds, boxes, image_path, output_heatmap)) + :preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. + :boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + :image_paths (list[str]): For example, ['C', 'a', 'p', 't', + 'u', 'r', 'e', '1', '2', '/', '0', '3', '9', '0', '_', + 'd', 'h', '_', 't', 'o', 'u', 'c', 'h', 'R', 'O', 'M', + '/', 'c', 'a', 'm', '4', '1', '0', '2', '0', '9', '/', + 'i', 'm', 'a', 'g', 'e', '6', '2', '4', '3', '4', '.', + 'j', 'p', 'g'] + :output_heatmap (np.ndarray[N, K, H, W]): model outpus. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
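The bbox-to-(center, scale) conversion used throughout these datasets follows the usual top-down convention: match the aspect ratio of the model input, express the size in units of 200 pixels, and pad by 1.25 for context. A standalone version of the `_xywh2cs` computation (input size is an assumed example value):

```python
import numpy as np

def xywh_to_center_scale(x, y, w, h, input_size=(256, 256), padding=1.25):
    """Convert a COCO-style bbox (x, y, w, h) to (center, scale)."""
    aspect_ratio = input_size[0] / input_size[1]
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
    # Expand the shorter side so the box matches the input aspect ratio.
    if w > aspect_ratio * h:
        h = w / aspect_ratio
    elif w < aspect_ratio * h:
        w = h * aspect_ratio
    # Scale is expressed in units of 200 pixels, then padded for context.
    scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) * padding
    return center, scale

center, scale = xywh_to_center_scale(10, 20, 100, 50)
```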
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE', 'NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + + kpts = [] + for output in outputs: + preds = output['preds'] + boxes = output['boxes'] + image_paths = output['image_paths'] + bbox_ids = output['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + return name_value diff --git a/EdgeCape/datasets/datasets/mp100/fewshot_base_dataset.py b/EdgeCape/datasets/datasets/mp100/fewshot_base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..fc7f067869603605f0f9189e96d59a5e577307cf --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/fewshot_base_dataset.py @@ -0,0 +1,223 @@ +import copy +from abc import ABCMeta, abstractmethod +import json_tricks as json +import numpy as np + +from mmcv.parallel import DataContainer as DC +from mmpose.core.evaluation.top_down_eval import (keypoint_auc, keypoint_epe, + keypoint_pck_accuracy) +from torch.utils.data import Dataset +from mmpose.datasets import DATASETS +from mmpose.datasets.pipelines import Compose + +@DATASETS.register_module() +class FewShotBaseDataset(Dataset, metaclass=ABCMeta): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + test_mode=False): + self.image_info = {} + self.ann_info = {} + + self.annotations_path = ann_file + if not img_prefix.endswith('/'): + img_prefix = img_prefix + '/' + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['flip_pairs'] = None + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.db = [] + self.num_shots = 1 + self.paired_samples = [] + self.pipeline = Compose(self.pipeline) + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def _select_kpt(self, obj, kpt_id): + """Select kpt.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + """Evaluate keypoint results.""" + raise NotImplementedError + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, + res_file, + metrics, + pck_thr=0.2, + pckh_thr=0.7, + auc_nor=30): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'PCK', 'PCKh', 'AUC', 'EPE'. 
+ pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. + """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.paired_samples) + + outputs = [] + gts = [] + masks = [] + threshold_bbox = [] + threshold_head_box = [] + + for pred, pair in zip(preds, self.paired_samples): + item = self.db[pair[-1]] + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + + mask_query = ((np.array(item['joints_3d_visible'])[:, 0]) > 0) + mask_sample = ((np.array(self.db[pair[0]]['joints_3d_visible'])[:, 0]) > 0) + for id_s in pair[:-1]: + mask_sample = np.bitwise_and(mask_sample, ((np.array(self.db[id_s]['joints_3d_visible'])[:, 0]) > 0)) + masks.append(np.bitwise_and(mask_query, mask_sample)) + + if 'PCK' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + if 'PCKh' in metrics: + head_box_thr = item['head_size'] + threshold_head_box.append( + np.array([head_box_thr, head_box_thr])) + + if 'PCK' in metrics: + pck_avg = [] + for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox): + _, pck, _ = keypoint_pck_accuracy(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0), pck_thr, np.expand_dims(thr_bbox,0)) + pck_avg.append(pck) + info_str.append(('PCK', np.mean(pck_avg))) + + return info_str + + def _merge_obj(self, Xs_list, Xq, idx): + """ merge Xs_list and Xq. + + :param Xs_list: N-shot samples X + :param Xq: query X + :param idx: id of paired_samples + :return: Xall + """ + Xall = dict() + Xall['img_s'] = [Xs['img'] for Xs in Xs_list] + Xall['target_s'] = [Xs['target'] for Xs in Xs_list] + Xall['target_weight_s'] = [Xs['target_weight'] for Xs in Xs_list] + xs_img_metas = [Xs['img_metas'].data for Xs in Xs_list] + + Xall['img_q'] = Xq['img'] + Xall['target_q'] = Xq['target'] + Xall['target_weight_q'] = Xq['target_weight'] + xq_img_metas = Xq['img_metas'].data + + img_metas = dict() + for key in xq_img_metas.keys(): + img_metas['sample_' + key] = [xs_img_meta[key] for xs_img_meta in xs_img_metas] + img_metas['query_' + key] = xq_img_metas[key] + img_metas['bbox_id'] = idx + + Xall['img_metas'] = DC(img_metas, cpu_only=True) + + return Xall + + def __len__(self): + """Get the size of the dataset.""" + return len(self.paired_samples) + + def __getitem__(self, idx): + """Get the sample given index.""" + + pair_ids = self.paired_samples[idx] + assert len(pair_ids) == self.num_shots + 1 + sample_id_list = pair_ids[:self.num_shots] + query_id = pair_ids[-1] + + sample_obj_list = [] + for sample_id in sample_id_list: + sample_obj = copy.deepcopy(self.db[sample_id]) + sample_obj['ann_info'] = copy.deepcopy(self.ann_info) + sample_obj_list.append(sample_obj) + + query_obj = copy.deepcopy(self.db[query_id]) + query_obj['ann_info'] = copy.deepcopy(self.ann_info) + + if not self.test_mode: + # randomly select "one" keypoint + sample_valid = (sample_obj_list[0]['joints_3d_visible'][:, 0] > 0) + for sample_obj in sample_obj_list: + sample_valid = sample_valid & (sample_obj['joints_3d_visible'][:, 0] > 0) + query_valid = (query_obj['joints_3d_visible'][:, 0] > 0) + + valid_s = np.where(sample_valid)[0] + valid_q = np.where(query_valid)[0] + valid_sq = np.where(sample_valid & query_valid)[0] + if len(valid_sq) > 0: + kpt_id = 
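The PCK reported above normalizes keypoint error by the longest side of the query bbox, evaluates each query separately, and averages. A self-contained NumPy sketch of that per-sample PCK (illustrating the metric, not the mmpose `keypoint_pck_accuracy` call used in the code):

```python
import numpy as np

def pck_single(pred, gt, mask, bbox_wh, thr=0.2):
    """Fraction of visible keypoints with error < thr * max(bbox side).

    pred, gt: (K, 2) arrays; mask: (K,) bool; bbox_wh: (w, h) of the query bbox.
    """
    norm = max(bbox_wh)          # same normalizer as threshold_bbox above
    if not mask.any():
        return 0.0
    dist = np.linalg.norm(pred - gt, axis=1)
    return float(np.mean(dist[mask] < thr * norm))

pred = np.array([[10.0, 10.0], [50.0, 52.0]])
gt = np.array([[12.0, 11.0], [50.0, 50.0]])
mask = np.array([True, True])
print(pck_single(pred, gt, mask, bbox_wh=(100, 80)))  # -> 1.0
```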
np.random.choice(valid_sq) + elif len(valid_s) > 0: + kpt_id = np.random.choice(valid_s) + elif len(valid_q) > 0: + kpt_id = np.random.choice(valid_q) + else: + kpt_id = np.random.choice(np.array(range(len(query_valid)))) + + for i in range(self.num_shots): + sample_obj_list[i] = self._select_kpt(sample_obj_list[i], kpt_id) + query_obj = self._select_kpt(query_obj, kpt_id) + + # when test, all keypoints will be preserved. + + Xs_list = [] + for sample_obj in sample_obj_list: + Xs = self.pipeline(sample_obj) + Xs_list.append(Xs) + Xq = self.pipeline(query_obj) + + Xall = self._merge_obj(Xs_list, Xq, idx) + Xall['skeleton'] = self.db[query_id]['skeleton'] + + return Xall + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/EdgeCape/datasets/datasets/mp100/fewshot_dataset.py b/EdgeCape/datasets/datasets/mp100/fewshot_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..7bf805f4909d625f6600531c8e07180ea41aa28e --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/fewshot_dataset.py @@ -0,0 +1,312 @@ +from mmpose.datasets import DATASETS +import random +import numpy as np +import os +from collections import OrderedDict +from xtcocotools.coco import COCO +from .fewshot_base_dataset import FewShotBaseDataset + +@DATASETS.register_module() +class FewShotKeypointDataset(FewShotBaseDataset): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + valid_class_ids, + num_shots = 1, + num_queries = 100, + num_episodes = 1, + test_mode=False): + super().__init__( + ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode) + + self.ann_info['flip_pairs'] = [] + + self.ann_info['upper_body_ids'] = [] + self.ann_info['lower_body_ids'] = [] + + self.ann_info['use_different_joint_weights'] = False + self.ann_info['joint_weights'] = np.array([1.,], + dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) + + self.coco = COCO(ann_file) + + self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) + self.img_ids = self.coco.getImgIds() + self.classes = [ + cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + + self.num_classes = len(self.classes) + self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds())) + self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes)) + + if valid_class_ids is not None: + self.valid_class_ids = valid_class_ids + else: + self.valid_class_ids = self.coco.getCatIds() + self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids] + + self.cats = self.coco.cats + + # Also update self.cat2obj + self.db = self._get_db() + + self.num_shots = num_shots + + if not test_mode: + # Update every training epoch + self.random_paired_samples() + else: + self.num_queries = num_queries + self.num_episodes = num_episodes + self.make_paired_samples() + + + def random_paired_samples(self): + num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes] + + # balance the dataset + max_num_data = max(num_datas) + + all_samples = [] + for cls in self.valid_class_ids: + for i in range(max_num_data): + shot = random.sample(self.cat2obj[cls], self.num_shots + 1) + all_samples.append(shot) + + self.paired_samples = np.array(all_samples) + np.random.shuffle(self.paired_samples) + + def make_paired_samples(self): + random.seed(1) + 
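During training, `__getitem__` restricts every support sample and the query to a single keypoint, preferring an id that is visible in both. A sketch of just that selection logic over boolean visibility vectors (function and variable names are illustrative):

```python
import numpy as np

def select_training_kpt(support_visible, query_visible, rng=np.random):
    """Pick one keypoint id, preferring ids visible in support and query.

    support_visible: (num_shots, K) bool; query_visible: (K,) bool.
    """
    sample_valid = support_visible.all(axis=0)
    both = np.where(sample_valid & query_visible)[0]
    if len(both) > 0:
        return int(rng.choice(both))
    for valid in (np.where(sample_valid)[0], np.where(query_visible)[0]):
        if len(valid) > 0:
            return int(rng.choice(valid))
    return int(rng.choice(len(query_visible)))  # nothing visible: any id

kpt = select_training_kpt(
    np.array([[True, False, True]]), np.array([False, True, True]))
assert kpt == 2  # the only id visible in both support and query
```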
np.random.seed(0) + + all_samples = [] + for cls in self.valid_class_ids: + for _ in range(self.num_episodes): + shots = random.sample(self.cat2obj[cls], self.num_shots + self.num_queries) + sample_ids = shots[:self.num_shots] + query_ids = shots[self.num_shots:] + for query_id in query_ids: + all_samples.append(sample_ids + [query_id]) + + self.paired_samples = np.array(all_samples) + + def _select_kpt(self, obj, kpt_id): + obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id+1] + obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id+1] + obj['kpt_id'] = kpt_id + + return obj + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _get_db(self): + """Ground truth bbox and keypoints.""" + self.obj_id = 0 + + self.cat2obj = {} + for i in self.coco.getCatIds(): + self.cat2obj.update({i: []}) + + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. + + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + + category_id = obj['category_id'] + # the number of keypoint for this specific category + cat_kpt_num = int(len(obj['keypoints']) / 3) + + joints_3d = np.zeros((cat_kpt_num, 3), dtype=np.float32) + joints_3d_visible = np.zeros((cat_kpt_num, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:, :2] = keypoints[:, :2] + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + + self.cat2obj[category_id].append(self.obj_id) + + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': obj['clean_bbox'][:4], + 'bbox_score': 1, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'category_id': category_id, + 'cat_kpt_num': cat_kpt_num, + 'bbox_id': self.obj_id, + 'skeleton': self.coco.cats[obj['category_id']]['skeleton'], + }) + bbox_id = bbox_id + 1 + self.obj_id += 1 + + return rec + + def _xywh2cs(self, x, y, w, h): + """This encodes bbox(x,y,w,w) into (center, scale) + + Args: + x, y, w, h + + Returns: + tuple: A tuple containing center and scale. 
+ + - center (np.ndarray[float32](2,)): center of the bbox (x, y). + - scale (np.ndarray[float32](2,)): scale of the bbox w & h. + """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + # + # if (not self.test_mode) and np.random.rand() < 0.3: + # center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * 1.25 + + return center, scale + + def evaluate(self, outputs, res_folder, metric='PCK', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + outputs (list(preds, boxes, image_path, output_heatmap)) + :preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. + :boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + :image_paths (list[str]): For example, ['C', 'a', 'p', 't', + 'u', 'r', 'e', '1', '2', '/', '0', '3', '9', '0', '_', + 'd', 'h', '_', 't', 'o', 'u', 'c', 'h', 'R', 'O', 'M', + '/', 'c', 'a', 'm', '4', '1', '0', '2', '0', '9', '/', + 'i', 'm', 'a', 'g', 'e', '6', '2', '4', '3', '4', '.', + 'j', 'p', 'g'] + :output_heatmap (np.ndarray[N, K, H, W]): model outpus. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + + kpts = [] + for output in outputs: + preds = output['preds'] + boxes = output['boxes'] + image_paths = output['image_paths'] + bbox_ids = output['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + return name_value diff --git a/EdgeCape/datasets/datasets/mp100/test_base_dataset.py b/EdgeCape/datasets/datasets/mp100/test_base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3b4f4214264bba072e6b5bddb53ed792cda10af1 --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/test_base_dataset.py @@ -0,0 +1,226 @@ +import copy +from abc import ABCMeta, abstractmethod +import json_tricks as json +import numpy as np + +from mmcv.parallel import DataContainer as DC +from mmpose.core.evaluation.top_down_eval import (keypoint_auc, keypoint_epe, keypoint_nme, + keypoint_pck_accuracy) +from torch.utils.data import Dataset +from mmpose.datasets import DATASETS +from mmpose.datasets.pipelines import Compose + +@DATASETS.register_module() +class TestBaseDataset(Dataset, metaclass=ABCMeta): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + test_mode=True, + PCK_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25]): + self.image_info = {} + self.ann_info = {} + + self.annotations_path = ann_file + if not img_prefix.endswith('/'): + img_prefix = img_prefix + '/' + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + self.PCK_threshold_list = PCK_threshold_list + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['flip_pairs'] = None + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.db = [] + self.num_shots = 1 + self.paired_samples = [] + self.pipeline = Compose(self.pipeline) + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def _select_kpt(self, obj, kpt_id): + """Select kpt.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + """Evaluate keypoint results.""" + raise NotImplementedError + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, + res_file, + metrics): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. 
+ Options: 'PCK', 'PCKh', 'AUC', 'EPE'. + pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. + """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.paired_samples) + + outputs = [] + gts = [] + masks = [] + threshold_bbox = [] + threshold_head_box = [] + + for pred, pair in zip(preds, self.paired_samples): + item = self.db[pair[-1]] + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + + mask_query = ((np.array(item['joints_3d_visible'])[:, 0]) > 0) + mask_sample = ((np.array(self.db[pair[0]]['joints_3d_visible'])[:, 0]) > 0) + for id_s in pair[:-1]: + mask_sample = np.bitwise_and(mask_sample, ((np.array(self.db[id_s]['joints_3d_visible'])[:, 0]) > 0)) + masks.append(np.bitwise_and(mask_query, mask_sample)) + + if 'PCK' in metrics or 'NME' in metrics or 'AUC' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + if 'PCKh' in metrics: + head_box_thr = item['head_size'] + threshold_head_box.append( + np.array([head_box_thr, head_box_thr])) + + if 'PCK' in metrics: + pck_results = dict() + for pck_thr in self.PCK_threshold_list: + pck_results[pck_thr] = [] + + for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox): + for pck_thr in self.PCK_threshold_list: + _, pck, _ = keypoint_pck_accuracy(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0), pck_thr, np.expand_dims(thr_bbox,0)) + pck_results[pck_thr].append(pck) + + mPCK = 0 + for pck_thr in self.PCK_threshold_list: + info_str.append(['PCK@' + str(pck_thr), np.mean(pck_results[pck_thr])]) + mPCK += np.mean(pck_results[pck_thr]) + info_str.append(['mPCK', mPCK / len(self.PCK_threshold_list)]) + + if 'NME' in metrics: + nme_results = [] + for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox): + nme = keypoint_nme(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0), np.expand_dims(thr_bbox,0)) + nme_results.append(nme) + info_str.append(['NME', np.mean(nme_results)]) + + if 'AUC' in metrics: + auc_results = [] + for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox): + auc = keypoint_auc(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0), thr_bbox[0]) + auc_results.append(auc) + info_str.append(['AUC', np.mean(auc_results)]) + + if 'EPE' in metrics: + epe_results = [] + for (output, gt, mask) in zip(outputs, gts, masks): + epe = keypoint_epe(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0)) + epe_results.append(epe) + info_str.append(['EPE', np.mean(epe_results)]) + return info_str + + def _merge_obj(self, Xs_list, Xq, idx): + """ merge Xs_list and Xq. 
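At test time, PCK is reported at every threshold in `PCK_threshold_list` and their mean is logged as mPCK. A small sketch of that aggregation, given per-query PCK values already computed for each threshold:

```python
import numpy as np

def summarize_pck(per_sample_pck, thresholds=(0.05, 0.1, 0.15, 0.2, 0.25)):
    """per_sample_pck: dict mapping threshold -> list of per-query PCK values."""
    info = []
    mpck = 0.0
    for thr in thresholds:
        mean_pck = float(np.mean(per_sample_pck[thr]))
        info.append(('PCK@' + str(thr), mean_pck))
        mpck += mean_pck
    info.append(('mPCK', mpck / len(thresholds)))
    return info

scores = {thr: [0.8, 1.0, 0.9] for thr in (0.05, 0.1, 0.15, 0.2, 0.25)}
print(summarize_pck(scores))
```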
+ + :param Xs_list: N-shot samples X + :param Xq: query X + :param idx: id of paired_samples + :return: Xall + """ + Xall = dict() + Xall['img_s'] = [Xs['img'] for Xs in Xs_list] + Xall['target_s'] = [Xs['target'] for Xs in Xs_list] + Xall['target_weight_s'] = [Xs['target_weight'] for Xs in Xs_list] + xs_img_metas = [Xs['img_metas'].data for Xs in Xs_list] + + Xall['img_q'] = Xq['img'] + Xall['target_q'] = Xq['target'] + Xall['target_weight_q'] = Xq['target_weight'] + xq_img_metas = Xq['img_metas'].data + + img_metas = dict() + for key in xq_img_metas.keys(): + img_metas['sample_' + key] = [xs_img_meta[key] for xs_img_meta in xs_img_metas] + img_metas['query_' + key] = xq_img_metas[key] + img_metas['bbox_id'] = idx + + Xall['img_metas'] = DC(img_metas, cpu_only=True) + + return Xall + + def __len__(self): + """Get the size of the dataset.""" + return len(self.paired_samples) + + def __getitem__(self, idx): + """Get the sample given index.""" + + pair_ids = self.paired_samples[idx] # [supported id * shots, query id] + assert len(pair_ids) == self.num_shots + 1 + sample_id_list = pair_ids[:self.num_shots] + query_id = pair_ids[-1] + + sample_obj_list = [] + for sample_id in sample_id_list: + sample_obj = copy.deepcopy(self.db[sample_id]) + sample_obj['ann_info'] = copy.deepcopy(self.ann_info) + sample_obj_list.append(sample_obj) + + query_obj = copy.deepcopy(self.db[query_id]) + query_obj['ann_info'] = copy.deepcopy(self.ann_info) + + Xs_list = [] + for sample_obj in sample_obj_list: + Xs = self.pipeline(sample_obj) # dict with ['img', 'target', 'target_weight', 'img_metas'], + Xs_list.append(Xs) # Xs['target'] is of shape [100, map_h, map_w] + Xq = self.pipeline(query_obj) + + Xall = self._merge_obj(Xs_list, Xq, idx) + Xall['skeleton'] = self.db[query_id]['skeleton'] + + return Xall + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/EdgeCape/datasets/datasets/mp100/test_dataset.py b/EdgeCape/datasets/datasets/mp100/test_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..aa264d2e1edddcce66f13f892e2a303e552ed8cc --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/test_dataset.py @@ -0,0 +1,319 @@ +from mmpose.datasets import DATASETS +import random +import numpy as np +import os +from collections import OrderedDict +from xtcocotools.coco import COCO +from .test_base_dataset import TestBaseDataset + +@DATASETS.register_module() +class TestPoseDataset(TestBaseDataset): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + valid_class_ids, + max_kpt_num=None, + num_shots=1, + num_queries=100, + num_episodes=1, + pck_threshold_list=[0.05, 0.1, 0.15, 0.20, 0.25], + test_mode=True): + super().__init__( + ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode, PCK_threshold_list=pck_threshold_list) + + self.ann_info['flip_pairs'] = [] + + self.ann_info['upper_body_ids'] = [] + self.ann_info['lower_body_ids'] = [] + + self.ann_info['use_different_joint_weights'] = False + self.ann_info['joint_weights'] = np.array([1.,], + dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) + + self.coco = COCO(ann_file) + + self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) + self.img_ids = self.coco.getImgIds() + self.classes = [ + cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + 
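`_sort_and_unique_bboxes` sorts the collected predictions by `bbox_id` and drops the duplicates introduced by dataloader padding. An equivalent sketch that builds the deduplicated list instead of deleting in place:

```python
def sort_and_unique(kpts, key='bbox_id'):
    """Sort predictions by `key` and keep the first entry per id."""
    kpts = sorted(kpts, key=lambda x: x[key])
    deduped = []
    for item in kpts:
        if not deduped or deduped[-1][key] != item[key]:
            deduped.append(item)
    return deduped

preds = [{'bbox_id': 2}, {'bbox_id': 1}, {'bbox_id': 2}]
assert [p['bbox_id'] for p in sort_and_unique(preds)] == [1, 2]
```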
+ self.num_classes = len(self.classes) + self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds())) + self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes)) + + if valid_class_ids is not None: # None by default + self.valid_class_ids = valid_class_ids + else: + self.valid_class_ids = self.coco.getCatIds() + self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids] + + self.cats = self.coco.cats + self.max_kpt_num = max_kpt_num + + # Also update self.cat2obj + self.db = self._get_db() + + self.num_shots = num_shots + + if not test_mode: + # Update every training epoch + self.random_paired_samples() + else: + self.num_queries = num_queries + self.num_episodes = num_episodes + self.make_paired_samples() + + + def random_paired_samples(self): + num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes] + + # balance the dataset + max_num_data = max(num_datas) + + all_samples = [] + for cls in self.valid_class_ids: + for i in range(max_num_data): + shot = random.sample(self.cat2obj[cls], self.num_shots + 1) + all_samples.append(shot) + + self.paired_samples = np.array(all_samples) + np.random.shuffle(self.paired_samples) + + def make_paired_samples(self): + random.seed(1) + np.random.seed(0) + + all_samples = [] + for cls in self.valid_class_ids: + for _ in range(self.num_episodes): + shots = random.sample(self.cat2obj[cls], self.num_shots + self.num_queries) + sample_ids = shots[:self.num_shots] + query_ids = shots[self.num_shots:] + for query_id in query_ids: + all_samples.append(sample_ids + [query_id]) + + self.paired_samples = np.array(all_samples) + + def _select_kpt(self, obj, kpt_id): + obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id+1] + obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id+1] + obj['kpt_id'] = kpt_id + + return obj + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _get_db(self): + """Ground truth bbox and keypoints.""" + self.obj_id = 0 + + self.cat2obj = {} + for i in self.coco.getCatIds(): + self.cat2obj.update({i: []}) + + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
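As a quick illustration of the episode pairing built by make_paired_samples above (each query in an episode shares the same support ids), the following standalone sketch uses an invented cat2obj mapping and small counts; the ids and sizes are purely illustrative.

import random
import numpy as np

# Hypothetical category -> db-index mapping; real values come from _get_db / cat2obj above.
cat2obj = {1: list(range(10)), 2: list(range(10, 25))}
num_shots, num_queries, num_episodes = 1, 3, 2

random.seed(1)
all_samples = []
for cls, obj_ids in cat2obj.items():
    for _ in range(num_episodes):
        shots = random.sample(obj_ids, num_shots + num_queries)
        sample_ids, query_ids = shots[:num_shots], shots[num_shots:]
        # every query in the episode is paired with the same support set
        for q in query_ids:
            all_samples.append(sample_ids + [q])

paired_samples = np.array(all_samples)
# each row is [support_id_1, ..., support_id_num_shots, query_id]
print(paired_samples.shape)  # (num_classes * num_episodes * num_queries, num_shots + 1) -> (12, 2)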
+ + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + + category_id = obj['category_id'] + # the number of keypoint for this specific category + cat_kpt_num = int(len(obj['keypoints']) / 3) + if self.max_kpt_num is None: + kpt_num = cat_kpt_num + else: + kpt_num = self.max_kpt_num + + joints_3d = np.zeros((kpt_num, 3), dtype=np.float32) + joints_3d_visible = np.zeros((kpt_num, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:cat_kpt_num, :2] = keypoints[:, :2] + joints_3d_visible[:cat_kpt_num, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + + self.cat2obj[category_id].append(self.obj_id) + + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': obj['clean_bbox'][:4], + 'bbox_score': 1, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'category_id': category_id, + 'cat_kpt_num': cat_kpt_num, + 'bbox_id': self.obj_id, + 'skeleton': self.coco.cats[obj['category_id']]['skeleton'], + }) + bbox_id = bbox_id + 1 + self.obj_id += 1 + + return rec + + def _xywh2cs(self, x, y, w, h): + """This encodes bbox(x,y,w,w) into (center, scale) + + Args: + x, y, w, h + + Returns: + tuple: A tuple containing center and scale. + + - center (np.ndarray[float32](2,)): center of the bbox (x, y). + - scale (np.ndarray[float32](2,)): scale of the bbox w & h. + """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + # + # if (not self.test_mode) and np.random.rand() < 0.3: + # center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * 1.25 + + return center, scale + + def evaluate(self, outputs, res_folder, metric='PCK', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + outputs (list(preds, boxes, image_path, output_heatmap)) + :preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. 
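For reference, the bbox -> (center, scale) conversion performed by _xywh2cs above can be traced with a small standalone function; the aspect ratio of 1.0 (e.g. a 256x256 input), pixel_std of 200 and 1.25 padding mirror the code, while the bbox values are illustrative.

import numpy as np

def xywh2cs_sketch(x, y, w, h, aspect_ratio=1.0, padding=1.25, pixel_std=200.0):
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
    # pad the shorter side so the box matches the model input aspect ratio
    if w > aspect_ratio * h:
        h = w / aspect_ratio
    elif w < aspect_ratio * h:
        w = h * aspect_ratio
    scale = np.array([w / pixel_std, h / pixel_std], dtype=np.float32) * padding
    return center, scale

center, scale = xywh2cs_sketch(10, 20, 100, 50)
print(center)  # [60. 45.]
print(scale)   # the 100x50 box becomes 100x100, then / 200 and * 1.25 -> [0.625 0.625]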
+ :boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + :image_paths (list[str]): For example, ['C', 'a', 'p', 't', + 'u', 'r', 'e', '1', '2', '/', '0', '3', '9', '0', '_', + 'd', 'h', '_', 't', 'o', 'u', 'c', 'h', 'R', 'O', 'M', + '/', 'c', 'a', 'm', '4', '1', '0', '2', '0', '9', '/', + 'i', 'm', 'a', 'g', 'e', '6', '2', '4', '3', '4', '.', + 'j', 'p', 'g'] + :output_heatmap (np.ndarray[N, K, H, W]): model outpus. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE', 'NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + + kpts = [] + for output in outputs: + preds = output['preds'] + boxes = output['boxes'] + image_paths = output['image_paths'] + bbox_ids = output['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + return name_value diff --git a/EdgeCape/datasets/datasets/mp100/transformer_base_dataset.py b/EdgeCape/datasets/datasets/mp100/transformer_base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..bd6b343e56c6096e6fa35c960f2596811cc2ac32 --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/transformer_base_dataset.py @@ -0,0 +1,209 @@ +import copy +from abc import ABCMeta, abstractmethod +import json_tricks as json +import numpy as np + +from mmcv.parallel import DataContainer as DC +from mmpose.core.evaluation.top_down_eval import (keypoint_auc, keypoint_epe, + keypoint_pck_accuracy) +from torch.utils.data import Dataset +from mmpose.datasets import DATASETS +from mmpose.datasets.pipelines import Compose + +@DATASETS.register_module() +class TransformerBaseDataset(Dataset, metaclass=ABCMeta): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + masking_ratio=0.3, + test_mode=False): + self.image_info = {} + self.ann_info = {} + + self.annotations_path = ann_file + if not img_prefix.endswith('/'): + img_prefix = img_prefix + '/' + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + self.masking_ratio = masking_ratio + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['flip_pairs'] = None + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + self.db = [] + self.num_shots = 1 + self.paired_samples = [] + self.pipeline = Compose(self.pipeline) + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def 
_select_kpt(self, obj, kpt_id): + """Select kpt.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, cfg, preds, output_dir, *args, **kwargs): + """Evaluate keypoint results.""" + raise NotImplementedError + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, + res_file, + metrics, + pck_thr=0.2, + pckh_thr=0.7, + auc_nor=30): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'PCK', 'PCKh', 'AUC', 'EPE'. + pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. + """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.paired_samples) + + outputs = [] + gts = [] + masks = [] + threshold_bbox = [] + threshold_head_box = [] + + for pred, pair in zip(preds, self.paired_samples): + item = self.db[pair[-1]] + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + + mask_query = ((np.array(item['joints_3d_visible'])[:, 0]) > 0) + mask_sample = ((np.array(self.db[pair[0]]['joints_3d_visible'])[:, 0]) > 0) + for id_s in pair[:-1]: + mask_sample = np.bitwise_and(mask_sample, ((np.array(self.db[id_s]['joints_3d_visible'])[:, 0]) > 0)) + masks.append(np.bitwise_and(mask_query, mask_sample)) + + if 'PCK' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + if 'PCKh' in metrics: + head_box_thr = item['head_size'] + threshold_head_box.append( + np.array([head_box_thr, head_box_thr])) + + if 'PCK' in metrics: + pck_avg = [] + for (output, gt, mask, thr_bbox) in zip(outputs, gts, masks, threshold_bbox): + _, pck, _ = keypoint_pck_accuracy(np.expand_dims(output, 0), np.expand_dims(gt,0), np.expand_dims(mask,0), pck_thr, np.expand_dims(thr_bbox,0)) + pck_avg.append(pck) + info_str.append(('PCK', np.mean(pck_avg))) + + return info_str + + def _merge_obj(self, Xs_list, Xq, idx): + """ merge Xs_list and Xq. 
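The PCK computed in _report_metric above normalizes keypoint errors by the longest bbox side; a simplified plain-numpy sketch of that criterion is shown below (the real code delegates to mmpose's keypoint_pck_accuracy, and the coordinates here are invented).

import numpy as np

def pck_sketch(pred, gt, visible_mask, bbox_wh, pck_thr=0.2):
    norm = pck_thr * max(bbox_wh)              # pixel threshold derived from the bbox
    dist = np.linalg.norm(pred - gt, axis=-1)  # (K,) per-keypoint error
    valid = visible_mask.astype(bool)
    return float((dist[valid] < norm).mean())

pred = np.array([[10., 10.], [52., 40.], [90., 95.]])
gt = np.array([[12., 11.], [50., 42.], [60., 60.]])
mask = np.array([1, 1, 1])
print(pck_sketch(pred, gt, mask, bbox_wh=(100, 80)))  # 2 of 3 keypoints within 20 px -> 0.666...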
+ + :param Xs_list: N-shot samples X + :param Xq: query X + :param idx: id of paired_samples + :return: Xall + """ + Xall = dict() + Xall['img_s'] = [Xs['img'] for Xs in Xs_list] + Xall['target_s'] = [Xs['target'] for Xs in Xs_list] + Xall['target_weight_s'] = [Xs['target_weight'] for Xs in Xs_list] + xs_img_metas = [Xs['img_metas'].data for Xs in Xs_list] + + Xall['img_q'] = Xq['img'] + Xall['target_q'] = Xq['target'] + Xall['target_weight_q'] = Xq['target_weight'] + xq_img_metas = Xq['img_metas'].data + + img_metas = dict() + for key in xq_img_metas.keys(): + img_metas['sample_' + key] = [xs_img_meta[key] for xs_img_meta in xs_img_metas] + img_metas['query_' + key] = xq_img_metas[key] + img_metas['bbox_id'] = idx + + Xall['img_metas'] = DC(img_metas, cpu_only=True) + + return Xall + + def __len__(self): + """Get the size of the dataset.""" + return len(self.paired_samples) + + def __getitem__(self, idx): + """Get the sample given index.""" + + pair_ids = self.paired_samples[idx] # [supported id * shots, query id] + assert len(pair_ids) == self.num_shots + 1 + sample_id_list = pair_ids[:self.num_shots] + query_id = pair_ids[-1] + + sample_obj_list = [] + for sample_id in sample_id_list: + sample_obj = copy.deepcopy(self.db[sample_id]) + sample_obj['ann_info'] = copy.deepcopy(self.ann_info) + sample_obj_list.append(sample_obj) + + query_obj = copy.deepcopy(self.db[query_id]) + query_obj['ann_info'] = copy.deepcopy(self.ann_info) + + Xs_list = [] + for sample_obj in sample_obj_list: + Xs = self.pipeline(sample_obj) # dict with ['img', 'target', 'target_weight', 'img_metas'], + Xs_list.append(Xs) # Xs['target'] is of shape [100, map_h, map_w] + Xq = self.pipeline(query_obj) + + Xall = self._merge_obj(Xs_list, Xq, idx) + Xall['skeleton'] = self.db[query_id]['skeleton'] + Xall['rand_mask'] = self.rand_mask(Xall['target_weight_s']) + return Xall + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts + + def rand_mask(self, target_weight_s): + mask_s = target_weight_s[0] + for target_weight in target_weight_s: + mask_s = mask_s * target_weight + num_to_mask = int(np.sum(mask_s) * self.masking_ratio) + true_indices = np.where(mask_s == 1)[0] + rand_mask = np.random.permutation(true_indices)[:num_to_mask] + mask_s[rand_mask] = 0 + return mask_s diff --git a/EdgeCape/datasets/datasets/mp100/transformer_dataset.py b/EdgeCape/datasets/datasets/mp100/transformer_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4bb584a7cf914e1f0dc2b37e74b3fe546c5afd04 --- /dev/null +++ b/EdgeCape/datasets/datasets/mp100/transformer_dataset.py @@ -0,0 +1,319 @@ +from mmpose.datasets import DATASETS +import random +import numpy as np +import os +from collections import OrderedDict +from xtcocotools.coco import COCO +from .transformer_base_dataset import TransformerBaseDataset + +@DATASETS.register_module() +class TransformerPoseDataset(TransformerBaseDataset): + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + valid_class_ids, + max_kpt_num=None, + num_shots=1, + num_queries=100, + num_episodes=1, + test_mode=False): + super().__init__( + ann_file, img_prefix, data_cfg, pipeline, test_mode=test_mode) + + self.ann_info['flip_pairs'] = [] + + self.ann_info['upper_body_ids'] = [] + self.ann_info['lower_body_ids'] = [] + + 
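The rand_mask method above keeps only keypoints visible in every support and then zeroes a masking_ratio fraction of them; a minimal sketch with made-up target weights (shape [num_keypoints, 1], as produced by the target generator) is given here.

import numpy as np

masking_ratio = 0.3
tw1 = np.ones((10, 1), dtype=np.float32); tw1[[3, 8]] = 0   # invisible in support 1
tw2 = np.ones((10, 1), dtype=np.float32); tw2[5] = 0        # invisible in support 2
target_weight_s = [tw1, tw2]

mask_s = target_weight_s[0].copy()
for tw in target_weight_s[1:]:
    mask_s = mask_s * tw                       # keep only jointly visible keypoints (7 here)
num_to_mask = int(mask_s.sum() * masking_ratio)             # int(7 * 0.3) -> 2
visible = np.where(mask_s == 1)[0]
mask_s[np.random.permutation(visible)[:num_to_mask]] = 0
print(mask_s.ravel())  # two of the seven jointly visible keypoints are additionally zeroed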
self.ann_info['use_different_joint_weights'] = False + self.ann_info['joint_weights'] = np.array([1.,], + dtype=np.float32).reshape((self.ann_info['num_joints'], 1)) + + self.coco = COCO(ann_file) + + self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) + self.img_ids = self.coco.getImgIds() + self.classes = [ + cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + + self.num_classes = len(self.classes) + self._class_to_ind = dict(zip(self.classes, self.coco.getCatIds())) + self._ind_to_class = dict(zip(self.coco.getCatIds(), self.classes)) + + if valid_class_ids is not None: # None by default + self.valid_class_ids = valid_class_ids + else: + self.valid_class_ids = self.coco.getCatIds() + self.valid_classes = [self._ind_to_class[ind] for ind in self.valid_class_ids] + + self.cats = self.coco.cats + self.max_kpt_num = max_kpt_num + + # Also update self.cat2obj + self.db = self._get_db() + + self.num_shots = num_shots + + if not test_mode: + # Update every training epoch + self.random_paired_samples() + else: + self.num_queries = num_queries + self.num_episodes = num_episodes + self.make_paired_samples() + + + def random_paired_samples(self): + num_datas = [len(self.cat2obj[self._class_to_ind[cls]]) for cls in self.valid_classes] + + # balance the dataset + max_num_data = max(num_datas) + + all_samples = [] + for cls in self.valid_class_ids: + for i in range(max_num_data): + shot = random.sample(self.cat2obj[cls], self.num_shots + 1) + all_samples.append(shot) + + self.paired_samples = np.array(all_samples) + np.random.shuffle(self.paired_samples) + + def make_paired_samples(self): + random.seed(1) + np.random.seed(0) + + all_samples = [] + for cls in self.valid_class_ids: + for _ in range(self.num_episodes): + shots = random.sample(self.cat2obj[cls], self.num_shots + self.num_queries) + sample_ids = shots[:self.num_shots] + query_ids = shots[self.num_shots:] + for query_id in query_ids: + all_samples.append(sample_ids + [query_id]) + + self.paired_samples = np.array(all_samples) + + def _select_kpt(self, obj, kpt_id): + obj['joints_3d'] = obj['joints_3d'][kpt_id:kpt_id+1] + obj['joints_3d_visible'] = obj['joints_3d_visible'][kpt_id:kpt_id+1] + obj['kpt_id'] = kpt_id + + return obj + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _get_db(self): + """Ground truth bbox and keypoints.""" + self.obj_id = 0 + + self.cat2obj = {} + for i in self.coco.getCatIds(): + self.cat2obj.update({i: []}) + + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
+ + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + + category_id = obj['category_id'] + # the number of keypoint for this specific category + cat_kpt_num = int(len(obj['keypoints']) / 3) + if self.max_kpt_num is None: + kpt_num = cat_kpt_num + else: + kpt_num = self.max_kpt_num + + joints_3d = np.zeros((kpt_num, 3), dtype=np.float32) + joints_3d_visible = np.zeros((kpt_num, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + joints_3d[:cat_kpt_num, :2] = keypoints[:, :2] + joints_3d_visible[:cat_kpt_num, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + if os.path.exists(image_file): + self.cat2obj[category_id].append(self.obj_id) + + rec.append({ + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'rotation': 0, + 'bbox': obj['clean_bbox'][:4], + 'bbox_score': 1, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'category_id': category_id, + 'cat_kpt_num': cat_kpt_num, + 'bbox_id': self.obj_id, + 'skeleton': self.coco.cats[obj['category_id']]['skeleton'], + }) + bbox_id = bbox_id + 1 + self.obj_id += 1 + + return rec + + def _xywh2cs(self, x, y, w, h): + """This encodes bbox(x,y,w,w) into (center, scale) + + Args: + x, y, w, h + + Returns: + tuple: A tuple containing center and scale. + + - center (np.ndarray[float32](2,)): center of the bbox (x, y). + - scale (np.ndarray[float32](2,)): scale of the bbox w & h. + """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info['image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + # + # if (not self.test_mode) and np.random.rand() < 0.3: + # center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * 1.25 + + return center, scale + + def evaluate(self, outputs, res_folder, metric='PCK', **kwargs): + """Evaluate interhand2d keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + outputs (list(preds, boxes, image_path, output_heatmap)) + :preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. 
+ :boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + :image_paths (list[str]): For example, ['C', 'a', 'p', 't', + 'u', 'r', 'e', '1', '2', '/', '0', '3', '9', '0', '_', + 'd', 'h', '_', 't', 'o', 'u', 'c', 'h', 'R', 'O', 'M', + '/', 'c', 'a', 'm', '4', '1', '0', '2', '0', '9', '/', + 'i', 'm', 'a', 'g', 'e', '6', '2', '4', '3', '4', '.', + 'j', 'p', 'g'] + :output_heatmap (np.ndarray[N, K, H, W]): model outpus. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'AUC', 'EPE'. + + Returns: + dict: Evaluation results for evaluation metric. + """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'AUC', 'EPE', 'NME'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + + kpts = [] + for output in outputs: + preds = output['preds'] + boxes = output['boxes'] + image_paths = output['image_paths'] + bbox_ids = output['bbox_ids'] + + batch_size = len(image_paths) + for i in range(batch_size): + image_id = self.name2id[image_paths[i][len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i].tolist(), + 'center': boxes[i][0:2].tolist(), + 'scale': boxes[i][2:4].tolist(), + 'area': float(boxes[i][4]), + 'score': float(boxes[i][5]), + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + return name_value diff --git a/EdgeCape/datasets/pipelines/__init__.py b/EdgeCape/datasets/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9f4429140785a3e72c0ac6f63dd99a041b9bd92d --- /dev/null +++ b/EdgeCape/datasets/pipelines/__init__.py @@ -0,0 +1,8 @@ +from .top_down_transform import (TopDownAffineFewShot, + TopDownGenerateTargetFewShot, + LoadDepthFromFile, + DepthTopDownAffineFewShot) + +__all__ = [ + 'TopDownGenerateTargetFewShot', 'TopDownAffineFewShot', 'LoadDepthFromFile', 'DepthTopDownAffineFewShot', +] diff --git a/EdgeCape/datasets/pipelines/__pycache__/__init__.cpython-39.pyc b/EdgeCape/datasets/pipelines/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..948825ac9faf17123a86e341cc2be39dd6e4de48 Binary files /dev/null and b/EdgeCape/datasets/pipelines/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/datasets/pipelines/__pycache__/post_transforms.cpython-39.pyc b/EdgeCape/datasets/pipelines/__pycache__/post_transforms.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ce6763358a558d6439cb3ba57fc73a35334a7f6 Binary files /dev/null and b/EdgeCape/datasets/pipelines/__pycache__/post_transforms.cpython-39.pyc differ diff --git a/EdgeCape/datasets/pipelines/__pycache__/top_down_transform.cpython-39.pyc b/EdgeCape/datasets/pipelines/__pycache__/top_down_transform.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d08eaf3032d9c87f0c870510e26b4c0e93788fd Binary files /dev/null and b/EdgeCape/datasets/pipelines/__pycache__/top_down_transform.cpython-39.pyc differ diff --git a/EdgeCape/datasets/pipelines/post_transforms.py b/EdgeCape/datasets/pipelines/post_transforms.py new file mode 100644 index 
0000000000000000000000000000000000000000..a1025daf5cc87ca3d9a3a204a8df05ca8af725fd --- /dev/null +++ b/EdgeCape/datasets/pipelines/post_transforms.py @@ -0,0 +1,121 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import cv2 +import numpy as np + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. + + Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ]): Size of the destination heatmaps. + shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + + # pixel_std is 200. + scale_tmp = scale * 200.0 + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, trans_mat): + """Apply an affine transformation to the points. + + Args: + pt (np.ndarray): a 2 dimensional point to be transformed + trans_mat (np.ndarray): 2x3 matrix of an affine transform + + Returns: + np.ndarray: Transformed points. + """ + assert len(pt) == 2 + new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.]) + + return new_pt + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + + Returns: + np.ndarray: The 3rd point. + """ + assert len(a) == 2 + assert len(b) == 2 + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + + Returns: + list[float]: Rotated point. 
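As a usage sketch for the helpers above, the snippet below builds the crop transform for a bbox-derived center/scale (pixel_std = 200, as in get_affine_transform) and maps one keypoint into a 256x256 network input; the center/scale values themselves are illustrative.

import numpy as np

center = np.array([60., 45.], dtype=np.float32)
scale = np.array([0.625, 0.625], dtype=np.float32)   # i.e. a 125x125 source region
output_size = np.array([256, 256])

trans = get_affine_transform(center, scale, rot=0., output_size=output_size)
kpt_in_image = np.array([60., 45.])
kpt_in_input = affine_transform(kpt_in_image, trans)
print(kpt_in_input)  # the bbox center lands at the crop center: [128. 128.]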
+ """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt diff --git a/EdgeCape/datasets/pipelines/top_down_transform.py b/EdgeCape/datasets/pipelines/top_down_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..147e4160b39a94d568b25187ab90d4be8e7d743c --- /dev/null +++ b/EdgeCape/datasets/pipelines/top_down_transform.py @@ -0,0 +1,716 @@ +import os +import warnings +from pathlib import Path +from typing import Optional + +import cv2 +import mmcv +import numpy as np +from mmcv import fileio + +from mmpose.datasets.builder import PIPELINES +from .post_transforms import (affine_transform, + get_affine_transform) +from mmpose.core.post_processing import (affine_transform, fliplr_joints, + get_affine_transform, get_warp_matrix, + warp_affine_joints) + +@PIPELINES.register_module() +class TopDownAffineFewShot: + """Affine transform the image to make input. + + Required keys:'img', 'joints_3d', 'joints_3d_visible', 'ann_info','scale', + 'rotation' and 'center'. Modified keys:'img', 'joints_3d', and + 'joints_3d_visible'. + + Args: + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, use_udp=False): + self.use_udp = use_udp + + def __call__(self, results): + image_size = results['ann_info']['image_size'] + + img = results['img'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + c = results['center'] + s = results['scale'] + r = results['rotation'] + + if self.use_udp: + trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + joints_3d[:, 0:2] = \ + warp_affine_joints(joints_3d[:, 0:2].copy(), trans) + else: + trans = get_affine_transform(c, s, r, image_size) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + for i in range(len(joints_3d)): + if joints_3d_visible[i, 0] > 0.0: + joints_3d[i, 0:2] = affine_transform(joints_3d[i, 0:2], trans) + + results['img'] = img + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + + return results + + +@PIPELINES.register_module() +class TopDownGenerateTargetFewShot: + """Generate the target heatmap. + + Required keys: 'joints_3d', 'joints_3d_visible', 'ann_info'. + Modified keys: 'target', and 'target_weight'. + + Args: + sigma: Sigma of heatmap gaussian for 'MSRA' approach. + kernel: Kernel of heatmap gaussian for 'Megvii' approach. + encoding (str): Approach to generate target heatmaps. + Currently supported approaches: 'MSRA', 'Megvii', 'UDP'. + Default:'MSRA' + + unbiased_encoding (bool): Option to use unbiased + encoding methods. + Paper ref: Zhang et al. Distribution-Aware Coordinate + Representation for Human Pose Estimation (CVPR 2020). + keypoint_pose_distance: Keypoint pose distance for UDP. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + target_type (str): supported targets: 'GaussianHeatMap', + 'CombinedTarget'. Default:'GaussianHeatMap' + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + Paper ref: Huang et al. 
The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, + sigma=2, + kernel=(11, 11), + valid_radius_factor=0.0546875, + target_type='GaussianHeatMap', + encoding='MSRA', + unbiased_encoding=False): + self.sigma = sigma + self.unbiased_encoding = unbiased_encoding + self.kernel = kernel + self.valid_radius_factor = valid_radius_factor + self.target_type = target_type + self.encoding = encoding + + def _msra_generate_target(self, cfg, joints_3d, joints_3d_visible, sigma): + """Generate the target heatmap via "MSRA" approach. + + Args: + cfg (dict): data config + joints_3d: np.ndarray ([num_joints, 3]) + joints_3d_visible: np.ndarray ([num_joints, 3]) + sigma: Sigma of heatmap gaussian + Returns: + tuple: A tuple containing targets. + + - target: Target heatmaps. + - target_weight: (1: visible, 0: invisible) + """ + num_joints = len(joints_3d) + image_size = cfg['image_size'] + W, H = cfg['heatmap_size'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + assert not use_different_joint_weights + + target_weight = np.zeros((num_joints, 1), dtype=np.float32) + target = np.zeros((num_joints, H, W), dtype=np.float32) + + # 3-sigma rule + tmp_size = sigma * 3 + + if self.unbiased_encoding: + for joint_id in range(num_joints): + target_weight[joint_id] = joints_3d_visible[joint_id, 0] + + feat_stride = image_size / [W, H] + mu_x = joints_3d[joint_id][0] / feat_stride[0] + mu_y = joints_3d[joint_id][1] / feat_stride[1] + # Check that any part of the gaussian is in-bounds + ul = [mu_x - tmp_size, mu_y - tmp_size] + br = [mu_x + tmp_size + 1, mu_y + tmp_size + 1] + if ul[0] >= W or ul[1] >= H or br[0] < 0 or br[1] < 0: + target_weight[joint_id] = 0 + + if target_weight[joint_id] == 0: + continue + + x = np.arange(0, W, 1, np.float32) + y = np.arange(0, H, 1, np.float32) + y = y[:, None] + + if target_weight[joint_id] > 0.5: + target[joint_id] = np.exp(-((x - mu_x)**2 + + (y - mu_y)**2) / + (2 * sigma**2)) + else: + for joint_id in range(num_joints): + target_weight[joint_id] = joints_3d_visible[joint_id, 0] + + feat_stride = image_size / [W, H] + mu_x = int(joints_3d[joint_id][0] / feat_stride[0] + 0.5) + mu_y = int(joints_3d[joint_id][1] / feat_stride[1] + 0.5) + # Check that any part of the gaussian is in-bounds + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= W or ul[1] >= H or br[0] < 0 or br[1] < 0: + target_weight[joint_id] = 0 + + if target_weight[joint_id] > 0.5: + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, None] + x0 = y0 = size // 2 + # The gaussian is not normalized, + # we want the center value to equal 1 + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], W) - ul[0] + g_y = max(0, -ul[1]), min(br[1], H) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], W) + img_y = max(0, ul[1]), min(br[1], H) + + target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + + if use_different_joint_weights: + target_weight = np.multiply(target_weight, joint_weights) + + return target, target_weight + + def _udp_generate_target(self, cfg, joints_3d, joints_3d_visible, factor, + target_type): + """Generate the target heatmap via 'UDP' approach. Paper ref: Huang et + al. 
The Devil is in the Details: Delving into Unbiased Data Processing + for Human Pose Estimation (CVPR 2020). + + Note: + num keypoints: K + heatmap height: H + heatmap width: W + num target channels: C + C = K if target_type=='GaussianHeatMap' + C = 3*K if target_type=='CombinedTarget' + + Args: + cfg (dict): data config + joints_3d (np.ndarray[K, 3]): Annotated keypoints. + joints_3d_visible (np.ndarray[K, 3]): Visibility of keypoints. + factor (float): kernel factor for GaussianHeatMap target or + valid radius factor for CombinedTarget. + target_type (str): 'GaussianHeatMap' or 'CombinedTarget'. + GaussianHeatMap: Heatmap target with gaussian distribution. + CombinedTarget: The combination of classification target + (response map) and regression target (offset map). + + Returns: + tuple: A tuple containing targets. + + - target (np.ndarray[C, H, W]): Target heatmaps. + - target_weight (np.ndarray[K, 1]): (1: visible, 0: invisible) + """ + num_joints = len(joints_3d) + image_size = cfg['image_size'] + heatmap_size = cfg['heatmap_size'] + joint_weights = cfg['joint_weights'] + use_different_joint_weights = cfg['use_different_joint_weights'] + assert not use_different_joint_weights + + target_weight = np.ones((num_joints, 1), dtype=np.float32) + target_weight[:, 0] = joints_3d_visible[:, 0] + + assert target_type in ['GaussianHeatMap', 'CombinedTarget'] + + if target_type == 'GaussianHeatMap': + target = np.zeros((num_joints, heatmap_size[1], heatmap_size[0]), + dtype=np.float32) + + tmp_size = factor * 3 + + # prepare for gaussian + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, None] + + for joint_id in range(num_joints): + feat_stride = (image_size - 1.0) / (heatmap_size - 1.0) + mu_x = int(joints_3d[joint_id][0] / feat_stride[0] + 0.5) + mu_y = int(joints_3d[joint_id][1] / feat_stride[1] + 0.5) + # Check that any part of the gaussian is in-bounds + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= heatmap_size[0] or ul[1] >= heatmap_size[1] \ + or br[0] < 0 or br[1] < 0: + # If not, just return the image as is + target_weight[joint_id] = 0 + continue + + # # Generate gaussian + mu_x_ac = joints_3d[joint_id][0] / feat_stride[0] + mu_y_ac = joints_3d[joint_id][1] / feat_stride[1] + x0 = y0 = size // 2 + x0 += mu_x_ac - mu_x + y0 += mu_y_ac - mu_y + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * factor**2)) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], heatmap_size[0]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], heatmap_size[1]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], heatmap_size[0]) + img_y = max(0, ul[1]), min(br[1], heatmap_size[1]) + + v = target_weight[joint_id] + if v > 0.5: + target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ + g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + elif target_type == 'CombinedTarget': + target = np.zeros( + (num_joints, 3, heatmap_size[1] * heatmap_size[0]), + dtype=np.float32) + feat_width = heatmap_size[0] + feat_height = heatmap_size[1] + feat_x_int = np.arange(0, feat_width) + feat_y_int = np.arange(0, feat_height) + feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int) + feat_x_int = feat_x_int.flatten() + feat_y_int = feat_y_int.flatten() + # Calculate the radius of the positive area in classification + # heatmap. 
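As a quick numeric illustration of the Gaussian targets generated above (both the MSRA branch and the UDP 'GaussianHeatMap' branch reduce to an unnormalized Gaussian with peak 1 around the keypoint), the following self-contained sketch renders one 64x64 heatmap for a keypoint already expressed in heatmap coordinates.

import numpy as np

def gaussian_heatmap_sketch(mu_x, mu_y, H=64, W=64, sigma=2.0):
    x = np.arange(W, dtype=np.float32)
    y = np.arange(H, dtype=np.float32)[:, None]
    # unnormalized Gaussian so the center value equals 1
    return np.exp(-((x - mu_x) ** 2 + (y - mu_y) ** 2) / (2 * sigma ** 2))

hm = gaussian_heatmap_sketch(16, 24)
print(hm.shape, hm.max(), np.unravel_index(hm.argmax(), hm.shape))  # (64, 64) 1.0 (24, 16)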
+ valid_radius = factor * heatmap_size[1] + feat_stride = (image_size - 1.0) / (heatmap_size - 1.0) + for joint_id in range(num_joints): + mu_x = joints_3d[joint_id][0] / feat_stride[0] + mu_y = joints_3d[joint_id][1] / feat_stride[1] + x_offset = (mu_x - feat_x_int) / valid_radius + y_offset = (mu_y - feat_y_int) / valid_radius + dis = x_offset**2 + y_offset**2 + keep_pos = np.where(dis <= 1)[0] + v = target_weight[joint_id] + if v > 0.5: + target[joint_id, 0, keep_pos] = 1 + target[joint_id, 1, keep_pos] = x_offset[keep_pos] + target[joint_id, 2, keep_pos] = y_offset[keep_pos] + target = target.reshape(num_joints * 3, heatmap_size[1], + heatmap_size[0]) + + if use_different_joint_weights: + target_weight = np.multiply(target_weight, joint_weights) + + return target, target_weight + + def __call__(self, results): + """Generate the target heatmap.""" + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + + assert self.encoding in ['MSRA', 'UDP'] + + if self.encoding == 'MSRA': + if isinstance(self.sigma, list): + num_sigmas = len(self.sigma) + cfg = results['ann_info'] + num_joints = len(joints_3d) + heatmap_size = cfg['heatmap_size'] + + target = np.empty( + (0, num_joints, heatmap_size[1], heatmap_size[0]), + dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_sigmas): + target_i, target_weight_i = self._msra_generate_target( + cfg, joints_3d, joints_3d_visible, self.sigma[i]) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: + target, target_weight = self._msra_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, + self.sigma) + elif self.encoding == 'UDP': + if self.target_type == 'CombinedTarget': + factors = self.valid_radius_factor + channel_factor = 3 + elif self.target_type == 'GaussianHeatMap': + factors = self.sigma + channel_factor = 1 + if isinstance(factors, list): + num_factors = len(factors) + cfg = results['ann_info'] + num_joints = len(joints_3d) + W, H = cfg['heatmap_size'] + + target = np.empty((0, channel_factor * num_joints, H, W), + dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_factors): + target_i, target_weight_i = self._udp_generate_target( + cfg, joints_3d, joints_3d_visible, factors[i], + self.target_type) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: + target, target_weight = self._udp_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, factors, + self.target_type) + else: + raise ValueError( + f'Encoding approach {self.encoding} is not supported!') + + results['target'] = target + results['target_weight'] = target_weight + + return results + +@PIPELINES.register_module() +class LoadDepthFromFile: + """Load depthmap from file. + + Required Keys: + + - depth_path + + Modified Keys: + + - depth + + Args: + to_float32 (bool): Whether to convert the loaded depth to a float32 + numpy array. If set to False, the loaded depth is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + imdecode_backend (str): The depth decoding backend type. The backend + argument for :func:`mmcv.imfrombytes`. + See :func:`mmcv.imfrombytes` for details. + Defaults to 'cv2'. 
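A minimal standalone call to TopDownGenerateTargetFewShot with a list of sigmas is sketched below; the keypoints and ann_info values are invented and only mirror the keys the transform reads (a real run gets them from the datasets above).

import numpy as np

gen = TopDownGenerateTargetFewShot(sigma=[2, 3], encoding='MSRA')
results = {
    'joints_3d': np.array([[64., 48., 0.], [120., 96., 0.]], dtype=np.float32),
    'joints_3d_visible': np.array([[1., 1., 0.], [1., 1., 0.]], dtype=np.float32),
    'ann_info': {
        'image_size': np.array([256, 256]),
        'heatmap_size': np.array([64, 64]),
        'joint_weights': np.ones((2, 1), dtype=np.float32),
        'use_different_joint_weights': False,
    },
}
out = gen(results)
# with a list of sigmas the per-sigma targets are stacked along a leading axis:
print(out['target'].shape)         # (2, 2, 64, 64)  -> (num_sigmas, num_joints, H, W)
print(out['target_weight'].shape)  # (2, 2, 1)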
+ file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Defaults to None. It will be deprecated in future. Please use + ``backend_args`` instead. + Deprecated in version 2.0.0rc4. + ignore_empty (bool): Whether to allow loading empty depth or file path + not existent. Defaults to False. + backend_args (dict, optional): Instantiates the corresponding file + backend. It may contain `backend` key to specify the file + backend. If it contains, the file backend corresponding to this + value will be used and initialized with the remaining values, + otherwise the corresponding file backend will be selected + based on the prefix of the file path. Defaults to None. + New in version 2.0.0rc4. + """ + + def __init__(self, + to_float32=False, + color_type='color', + channel_order='rgb', + file_client_args=dict(backend='disk')): + self.to_float32 = to_float32 + self.color_type = color_type + self.channel_order = channel_order + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _read_depth(self, path): + img = np.load(path)['depth'] + if img is None: + raise ValueError(f'Fail to read {path}') + if self.to_float32: + img = img.astype(np.float32) + return img + + def __call__(self, results: dict) -> Optional[dict]: + """Functions to load depth. + + Args: + results (dict): Result dict from + :class:`mmengine.dataset.BaseDataset`. + + Returns: + dict: The dict contains loaded depth and meta information. + """ + + """Loading depth(s) from file.""" + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + depth_file = results.get('depth_file', None) + # Replace file extension with npy + pre, ext = os.path.splitext(depth_file) + depth_file = pre + '.npz' + if isinstance(depth_file, (list, tuple)): + # Load depths from a list of paths + results['depth'] = [self._read_depth(path) for path in depth_file] + elif depth_file is not None: + # Load single depth from path + results['depth'] = self._read_depth(depth_file) + else: + if 'depth' not in results: + # If `depth_file`` is not in results, check the `img` exists + # and format the depth. This for compatibility when the depth + # is manually set outside the pipeline. + raise KeyError('Either `depth_file` or `img` should exist in ' + 'results.') + if isinstance(results['depth'], (list, tuple)): + assert isinstance(results['depth'][0], np.ndarray) + else: + assert isinstance(results['depth'], np.ndarray) + results['depth_file'] = None + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f'file_client_args={self.file_client_args})') + return repr_str + + +@PIPELINES.register_module() +class DepthTopDownAffineFewShot: + """Affine transform the image to make input. + + Required keys:'img', 'depth', 'joints_3d', 'joints_3d_visible', 'ann_info','scale', + 'rotation' and 'center'. Modified keys:'img', 'joints_3d', and + 'joints_3d_visible'. + + Args: + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). 
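LoadDepthFromFile above swaps the annotation's file extension to .npz and reads the 'depth' array from it; a sketch of producing a compatible file is shown below (the file name and resolution are illustrative).

import numpy as np

depth = np.random.rand(480, 640).astype(np.float32)
np.savez('example_image.npz', depth=depth)        # key must be 'depth'
loaded = np.load('example_image.npz')['depth']    # what _read_depth returns
assert loaded.shape == (480, 640)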
+ """ + + def __init__(self, use_udp=False): + self.use_udp = use_udp + + def __call__(self, results): + image_size = results['ann_info']['image_size'] + + img = results['img'] + depth = results['depth'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + c = results['center'] + s = results['scale'] + r = results['rotation'] + + if self.use_udp: + trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + depth = cv2.warpAffine( + depth, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + joints_3d[:, 0:2] = warp_affine_joints(joints_3d[:, 0:2].copy(), trans) + else: + trans = get_affine_transform(c, s, r, image_size) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + depth = cv2.warpAffine( + depth, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + for i in range(len(joints_3d)): + if joints_3d_visible[i, 0] > 0.0: + joints_3d[i, 0:2] = affine_transform(joints_3d[i, 0:2], trans) + + results['img'] = img + results['depth'] = depth + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + + return results + + + + +@PIPELINES.register_module() +class LoadFeatFromFile: + """Load depthmap from file. + + Required Keys: + + - depth_path + + Modified Keys: + + - depth + + Args: + to_float32 (bool): Whether to convert the loaded depth to a float32 + numpy array. If set to False, the loaded depth is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + imdecode_backend (str): The depth decoding backend type. The backend + argument for :func:`mmcv.imfrombytes`. + See :func:`mmcv.imfrombytes` for details. + Defaults to 'cv2'. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Defaults to None. It will be deprecated in future. Please use + ``backend_args`` instead. + Deprecated in version 2.0.0rc4. + ignore_empty (bool): Whether to allow loading empty depth or file path + not existent. Defaults to False. + backend_args (dict, optional): Instantiates the corresponding file + backend. It may contain `backend` key to specify the file + backend. If it contains, the file backend corresponding to this + value will be used and initialized with the remaining values, + otherwise the corresponding file backend will be selected + based on the prefix of the file path. Defaults to None. + New in version 2.0.0rc4. + """ + + def __init__(self, + to_float32=False, + color_type='color', + channel_order='rgb', + file_client_args=dict(backend='disk')): + self.to_float32 = to_float32 + self.color_type = color_type + self.channel_order = channel_order + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _read_depth(self, path): + img = np.load(path)['feat'] + if img is None: + raise ValueError(f'Fail to read {path}') + if self.to_float32: + img = img.astype(np.float32) + return img + + def __call__(self, results: dict) -> Optional[dict]: + """Functions to load depth. + + Args: + results (dict): Result dict from + :class:`mmengine.dataset.BaseDataset`. + + Returns: + dict: The dict contains loaded depth and meta information. 
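For orientation, the transforms registered above could be chained in an mmpose-style pipeline config along the lines of the sketch below; this is illustrative only and not taken from the repository's configs, and the upstream transform names (LoadImageFromFile, Collect) and key lists may differ from what the project actually uses.

train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadDepthFromFile'),
    dict(type='DepthTopDownAffineFewShot', use_udp=False),
    dict(type='TopDownGenerateTargetFewShot', sigma=2, encoding='MSRA'),
    dict(type='Collect',
         keys=['img', 'depth', 'target', 'target_weight'],
         meta_keys=['image_file', 'center', 'scale', 'rotation', 'bbox_score']),
]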
+ """ + + """Loading depth(s) from file.""" + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + feat_file = results.get('feat_file', None) + # Replace file extension with npy + pre, ext = os.path.splitext(feat_file) + feat_file = pre + '.npz' + if isinstance(feat_file, (list, tuple)): + # Load depths from a list of paths + results['feat'] = [self._read_depth(path) for path in feat_file] + elif feat_file is not None: + # Load single depth from path + results['feat'] = self._read_depth(feat_file) + else: + if 'feat_file' not in results: + # If `depth_file`` is not in results, check the `img` exists + # and format the depth. This for compatibility when the depth + # is manually set outside the pipeline. + raise KeyError('Either `feat_file` or `img` should exist in results.') + if isinstance(results['feat'], (list, tuple)): + assert isinstance(results['feat'][0], np.ndarray) + else: + assert isinstance(results['feat'], np.ndarray) + results['feat_file'] = None + + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f'file_client_args={self.file_client_args})') + return repr_str + + +@PIPELINES.register_module() +class FeatTopDownAffineFewShot: + """Affine transform the image to make input. + + Required keys:'img', 'depth', 'joints_3d', 'joints_3d_visible', 'ann_info','scale', + 'rotation' and 'center'. Modified keys:'img', 'joints_3d', and + 'joints_3d_visible'. + + Args: + use_udp (bool): To use unbiased data processing. + Paper ref: Huang et al. The Devil is in the Details: Delving into + Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + """ + + def __init__(self, use_udp=False): + self.use_udp = use_udp + + def __call__(self, results): + image_size = results['ann_info']['image_size'] + + img = results['img'] + feat = results['feat'] + joints_3d = results['joints_3d'] + joints_3d_visible = results['joints_3d_visible'] + c = results['center'] + s = results['scale'] + r = results['rotation'] + + if self.use_udp: + trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + feat = cv2.warpAffine( + feat, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + joints_3d[:, 0:2] = warp_affine_joints(joints_3d[:, 0:2].copy(), trans) + else: + trans = get_affine_transform(c, s, r, image_size) + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + feat = cv2.warpAffine( + feat, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + for i in range(len(joints_3d)): + if joints_3d_visible[i, 0] > 0.0: + joints_3d[i, 0:2] = affine_transform(joints_3d[i, 0:2], trans) + + results['img'] = img + results['depth'] = feat + results['joints_3d'] = joints_3d + results['joints_3d_visible'] = joints_3d_visible + + return results diff --git a/EdgeCape/models/__init__.py b/EdgeCape/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..45fc685affd284a607f312a718cbe3c1abc7751b --- /dev/null +++ b/EdgeCape/models/__init__.py @@ -0,0 +1,3 @@ +from .detectors import * # noqa +from .keypoint_heads import * # noqa +from .backbones import * # noqa \ No newline at end of file diff --git a/EdgeCape/models/__pycache__/__init__.cpython-39.pyc b/EdgeCape/models/__pycache__/__init__.cpython-39.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..7a5039155827e7b483a269cfe8f11db33062967d Binary files /dev/null and b/EdgeCape/models/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/models/backbones/__pycache__/adapter.cpython-39.pyc b/EdgeCape/models/backbones/__pycache__/adapter.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81da99d8bada3e5dd0f4d82e7ee00bc13ca9a06b Binary files /dev/null and b/EdgeCape/models/backbones/__pycache__/adapter.cpython-39.pyc differ diff --git a/EdgeCape/models/backbones/__pycache__/dino.cpython-39.pyc b/EdgeCape/models/backbones/__pycache__/dino.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90967dfc725e242a468959ba72074d68243d0568 Binary files /dev/null and b/EdgeCape/models/backbones/__pycache__/dino.cpython-39.pyc differ diff --git a/EdgeCape/models/backbones/adapter.py b/EdgeCape/models/backbones/adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..17c9fc56c63b414151d8cd918f866e902faa95fd --- /dev/null +++ b/EdgeCape/models/backbones/adapter.py @@ -0,0 +1,935 @@ + +import torch.nn.functional as F +import fvcore.nn.weight_init as weight_init +import numpy as np +import torch +import torch.nn as nn +from torch.nn.functional import interpolate + +""" +Code is based on: https://github.com/mbanani/probe3d +""" + + +class SurfaceNormalHead(nn.Module): + def __init__( + self, + feat_dim, + head_type="multiscale", + uncertainty_aware=False, + hidden_dim=512, + kernel_size=1, + ): + super().__init__() + + self.uncertainty_aware = uncertainty_aware + output_dim = 4 if uncertainty_aware else 3 + + self.kernel_size = kernel_size + + assert head_type in ["linear", "multiscale", "dpt"] + name = f"snorm_{head_type}_k{kernel_size}" + self.name = f"{name}_UA" if uncertainty_aware else name + + if head_type == "linear": + self.head = Linear(feat_dim, output_dim, kernel_size) + elif head_type == "multiscale": + self.head = MultiscaleHead(feat_dim, output_dim, hidden_dim, kernel_size) + elif head_type == "dpt": + self.head = DPT(feat_dim, output_dim, hidden_dim, kernel_size) + else: + raise ValueError(f"Unknown head type: {self.head_type}") + + def forward(self, feats): + return self.head(feats) + + +class DepthHead(nn.Module): + def __init__( + self, + feat_dim, + head_type="multiscale", + min_depth=0.001, + max_depth=10, + prediction_type="bindepth", + hidden_dim=512, + kernel_size=1, + ): + super().__init__() + + self.kernel_size = kernel_size + self.name = f"{prediction_type}_{head_type}_k{kernel_size}" + + if prediction_type == "bindepth": + output_dim = 256 + self.predict = DepthBinPrediction(min_depth, max_depth, n_bins=output_dim) + elif prediction_type == "sigdepth": + output_dim = 1 + self.predict = DepthSigmoidPrediction(min_depth, max_depth) + else: + raise ValueError() + + if head_type == "linear": + self.head = Linear(feat_dim, output_dim, kernel_size) + elif head_type == "multiscale": + self.head = MultiscaleHead(feat_dim, output_dim, hidden_dim, kernel_size) + elif head_type == "dpt": + self.head = DPT(feat_dim, output_dim, hidden_dim, kernel_size) + else: + raise ValueError(f"Unknown head type: {self.head_type}") + + def forward(self, feats): + """Prediction each pixel.""" + feats = self.head(feats) + depth = self.predict(feats) + return depth + + +class DepthBinPrediction(nn.Module): + def __init__( + self, + min_depth=0.001, + max_depth=10, + n_bins=256, + bins_strategy="UD", + norm_strategy="linear", + ): + super().__init__() + self.n_bins 
= n_bins + self.min_depth = min_depth + self.max_depth = max_depth + self.norm_strategy = norm_strategy + self.bins_strategy = bins_strategy + + def forward(self, prob): + if self.bins_strategy == "UD": + bins = torch.linspace( + self.min_depth, self.max_depth, self.n_bins, device=prob.device + ) + elif self.bins_strategy == "SID": + bins = torch.logspace( + self.min_depth, self.max_depth, self.n_bins, device=prob.device + ) + + # following Adabins, default linear + if self.norm_strategy == "linear": + prob = torch.relu(prob) + eps = 0.1 + prob = prob + eps + prob = prob / prob.sum(dim=1, keepdim=True) + elif self.norm_strategy == "softmax": + prob = torch.softmax(prob, dim=1) + elif self.norm_strategy == "sigmoid": + prob = torch.sigmoid(prob) + prob = prob / prob.sum(dim=1, keepdim=True) + + depth = torch.einsum("ikhw,k->ihw", [prob, bins]) + depth = depth.unsqueeze(dim=1) + return depth + + +class DepthSigmoidPrediction(nn.Module): + def __init__(self, min_depth=0.001, max_depth=10): + super().__init__() + self.min_depth = min_depth + self.max_depth = max_depth + + def forward(self, pred): + depth = pred.sigmoid() + depth = self.min_depth + depth * (self.max_depth - self.min_depth) + return depth + + +class FeatureFusionBlock(nn.Module): + def __init__(self, features, kernel_size, with_skip=True): + super().__init__() + self.with_skip = with_skip + if self.with_skip: + self.resConfUnit1 = ResidualConvUnit(features, kernel_size) + + self.resConfUnit2 = ResidualConvUnit(features, kernel_size) + + def forward(self, x, skip_x=None): + if skip_x is not None: + assert self.with_skip and skip_x.shape == x.shape + x = self.resConfUnit1(x) + skip_x + + x = self.resConfUnit2(x) + return x + + +class ResidualConvUnit(nn.Module): + def __init__(self, features, kernel_size): + super().__init__() + assert kernel_size % 1 == 0, "Kernel size needs to be odd" + padding = kernel_size // 2 + self.conv = nn.Sequential( + nn.Conv2d(features, features, kernel_size, padding=padding), + nn.ReLU(True), + nn.Conv2d(features, features, kernel_size, padding=padding), + nn.ReLU(True), + ) + + def forward(self, x): + return self.conv(x) + x + + +class DPT(nn.Module): + def __init__(self, input_dims, output_dim, hidden_dim=512, kernel_size=3, hr=False, swin=False): + super().__init__() + assert len(input_dims) == 4 + self.hr = hr + self.conv_0 = nn.Conv2d(input_dims[0], hidden_dim, 1, padding=0) + self.conv_1 = nn.Conv2d(input_dims[1], hidden_dim, 1, padding=0) + self.conv_2 = nn.Conv2d(input_dims[2], hidden_dim, 1, padding=0) + self.conv_3 = nn.Conv2d(input_dims[3], hidden_dim, 1, padding=0) + + self.ref_0 = FeatureFusionBlock(hidden_dim, kernel_size) + self.ref_1 = FeatureFusionBlock(hidden_dim, kernel_size) + self.ref_2 = FeatureFusionBlock(hidden_dim, kernel_size) + self.ref_3 = FeatureFusionBlock(hidden_dim, kernel_size, with_skip=False) + + self.out_conv = nn.Sequential( + nn.Conv2d(hidden_dim, hidden_dim, 3, padding=1), + nn.ReLU(True), + nn.Conv2d(hidden_dim, output_dim, 3, padding=1), + ) + + if swin: + self.scale_factor = [1, 2, 4, 4] + else: + self.scale_factor = [2, 2, 2, 2] + + def forward(self, features): + """Prediction each pixel.""" + assert len(features) == 4 + feats = features.copy() + feats[0] = self.conv_0(feats[0]) + feats[1] = self.conv_1(feats[1]) + feats[2] = self.conv_2(feats[2]) + feats[3] = self.conv_3(feats[3]) + + feats = [interpolate(x, scale_factor=scale_factor) for x, scale_factor in zip(feats, self.scale_factor)] + + out = self.ref_3(feats[3], None) + out = self.ref_2(feats[2], out) 
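The AdaBins-style expectation in DepthBinPrediction above reduces per-pixel probabilities over n_bins depth hypotheses to a single depth map; a small numeric sketch with arbitrary shapes is given here.

import torch

n_bins, H, W = 4, 2, 2
bins = torch.linspace(0.001, 10.0, n_bins)                 # (n_bins,) depth hypotheses
prob = torch.softmax(torch.randn(1, n_bins, H, W), dim=1)  # per-pixel bin probabilities
depth = torch.einsum("ikhw,k->ihw", prob, bins).unsqueeze(1)
print(depth.shape)  # torch.Size([1, 1, 2, 2]), each value inside [0.001, 10.0]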
+ out = self.ref_1(feats[1], out) + out = self.ref_0(feats[0], out) + if not self.hr: + return self.out_conv(out) + out = interpolate(out, scale_factor=4) + out = self.out_conv(out) + # out = interpolate(out, scale_factor=2) + return out + + +def make_conv(input_dim, hidden_dim, output_dim, num_layers, kernel_size=1): + return conv + + +class Linear(nn.Module): + def __init__(self, input_dim, output_dim, kernel_size=1): + super().__init__() + if type(input_dim) is not int: + input_dim = sum(input_dim) + + assert type(input_dim) is int + padding = kernel_size // 2 + self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, padding=padding) + + def forward(self, feats): + if type(feats) is list: + feats = torch.cat(feats, dim=1) + + feats = interpolate(feats, scale_factor=4, mode="bilinear") + return self.conv(feats) + + +class MultiscaleHead(nn.Module): + def __init__(self, input_dims, output_dim, hidden_dim=512, kernel_size=1): + super().__init__() + + self.convs = nn.ModuleList( + [make_conv(in_d, None, hidden_dim, 1, kernel_size) for in_d in input_dims] + ) + interm_dim = len(input_dims) * hidden_dim + self.conv_mid = make_conv(interm_dim, hidden_dim, hidden_dim, 3, kernel_size) + self.conv_out = make_conv(hidden_dim, hidden_dim, output_dim, 2, kernel_size) + + def forward(self, feats): + num_feats = len(feats) + feats = [self.convs[i](feats[i]) for i in range(num_feats)] + + h, w = feats[-1].shape[-2:] + feats = [interpolate(feat, (h, w), mode="bilinear") for feat in feats] + feats = torch.cat(feats, dim=1).relu() + + # upsample + feats = interpolate(feats, scale_factor=2, mode="bilinear") + feats = self.conv_mid(feats).relu() + feats = interpolate(feats, scale_factor=4, mode="bilinear") + return self.conv_out(feats) + +def get_norm(norm, out_channels, num_norm_groups=32): + """ + Args: + norm (str or callable): either one of BN, SyncBN, FrozenBN, GN; + or a callable that takes a channel number and returns + the normalization layer as a nn.Module. + Returns: + nn.Module or None: the normalization layer + """ + if norm is None: + return None + if isinstance(norm, str): + if len(norm) == 0: + return None + norm = { + "GN": lambda channels: nn.GroupNorm(num_norm_groups, channels), + }[norm] + return norm(out_channels) + + +def get_activation(activation): + """ + Args: + activation (str or callable): either one of relu, lrelu, prelu, leaky_relu, + sigmoid, tanh, elu, selu, swish, mish; or a callable that takes a + tensor and returns a tensor. 
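    Example (a minimal sketch; note that this helper currently maps only the
    strings in the dict below, and ``get_norm`` above maps only "GN")::

        norm = get_norm("GN", 256)          # GroupNorm(num_groups=32, num_channels=256)
        act = get_activation("relu")        # nn.ReLU()
        block = nn.Sequential(nn.Conv2d(64, 256, 3, padding=1), norm, act)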
+ Returns: + nn.Module or None: the activation layer + """ + if activation is None: + return None + if isinstance(activation, str): + if len(activation) == 0: + return None + activation = { + "relu": nn.ReLU, + "lrelu": nn.LeakyReLU, + "prelu": nn.PReLU, + "leaky_relu": nn.LeakyReLU, + "sigmoid": nn.Sigmoid, + "tanh": nn.Tanh, + "elu": nn.ELU, + "selu": nn.SELU, + }[activation] + return activation() + + +# SCE crisscross + diags +class EfficientSpatialContextNet(nn.Module): + def __init__(self, kernel_size=7, in_channels=768, out_channels=768, use_cuda=True): + super(EfficientSpatialContextNet, self).__init__() + self.kernel_size = kernel_size + self.pad = kernel_size // 2 + self.conv = torch.nn.Conv2d( + in_channels + 4 * self.kernel_size, + out_channels, + 1, + bias=True, + padding_mode="zeros", + ) + + if use_cuda: + self.conv = self.conv.cuda() + + def forward(self, feature): + b, c, h, w = feature.size() + feature_normalized = F.normalize(feature, p=2, dim=1) + feature_pad = F.pad( + feature_normalized, (self.pad, self.pad, self.pad, self.pad), "constant", 0 + ) + output = torch.zeros( + [4 * self.kernel_size, b, h, w], + dtype=feature.dtype, + requires_grad=feature.requires_grad, + ) + if feature.is_cuda: + output = output.cuda(feature.get_device()) + + # left-top to right-bottom + for i in range(self.kernel_size): + c = i + r = i + output[i] = (feature_pad[:, :, r: (h + r), c: (w + c)] * feature_normalized).sum(1) + + # col + for i in range(self.kernel_size): + c = self.kernel_size // 2 + r = i + output[1 * self.kernel_size + i] = (feature_pad[:, :, r: (h + r), c: (w + c)] * feature_normalized).sum(1) + + # right-top to left-bottom + for i in range(self.kernel_size): + c = (self.kernel_size - 1) - i + r = i + output[2 * self.kernel_size + i] = (feature_pad[:, :, r: (h + r), c: (w + c)] * feature_normalized).sum(1) + + # row + for i in range(self.kernel_size): + c = i + r = self.kernel_size // 2 + output[3 * self.kernel_size + i] = (feature_pad[:, :, r: (h + r), c: (w + c)] * feature_normalized).sum(1) + + output = output.transpose(0, 1).contiguous() + output = torch.cat((feature, output), 1) + output = self.conv(output) + # output = F.relu(output) + + return output + + +class Conv2d(nn.Conv2d): + """ + A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features. + """ + + def __init__(self, *args, **kwargs): + """ + Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: + Args: + norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + It assumes that norm layer is used before activation. + """ + norm = kwargs.pop("norm", None) + activation = kwargs.pop("activation", None) + super().__init__(*args, **kwargs) + + self.norm = norm + self.activation = activation + + def forward(self, x): + x = F.conv2d( + x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups + ) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + +class CNNBlockBase(nn.Module): + """ + A CNN block is assumed to have input channels, output channels and a stride. + The input and output of `forward()` method must be NCHW tensors. + The method can perform arbitrary computation but must match the given + channels and stride specification. 
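# Usage sketch for the Conv2d wrapper above (illustrative sizes): the norm and
# activation passed as keyword arguments are applied inside forward(), after the
# convolution itself.
conv = Conv2d(64, 256, kernel_size=3, padding=1, bias=False,
              norm=get_norm("GN", 256),
              activation=F.relu)
y = conv(torch.randn(2, 64, 32, 32))   # (2, 256, 32, 32)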
+ Attribute: + in_channels (int): + out_channels (int): + stride (int): + """ + + def __init__(self, in_channels, out_channels, stride): + """ + The `__init__` method of any subclass should also contain these arguments. + Args: + in_channels (int): + out_channels (int): + stride (int): + """ + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + +class BottleneckBlock(CNNBlockBase): + """ + The standard bottleneck residual block used by ResNet-50, 101 and 152 + defined in :paper:`ResNet`. It contains 3 conv layers with kernels + 1x1, 3x3, 1x1, and a projection shortcut if needed. + """ + + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="GN", + stride_in_1x1=False, + dilation=1, + num_norm_groups=32, + kernel_size=(1, 3, 1) + ): + """ + Args: + bottleneck_channels (int): number of output channels for the 3x3 + "bottleneck" conv layers. + num_groups (int): number of groups for the 3x3 conv layer. + norm (str or callable): normalization for all conv layers. + See :func:`layers.get_norm` for supported format. + stride_in_1x1 (bool): when stride>1, whether to put stride in the + first 1x1 convolution or the bottleneck 3x3 convolution. + dilation (int): the dilation rate of the 3x3 conv layer. + """ + super().__init__(in_channels, out_channels, stride) + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels, num_norm_groups), + ) + else: + self.shortcut = None + + # The original MSRA ResNet models have stride in the first 1x1 conv + # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have + # stride in the 3x3 conv + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=kernel_size[0], + stride=stride_1x1, + padding=(kernel_size[0] - 1) // 2, + bias=False, + norm=get_norm(norm, bottleneck_channels, num_norm_groups), + ) + + self.conv2 = Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size[1], + stride=stride_3x3, + padding=dilation * (kernel_size[1] - 1) // 2, + bias=False, + groups=num_groups, + dilation=dilation, + norm=get_norm(norm, bottleneck_channels, num_norm_groups), + ) + + self.conv3 = Conv2d( + bottleneck_channels, + out_channels, + kernel_size=kernel_size[2], + bias=False, + norm=get_norm(norm, out_channels, num_norm_groups), + ) + + for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: + if layer is not None: # shortcut can be None + weight_init.c2_msra_fill(layer) + + # Zero-initialize the last normalization in each residual branch, + # so that at the beginning, the residual branch starts with zeros, + # and each residual block behaves like an identity. + # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": + # "For BN layers, the learnable scaling coefficient γ is initialized + # to be 1, except for each residual block's last BN + # where γ is initialized to be 0." + + # nn.init.constant_(self.conv3.norm.weight, 0) + # TODO this somehow hurts performance when training GN models from scratch. + # Add it as an option when we need to use this code to train a backbone. 
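# Construction sketch (illustrative channel counts): because in_channels differs
# from out_channels here, a 1x1 projection shortcut with GroupNorm is created.
block = BottleneckBlock(in_channels=768, out_channels=384,
                        bottleneck_channels=96, norm="GN", num_norm_groups=32)
y = block(torch.randn(2, 768, 32, 32))   # (2, 384, 32, 32)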
+ + def forward(self, x): + out = self.conv1(x) + out = F.relu_(out) + + out = self.conv2(out) + out = F.relu_(out) + + out = self.conv3(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu_(out) + return out + + +class ResNet(nn.Module): + """ + Implement :paper:`ResNet`. + """ + + def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0): + """ + Args: + stem (nn.Module): a stem module + stages (list[list[CNNBlockBase]]): several (typically 4) stages, + each contains multiple :class:`CNNBlockBase`. + num_classes (None or int): if None, will not perform classification. + Otherwise, will create a linear layer. + out_features (list[str]): name of the layers whose outputs should + be returned in forward. Can be anything in "stem", "linear", or "res2" ... + If None, will return the output of the last layer. + freeze_at (int): The number of stages at the beginning to freeze. + see :meth:`freeze` for detailed explanation. + """ + super().__init__() + self.stem = stem + self.num_classes = num_classes + + current_stride = self.stem.stride + self._out_feature_strides = {"stem": current_stride} + self._out_feature_channels = {"stem": self.stem.out_channels} + + self.stage_names, self.stages = [], [] + + if out_features is not None: + # Avoid keeping unused layers in this module. They consume extra memory + # and may cause allreduce to fail + num_stages = max( + [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features] + ) + stages = stages[:num_stages] + for i, blocks in enumerate(stages): + assert len(blocks) > 0, len(blocks) + for block in blocks: + assert isinstance(block, CNNBlockBase), block + + name = "res" + str(i + 2) + stage = nn.Sequential(*blocks) + + self.add_module(name, stage) + self.stage_names.append(name) + self.stages.append(stage) + + self._out_feature_strides[name] = current_stride = int( + current_stride * np.prod([k.stride for k in blocks]) + ) + self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels + self.stage_names = tuple(self.stage_names) # Make it static for scripting + + if num_classes is not None: + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.linear = nn.Linear(curr_channels, num_classes) + + # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": + # "The 1000-way fully-connected layer is initialized by + # drawing weights from a zero-mean Gaussian with standard deviation of 0.01." + nn.init.normal_(self.linear.weight, std=0.01) + name = "linear" + + if out_features is None: + out_features = [name] + self._out_features = out_features + assert len(self._out_features) + children = [x[0] for x in self.named_children()] + for out_feature in self._out_features: + assert out_feature in children, "Available children: {}".format(", ".join(children)) + self.freeze(freeze_at) + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
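# Usage sketch with a toy stem (all sizes illustrative): forward() returns a dict
# keyed by the requested feature names.
class ToyStem(CNNBlockBase):
    def __init__(self):
        super().__init__(in_channels=3, out_channels=64, stride=4)
        self.conv = nn.Conv2d(3, 64, kernel_size=7, stride=4, padding=3)

    def forward(self, x):
        return F.relu(self.conv(x))

stages = [ResNet.make_stage(BottleneckBlock, 2, in_channels=c_in, out_channels=c_out,
                            bottleneck_channels=c_out // 4, norm="GN",
                            stride_per_block=[s, 1])
          for c_in, c_out, s in [(64, 256, 1), (256, 512, 2)]]
toy = ResNet(ToyStem(), stages, out_features=["res2", "res3"])
outs = toy(torch.randn(1, 3, 64, 64))
# outs["res2"]: (1, 256, 16, 16), outs["res3"]: (1, 512, 8, 8)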
+ outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for name, stage in zip(self.stage_names, self.stages): + x = stage(x) + if name in self._out_features: + outputs[name] = x + if self.num_classes is not None: + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.linear(x) + if "linear" in self._out_features: + outputs["linear"] = x + return outputs + + def freeze(self, freeze_at=0): + """ + Freeze the first several stages of the ResNet. Commonly used in + fine-tuning. + Layers that produce the same feature map spatial size are defined as one + "stage" by :paper:`FPN`. + Args: + freeze_at (int): number of stages to freeze. + `1` means freezing the stem. `2` means freezing the stem and + one residual stage, etc. + Returns: + nn.Module: this ResNet itself + """ + if freeze_at >= 1: + self.stem.freeze() + for idx, stage in enumerate(self.stages, start=2): + if freeze_at >= idx: + for block in stage.children(): + block.freeze() + return self + + @staticmethod + def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs): + """ + Create a list of blocks of the same type that forms one ResNet stage. + Args: + block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this + stage. A module of this type must not change spatial resolution of inputs unless its + stride != 1. + num_blocks (int): number of blocks in this stage + in_channels (int): input channels of the entire stage. + out_channels (int): output channels of **every block** in the stage. + kwargs: other arguments passed to the constructor of + `block_class`. If the argument name is "xx_per_block", the + argument is a list of values to be passed to each block in the + stage. Otherwise, the same argument is passed to every block + in the stage. + Returns: + list[CNNBlockBase]: a list of block module. + Examples: + :: + stage = ResNet.make_stage( + BottleneckBlock, 3, in_channels=16, out_channels=64, + bottleneck_channels=16, num_groups=1, + stride_per_block=[2, 1, 1], + dilations_per_block=[1, 1, 2] + ) + Usually, layers that produce the same feature map spatial size are defined as one + "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should + all be 1. + """ + blocks = [] + for i in range(num_blocks): + curr_kwargs = {} + for k, v in kwargs.items(): + if k.endswith("_per_block"): + assert len(v) == num_blocks, ( + f"Argument '{k}' of make_stage should have the " + f"same length as num_blocks={num_blocks}." + ) + newk = k[: -len("_per_block")] + assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!" + curr_kwargs[newk] = v[i] + else: + curr_kwargs[k] = v + + blocks.append( + block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs) + ) + in_channels = out_channels + return blocks + + @staticmethod + def make_default_stages(depth, block_class=None, **kwargs): + """ + Created list of ResNet stages from pre-defined depth (one of 18, 34, 50, 101, 152). + If it doesn't create the ResNet variant you need, please use :meth:`make_stage` + instead for fine-grained customization. + Args: + depth (int): depth of ResNet + block_class (type): the CNN block class. Has to accept + `bottleneck_channels` argument for depth > 50. + By default it is BasicBlock or BottleneckBlock, based on the + depth. + kwargs: + other arguments to pass to `make_stage`. Should not contain + stride and channels, as they are predefined for each depth. 
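        Example (sketch)::

            stages = ResNet.make_default_stages(50, norm="GN")
            # 4 stages of 3/4/6/3 BottleneckBlocks: 64->256, 256->512, 512->1024, 1024->2048.
            # (The BasicBlock defined at the end of this file takes (in_planes, planes),
            # so the depth < 50 path would need a compatible block_class.)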
+ Returns: + list[list[CNNBlockBase]]: modules in all stages; see arguments of + :class:`ResNet.__init__`. + """ + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + if block_class is None: + block_class = BasicBlock if depth < 50 else BottleneckBlock + if depth < 50: + in_channels = [64, 64, 128, 256] + out_channels = [64, 128, 256, 512] + else: + in_channels = [64, 256, 512, 1024] + out_channels = [256, 512, 1024, 2048] + ret = [] + for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels): + if depth >= 50: + kwargs["bottleneck_channels"] = o // 4 + ret.append( + ResNet.make_stage( + block_class=block_class, + num_blocks=n, + stride_per_block=[s] + [1] * (n - 1), + in_channels=i, + out_channels=o, + **kwargs, + ) + ) + return ret + +class DummyAggregationNetwork(nn.Module): # for testing, return the input + def __init__(self): + super(DummyAggregationNetwork, self).__init__() + # dummy paprameter + self.dummy = nn.Parameter(torch.ones([])) + + def forward(self, batch, pose=None): + return batch * self.dummy + + +class AggregationNetwork(nn.Module): + """ + Module for aggregating feature maps across time and space. + Design inspired by the Feature Extractor from ODISE (Xu et. al., CVPR 2023). + https://github.com/NVlabs/ODISE/blob/5836c0adfcd8d7fd1f8016ff5604d4a31dd3b145/odise/modeling/backbone/feature_extractor.py + """ + + def __init__( + self, + device, + feature_dims=[640, 1280, 1280, 768], + projection_dim=384, + num_norm_groups=32, + save_timestep=[1], + kernel_size=[1, 3, 1], + contrastive_temp=10, + feat_map_dropout=0.0, + ): + super().__init__() + self.skip_connection = True + self.feat_map_dropout = feat_map_dropout + self.azimuth_embedding = None + self.pos_embedding = None + self.bottleneck_layers = nn.ModuleList() + self.feature_dims = feature_dims + # For CLIP symmetric cross entropy loss during training + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + self.self_logit_scale = nn.Parameter(torch.ones([]) * np.log(contrastive_temp)) + self.device = device + self.save_timestep = save_timestep + + self.mixing_weights_names = [] + for l, feature_dim in enumerate(self.feature_dims): + bottleneck_layer = nn.Sequential( + *ResNet.make_stage( + BottleneckBlock, + num_blocks=1, + in_channels=feature_dim, + bottleneck_channels=projection_dim // 4, + out_channels=projection_dim, + norm="GN", + num_norm_groups=num_norm_groups, + kernel_size=kernel_size + ) + ) + self.bottleneck_layers.append(bottleneck_layer) + for t in save_timestep: + # 1-index the layer name following prior work + self.mixing_weights_names.append(f"timestep-{save_timestep}_layer-{l + 1}") + self.last_layer = None + self.bottleneck_layers = self.bottleneck_layers.to(device) + mixing_weights = torch.ones(len(self.bottleneck_layers) * len(save_timestep)) + self.mixing_weights = nn.Parameter(mixing_weights.to(device)) + # count number of parameters + num_params = 0 + for param in self.parameters(): + num_params += param.numel() + print(f"AggregationNetwork has {num_params} parameters.") + + def load_pretrained_weights(self, pretrained_dict): + custom_dict = self.state_dict() + + # Handle size mismatch + if 'mixing_weights' in custom_dict and 'mixing_weights' in pretrained_dict and custom_dict[ + 'mixing_weights'].shape != pretrained_dict['mixing_weights'].shape: + # Keep the first four weights from the pretrained model, and randomly initialize the fifth weight + 
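# Usage sketch (feature dims follow the defaults above; batch/spatial sizes are
# illustrative): the input is the channel-wise concatenation of the per-layer
# feature maps, and each slice is bottlenecked to projection_dim and blended via
# the softmaxed mixing weights.
net = AggregationNetwork(device="cpu",
                         feature_dims=[640, 1280, 1280, 768], projection_dim=384)
batch = torch.cat([torch.randn(2, c, 32, 32) for c in [640, 1280, 1280, 768]], dim=1)
fused = net(batch)      # (2, 384, 32, 32)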
custom_dict['mixing_weights'][:4] = pretrained_dict['mixing_weights'][:4] + custom_dict['mixing_weights'][4] = torch.zeros_like(custom_dict['mixing_weights'][4]) + else: + custom_dict['mixing_weights'][:4] = pretrained_dict['mixing_weights'][:4] + + # Load the weights that do match + matching_keys = {k: v for k, v in pretrained_dict.items() if k in custom_dict and k != 'mixing_weights'} + custom_dict.update(matching_keys) + + # Now load the updated state_dict + self.load_state_dict(custom_dict, strict=False) + + def forward(self, batch, pose=None): + """ + Assumes batch is shape (B, C, H, W) where C is the concatentation of all layer features. + """ + if self.feat_map_dropout > 0 and self.training: + batch = F.dropout(batch, p=self.feat_map_dropout) + + output_feature = None + start = 0 + mixing_weights = torch.nn.functional.softmax(self.mixing_weights, dim=0) + if self.pos_embedding is not None: # position embedding + batch = torch.cat((batch, self.pos_embedding), dim=1) + for i in range(len(mixing_weights)): + # Share bottleneck layers across timesteps + bottleneck_layer = self.bottleneck_layers[i % len(self.feature_dims)] + # Chunk the batch according the layer + # Account for looping if there are multiple timesteps + end = start + self.feature_dims[i % len(self.feature_dims)] + feats = batch[:, start:end, :, :] + start = end + # Downsample the number of channels and weight the layer + bottlenecked_feature = bottleneck_layer(feats) + bottlenecked_feature = mixing_weights[i] * bottlenecked_feature + if output_feature is None: + output_feature = bottlenecked_feature + else: + output_feature += bottlenecked_feature + + if self.last_layer is not None: + + output_feature_after = self.last_layer(output_feature) + if self.skip_connection: + # skip connection + output_feature = output_feature + output_feature_after + return output_feature + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution without padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) + + +class BasicBlock(nn.Module): + def __init__(self, in_planes, planes, stride=1): + super().__init__() + self.conv1 = conv3x3(in_planes, planes, stride) + self.conv2 = conv3x3(planes, planes) + self.bn1 = nn.BatchNorm2d(planes) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + + if stride == 1: + self.downsample = None + else: + self.downsample = nn.Sequential( + conv1x1(in_planes, planes, stride=stride), + nn.BatchNorm2d(planes) + ) + + def forward(self, x): + y = x + y = self.relu(self.bn1(self.conv1(y))) + y = self.bn2(self.conv2(y)) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) \ No newline at end of file diff --git a/EdgeCape/models/backbones/dino.py b/EdgeCape/models/backbones/dino.py new file mode 100644 index 0000000000000000000000000000000000000000..4323eba0dfc74ba71b18274b57e91bcd91650046 --- /dev/null +++ b/EdgeCape/models/backbones/dino.py @@ -0,0 +1,206 @@ +import einops as E +import numpy as np +import torch +import torch.nn.functional as F +from transformers.models.vit_mae.modeling_vit_mae import ( + get_2d_sincos_pos_embed_from_grid, +) + + +def resize_pos_embed( + pos_embed: torch.Tensor, hw: tuple[int, int], has_cls_token: bool = True +): + """ + Resize positional embedding for arbitrary image resolution. 
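    For example (illustrative sizes), a DINO-style embedding for a 16x16 patch grid
    plus a [cls] token can be resized to a 24x24 grid::

        pos = torch.randn(1 + 16 * 16, 384)
        new = resize_pos_embed(pos, (24, 24))   # shape (1 + 24 * 24, 384)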
Resizing is done + via bicubic interpolation. + + Args: + pos_embed: Positional embedding tensor of shape ``(n_patches, embed_dim)``. + hw: Target height and width of the tensor after interpolation. + has_cls_token: Whether ``pos_embed[0]`` is for the ``[cls]`` token. + + Returns: + Tensor of shape ``(new_n_patches, embed_dim)`` of resized embedding. + ``new_n_patches`` is ``new_height * new_width`` if ``has_cls`` is False, + else ``1 + new_height * new_width``. + """ + + n_grid = pos_embed.shape[0] - 1 if has_cls_token else pos_embed.shape[0] + + # Do not resize if already in same shape. + if n_grid == hw[0] * hw[1]: + return pos_embed + + # Get original position embedding and extract ``[cls]`` token. + if has_cls_token: + cls_embed, pos_embed = pos_embed[[0]], pos_embed[1:] + + orig_dim = int(pos_embed.shape[0] ** 0.5) + + pos_embed = E.rearrange(pos_embed, "(h w) c -> 1 c h w", h=orig_dim) + pos_embed = F.interpolate( + pos_embed, hw, mode="bicubic", align_corners=False, antialias=True + ) + pos_embed = E.rearrange(pos_embed, "1 c h w -> (h w) c") + + # Add embedding of ``[cls]`` token back after resizing. + if has_cls_token: + pos_embed = torch.cat([cls_embed, pos_embed], dim=0) + + return pos_embed + + +def center_padding(images, patch_size): + _, _, h, w = images.shape + diff_h = h % patch_size + diff_w = w % patch_size + + if diff_h == 0 and diff_w == 0: + return images + + pad_h = patch_size - diff_h + pad_w = patch_size - diff_w + + pad_t = pad_h // 2 + pad_l = pad_w // 2 + pad_r = pad_w - pad_l + pad_b = pad_h - pad_t + + images = F.pad(images, (pad_l, pad_r, pad_t, pad_b)) + return images + + +def get_2d_sincos_pos_embed(embed_dim, grid_size, add_cls_token=False): + """ + COPIED FROM TRANSFORMERS PACKAGE AND EDITED TO ALLOW FOR DIFFERENT WIDTH-HEIGHT + Create 2D sin/cos positional embeddings. + + Args: + embed_dim (`int`): + Embedding dimension. + grid_size (`int`): + The grid height and width. + add_cls_token (`bool`, *optional*, defaults to `False`): + Whether or not to add a classification (CLS) token. 
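        Example (illustrative sizes; pairs naturally with ``center_padding`` above)::

            x = center_padding(torch.randn(1, 3, 250, 250), 14)        # (1, 3, 252, 252)
            pe = get_2d_sincos_pos_embed(384, (252 // 14, 252 // 14))  # numpy, (324, 384)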
+ + Returns: + (`torch.FloatTensor` of shape (grid_size*grid_size, embed_dim) or + (1+grid_size*grid_size, embed_dim): the + position embeddings (with or without classification token) + """ + grid_h = np.arange(grid_size[0], dtype=np.float32) + grid_w = np.arange(grid_size[1], dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size[0], grid_size[1]]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if add_cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def tokens_to_output(output_type, dense_tokens, cls_token, feat_hw): + if output_type == "cls": + assert cls_token is not None + output = cls_token + elif output_type == "gap": + output = dense_tokens.mean(dim=1) + elif output_type == "dense": + h, w = feat_hw + dense_tokens = E.rearrange(dense_tokens, "b (h w) c -> b c h w", h=h, w=w) + output = dense_tokens.contiguous() + elif output_type == "dense-cls": + assert cls_token is not None + h, w = feat_hw + dense_tokens = E.rearrange(dense_tokens, "b (h w) c -> b c h w", h=h, w=w) + cls_token = cls_token[:, :, None, None].repeat(1, 1, h, w) + output = torch.cat((dense_tokens, cls_token), dim=1).contiguous() + else: + raise ValueError() + + return output + +class DINO(torch.nn.Module): + def __init__( + self, + dino_name="dinov2", + model_name="vits14", + output="dense-cls", + layer=-1, + return_multilayer=True, + ): + super().__init__() + feat_dims = { + "vits14": 384, + "vitb8": 768, + "vitb16": 768, + "vitb14": 768, + "vitb14_reg": 768, + "vitl14": 1024, + "vitg14": 1536, + } + + # get model + self.model_name = dino_name + self.checkpoint_name = f"{dino_name}_{model_name}" + dino_vit = torch.hub.load(f"facebookresearch/{dino_name}", self.checkpoint_name) + self.vit = dino_vit.eval().to(torch.float32) + self.has_registers = "_reg" in model_name + + assert output in ["cls", "gap", "dense", "dense-cls"] + self.output = output + self.patch_size = self.vit.patch_embed.proj.kernel_size[0] + + feat_dim = feat_dims[model_name] + feat_dim = feat_dim * 2 if output == "dense-cls" else feat_dim + + num_layers = len(self.vit.blocks) + multilayers = [ + num_layers // 4 - 1, + num_layers // 2 - 1, + num_layers // 4 * 3 - 1, + num_layers - 1, + ] + + if return_multilayer: + self.feat_dim = [feat_dim, feat_dim, feat_dim, feat_dim] + self.multilayers = multilayers + else: + self.feat_dim = feat_dim + layer = multilayers[-1] if layer == -1 else layer + self.multilayers = [layer] + + # define layer name (for logging) + self.layer = "-".join(str(_x) for _x in self.multilayers) + + def forward(self, images): + + # pad images (if needed) to ensure it matches patch_size + images = center_padding(images, self.patch_size) + h, w = images.shape[-2:] + h, w = h // self.patch_size, w // self.patch_size + + if self.model_name == "dinov2": + x = self.vit.prepare_tokens_with_masks(images, None) + else: + x = self.vit.prepare_tokens(images) + + embeds = [] + for i, blk in enumerate(self.vit.blocks): + x = blk(x) + if i in self.multilayers: + embeds.append(x) + if len(embeds) == len(self.multilayers): + break + + num_spatial = h * w + outputs = [] + for i, x_i in enumerate(embeds): + cls_tok = x_i[:, 0] + # ignoring register tokens + spatial = x_i[:, -1 * num_spatial :] + x_i = tokens_to_output(self.output, spatial, cls_tok, (h, w)) + outputs.append(x_i) + + return outputs[0] if len(outputs) == 1 else outputs \ No newline at end of file diff --git 
a/EdgeCape/models/detectors/EdgeCape.py b/EdgeCape/models/detectors/EdgeCape.py new file mode 100644 index 0000000000000000000000000000000000000000..e8de214f8d5184d74b666d457bdb5ab775ed679d --- /dev/null +++ b/EdgeCape/models/detectors/EdgeCape.py @@ -0,0 +1,392 @@ +import math +import cv2 +import mmcv +import numpy as np +import torch +import torch.nn +import torch.nn.functional as F +from mmcv.image import imwrite +from mmcv.visualization.image import imshow +from mmpose.models import builder +from mmpose.models.builder import POSENETS +from mmpose.models.detectors.base import BasePose +from EdgeCape.models.backbones.adapter import DPT +from EdgeCape.models.backbones.dino import DINO + + +@POSENETS.register_module() +class EdgeCape(BasePose): + """ + EdgeCape: Edge-aware Context-Aware Pose Estimation. + Args: + keypoint_head (dict): Config for keypoint head. + encoder_config (dict): Config for encoder. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + freeze_backbone (bool): If True, freeze backbone. Default: False. + """ + + def __init__(self, + keypoint_head, + encoder_config, + train_cfg=None, + test_cfg=None, + freeze_backbone=False): + super().__init__() + feature_output_setting = encoder_config.get('output', 'dense-cls') + model_name = encoder_config.get('model_name', 'vits14') + self.encoder_sample = self.encoder_query = DINO(output=feature_output_setting, model_name=model_name) + self.probe = DPT(input_dims=self.encoder_query.feat_dim, output_dim=768) + self.backbone = 'dino_extractor' + self.freeze_backbone = freeze_backbone + if keypoint_head.get('freeze', None) is not None: + self.freeze_backbone = True + + self.keypoint_head_module = builder.build_head(keypoint_head) + self.keypoint_head_module.init_weights() + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.target_type = test_cfg.get('target_type', + 'GaussianHeatMap') # GaussianHeatMap + + @property + def with_keypoint(self): + """Check if has keypoint_head.""" + return hasattr(self, 'keypoint_head_module') + + def init_weights(self, pretrained=None): + """Weight initialization for model.""" + self.encoder_sample.init_weights(pretrained) + self.encoder_query.init_weights(pretrained) + self.keypoint_head_module.init_weights() + + def forward(self, + img_s, + img_q, + target_s=None, + target_weight_s=None, + target_q=None, + target_weight_q=None, + img_metas=None, + return_loss=True, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. 
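        Example (illustrative; ``model`` is the config-built EdgeCape and the
        tensors/metas come from the few-shot data pipeline)::

            # training: single-nested inputs, returns a dict of losses
            losses = model(img_s, img_q, target_s, target_weight_s,
                           target_q, target_weight_q, img_metas, return_loss=True)

            # testing: same entry point, returns decoded keypoint predictions
            preds = model(img_s, img_q, target_s, target_weight_s,
                          target_q, target_weight_q, img_metas, return_loss=False)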
+ """ + if return_loss: + return self.forward_train(img_s, target_s, target_weight_s, img_q, + target_q, target_weight_q, img_metas, + **kwargs) + else: + return self.forward_test(img_s, target_s, target_weight_s, img_q, + target_q, target_weight_q, img_metas, + **kwargs) + + def forward_train(self, + img_s, + target_s, + target_weight_s, + img_q, + target_q, + target_weight_q, + img_metas, + **kwargs): + """Defines the computation performed at every call when training.""" + bs, _, h, w = img_q.shape + random_mask = kwargs.get('rand_mask', None) + output, initial_proposals, similarity_map, mask_s, reconstructed_keypoints = self.predict(img_s, + target_s, + target_weight_s, + img_q, + img_metas, + random_mask) + + # parse the img meta to get the target keypoints + device = output.device + target_keypoints = self.parse_keypoints_from_img_meta(img_metas, + device, + keyword='query') + + target_sizes = torch.tensor( + [img_q.shape[-2], img_q.shape[-1]]).unsqueeze(0).repeat( + img_q.shape[0], 1, 1) + + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head_module.get_loss(output, + initial_proposals, + similarity_map, + target_keypoints, + target_q, + target_weight_q * mask_s, + target_sizes, + reconstructed_keypoints, + ) + losses.update(keypoint_losses) + keypoint_accuracy = self.keypoint_head_module.get_accuracy(output[-1], + target_keypoints, + target_weight_q * mask_s, + target_sizes, + height=h) + losses.update(keypoint_accuracy) + return losses + + def forward_test(self, + img_s, + target_s, + target_weight_s, + img_q, + target_q, + target_weight_q, + img_metas=None, + vis_offset=True, + **kwargs): + + """Defines the computation performed at every call when testing.""" + batch_size, _, img_height, img_width = img_q.shape + output, initial_proposals, similarity_map, mask_s, reconstructed_keypoints = self.predict(img_s, + target_s, + target_weight_s, + img_q, + img_metas + ) + predicted_pose = output[-1].detach().cpu().numpy() + result = {} + + if self.with_keypoint: + keypoint_result = self.keypoint_head_module.decode(img_metas, predicted_pose, img_size=[img_width, img_height]) + result.update(keypoint_result) + + if vis_offset: + result.update({"points": torch.cat((initial_proposals[None], output)).cpu().numpy()}) + + result.update({"sample_image_file": [img_metas[i]['sample_image_file'] for i in range(len(img_metas))]}) + + return result + + def predict(self, + img_s, + target_s, + target_weight_s, + img_q, + img_metas=None, + random_mask=None): + + batch_size, _, img_height, img_width = img_q.shape + assert [i['sample_skeleton'][0] != i['query_skeleton'] for i in img_metas] + mask_s = target_weight_s[0] + for target_weight in target_weight_s: + mask_s = mask_s * target_weight + feature_q, feature_s = self.extract_features(img_s, img_q) + skeleton_lst = [i['sample_skeleton'][0] for i in img_metas] + + (output, initial_proposals, similarity_map, reconstructed_keypoints) = self.keypoint_head_module( + feature_q, feature_s, target_s, mask_s, skeleton_lst, random_mask=random_mask) + + return output, initial_proposals, similarity_map, mask_s, reconstructed_keypoints + + def extract_features(self, img_s, img_q): + with torch.no_grad(): + dino_feature_s = [self.encoder_sample(img) for img in img_s] + dino_feature_q = self.encoder_query(img_q) # [bs, 3, h, w] + if self.freeze_backbone: + with torch.no_grad(): + feature_s = [self.probe(f) for f in dino_feature_s] + feature_q = self.probe(dino_feature_q) + else: + feature_s = [self.probe(f) for f in dino_feature_s] + 
feature_q = self.probe(dino_feature_q) + + return feature_q, feature_s + + def parse_keypoints_from_img_meta(self, img_meta, device, keyword='query'): + """Parse keypoints from the img_meta. + + Args: + img_meta (dict): Image meta info. + device (torch.device): Device of the output keypoints. + keyword (str): 'query' or 'sample'. Default: 'query'. + + Returns: + Tensor: Keypoints coordinates of query images. + """ + + if keyword == 'query': + query_kpt = torch.stack([ + torch.tensor(info[f'{keyword}_joints_3d']).to(device) for info in img_meta], dim=0)[:, :, :2] + else: + query_kpt = [] + for info in img_meta: + if isinstance(info[f'{keyword}_joints_3d'][0], torch.Tensor): + samples = torch.stack(info[f'{keyword}_joints_3d']) + else: + samples = np.array(info[f'{keyword}_joints_3d']) + query_kpt.append(torch.tensor(samples).to(device)[:, :, :2]) + query_kpt = torch.stack(query_kpt, dim=0) # [bs, , num_samples, num_query, 2] + return query_kpt + + def get_full_similarity_map(self, feature_q, feature_s, h, w): + resized_feature_q = F.interpolate(feature_q, size=(h, w), + mode='bilinear') + resized_feature_s = [F.interpolate(s, size=(h, w), mode='bilinear') for + s in feature_s] + return [self.chunk_cosine_sim(f_s, resized_feature_q) for f_s in + resized_feature_s] + + # UNMODIFIED + def show_result(self, + img, + result, + skeleton=None, + kpt_score_thr=0.3, + bbox_color='green', + pose_kpt_color=None, + pose_limb_color=None, + radius=4, + text_color=(255, 0, 0), + thickness=1, + font_scale=0.5, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (list[dict]): The results to draw over `img` + (bbox_result, pose_result). + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_limb_color (np.array[Mx3]): Color of M limbs. + If None, do not draw limbs. + text_color (str or tuple or :obj:`Color`): Color of texts. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. 
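        Example (sketch with made-up inputs; a 3-keypoint skeleton)::

            result = [{
                'bbox': np.array([20, 20, 200, 200, 1.0]),
                'keypoints': np.array([[50., 60., 0.9],
                                       [80., 90., 0.8],
                                       [70., 140., 0.7]]),
            }]
            model.show_result('query.jpg', result,
                              skeleton=[[1, 2], [2, 3]],
                              pose_kpt_color=np.array([[255, 0, 0]] * 3),
                              pose_limb_color=np.array([[0, 255, 0]] * 2),
                              out_file='vis.jpg')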
+ """ + + img = mmcv.imread(img) + img = img.copy() + img_h, img_w, _ = img.shape + + bbox_result = [] + pose_result = [] + for res in result: + bbox_result.append(res['bbox']) + pose_result.append(res['keypoints']) + + if len(bbox_result) > 0: + bboxes = np.vstack(bbox_result) + # draw bounding boxes + mmcv.imshow_bboxes( + img, + bboxes, + colors=bbox_color, + top_k=-1, + thickness=thickness, + show=False, + win_name=win_name, + wait_time=wait_time, + out_file=None) + + for person_id, kpts in enumerate(pose_result): + # draw each point on image + if pose_kpt_color is not None: + assert len(pose_kpt_color) == len(kpts), ( + len(pose_kpt_color), len(kpts)) + for kid, kpt in enumerate(kpts): + x_coord, y_coord, kpt_score = int(kpt[0]), int( + kpt[1]), kpt[2] + if kpt_score > kpt_score_thr: + img_copy = img.copy() + r, g, b = pose_kpt_color[kid] + cv2.circle(img_copy, (int(x_coord), int(y_coord)), + radius, (int(r), int(g), int(b)), -1) + transparency = max(0, min(1, kpt_score)) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + + # draw limbs + if skeleton is not None and pose_limb_color is not None: + assert len(pose_limb_color) == len(skeleton) + for sk_id, sk in enumerate(skeleton): + pos1 = (int(kpts[sk[0] - 1, 0]), int(kpts[sk[0] - 1, + 1])) + pos2 = (int(kpts[sk[1] - 1, 0]), int(kpts[sk[1] - 1, + 1])) + if (pos1[0] > 0 and pos1[0] < img_w and pos1[1] > 0 + and pos1[1] < img_h and pos2[0] > 0 + and pos2[0] < img_w and pos2[1] > 0 + and pos2[1] < img_h + and kpts[sk[0] - 1, 2] > kpt_score_thr + and kpts[sk[1] - 1, 2] > kpt_score_thr): + img_copy = img.copy() + X = (pos1[0], pos2[0]) + Y = (pos1[1], pos2[1]) + mX = np.mean(X) + mY = np.mean(Y) + length = ((Y[0] - Y[1]) ** 2 + ( + X[0] - X[1]) ** 2) ** 0.5 + angle = math.degrees( + math.atan2(Y[0] - Y[1], X[0] - X[1])) + stickwidth = 2 + polygon = cv2.ellipse2Poly( + (int(mX), int(mY)), + (int(length / 2), int(stickwidth)), int(angle), + 0, 360, 1) + + r, g, b = pose_limb_color[sk_id] + cv2.fillConvexPoly(img_copy, polygon, + (int(r), int(g), int(b))) + transparency = max( + 0, + min( + 1, 0.5 * + (kpts[sk[0] - 1, 2] + kpts[ + sk[1] - 1, 2]))) + cv2.addWeighted( + img_copy, + transparency, + img, + 1 - transparency, + 0, + dst=img) + + show, wait_time = 1, 1 + if show: + height, width = img.shape[:2] + max_ = max(height, width) + + factor = min(1, 800 / max_) + enlarge = cv2.resize( + img, (0, 0), + fx=factor, + fy=factor, + interpolation=cv2.INTER_CUBIC) + imshow(enlarge, win_name, wait_time) + + if out_file is not None: + imwrite(img, out_file) + + return img diff --git a/EdgeCape/models/detectors/__init__.py b/EdgeCape/models/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ffc0456aedb28e94134caca15f93b81db3d0adae --- /dev/null +++ b/EdgeCape/models/detectors/__init__.py @@ -0,0 +1,3 @@ +from .EdgeCape import EdgeCape + +__all__ = ['EdgeCape'] diff --git a/EdgeCape/models/detectors/__pycache__/EdgeCape.cpython-39.pyc b/EdgeCape/models/detectors/__pycache__/EdgeCape.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93fd30c53fd3229bdb88a220c56881dd2576be8a Binary files /dev/null and b/EdgeCape/models/detectors/__pycache__/EdgeCape.cpython-39.pyc differ diff --git a/EdgeCape/models/detectors/__pycache__/__init__.cpython-39.pyc b/EdgeCape/models/detectors/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe71eac8979c4c9fabdd5a4cf220a80f8e518a54 Binary files /dev/null and 
b/EdgeCape/models/detectors/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/models/keypoint_heads/__init__.py b/EdgeCape/models/keypoint_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..58bbac1437bb1221908dc01345553e4982c43020 --- /dev/null +++ b/EdgeCape/models/keypoint_heads/__init__.py @@ -0,0 +1,5 @@ +from .head import TwoStageHead +from .skeleton import SkeletonPredictor + +__all__ = ['TwoStageHead', 'SkeletonPredictor'] + diff --git a/EdgeCape/models/keypoint_heads/__pycache__/__init__.cpython-39.pyc b/EdgeCape/models/keypoint_heads/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd211c38f41e624bb15962d2ab5ead42c4cfab14 Binary files /dev/null and b/EdgeCape/models/keypoint_heads/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/models/keypoint_heads/__pycache__/encoder_decoder.cpython-39.pyc b/EdgeCape/models/keypoint_heads/__pycache__/encoder_decoder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc1520b1de8245f9040db146f2efb52fa0d75275 Binary files /dev/null and b/EdgeCape/models/keypoint_heads/__pycache__/encoder_decoder.cpython-39.pyc differ diff --git a/EdgeCape/models/keypoint_heads/__pycache__/head.cpython-39.pyc b/EdgeCape/models/keypoint_heads/__pycache__/head.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..795f2101f995cacf23d21ad165f3943ff060d7c7 Binary files /dev/null and b/EdgeCape/models/keypoint_heads/__pycache__/head.cpython-39.pyc differ diff --git a/EdgeCape/models/keypoint_heads/__pycache__/skeleton.cpython-39.pyc b/EdgeCape/models/keypoint_heads/__pycache__/skeleton.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d42bad67f3d3f169f464a2ea35db3de13cf32d6 Binary files /dev/null and b/EdgeCape/models/keypoint_heads/__pycache__/skeleton.cpython-39.pyc differ diff --git a/EdgeCape/models/keypoint_heads/encoder_decoder.py b/EdgeCape/models/keypoint_heads/encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f6d3886679ecd3119647457b15f8c712d213fd41 --- /dev/null +++ b/EdgeCape/models/keypoint_heads/encoder_decoder.py @@ -0,0 +1,670 @@ +import math +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +import copy +from typing import Optional +from EdgeCape.models.utils.bias_attn import BiasedMultiheadAttention +from EdgeCape.models.utils.builder import TRANSFORMER +from mmcv.cnn import (xavier_init) + + +def inverse_sigmoid(x, eps=1e-3): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.gelu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class ProposalGenerator(nn.Module): + + def __init__(self, hidden_dim, proj_dim, dynamic_proj_dim): + super().__init__() + self.support_proj = nn.Linear(hidden_dim, proj_dim) + self.query_proj = nn.Linear(hidden_dim, proj_dim) + self.dynamic_proj = nn.Sequential( + nn.Linear(hidden_dim, dynamic_proj_dim), + 
nn.ReLU(), + nn.Linear(dynamic_proj_dim, hidden_dim)) + self.dynamic_act = nn.Tanh() + + def forward(self, query_feat, support_feat, spatial_shape): + """ + Args: + support_feat: [query, bs, c] + query_feat: [hw, bs, c] + spatial_shape: h, w + """ + device = query_feat.device + _, bs, c = query_feat.shape + h, w = spatial_shape + side_normalizer = torch.tensor([w, h]).to(query_feat.device)[None, + None, + :] # [bs, query, 2], Normalize the coord to [0,1] + + query_feat = query_feat.transpose(0, 1) + support_feat = support_feat.transpose(0, 1) + nq = support_feat.shape[1] + + fs_proj = self.support_proj(support_feat) # [bs, query, c] + fq_proj = self.query_proj(query_feat) # [bs, hw, c] + pattern_attention = self.dynamic_act( + self.dynamic_proj(fs_proj)) # [bs, query, c] + + fs_feat = (pattern_attention + 1) * fs_proj # [bs, query, c] + similarity = torch.bmm(fq_proj, + fs_feat.transpose(1, 2)) # [bs, hw, query] + similarity = similarity.transpose(1, 2).reshape(bs, nq, h, w) + grid_y, grid_x = torch.meshgrid( + torch.linspace(0.5, h - 0.5, h, dtype=torch.float32, device=device), # (h, w) + torch.linspace(0.5, w - 0.5, w, dtype=torch.float32, device=device), + indexing="ij") + + # compute softmax and sum up + coord_grid = torch.stack([grid_x, grid_y], dim=0).unsqueeze(0).unsqueeze(0).repeat(bs, nq, 1, 1, 1) + # [bs, query, 2, h, w] + coord_grid = coord_grid.permute(0, 1, 3, 4, 2) # [bs, query, h, w, 2] + similarity_softmax = similarity.flatten(2, 3).softmax(dim=-1) # [bs, query, hw] + similarity_coord_grid = similarity_softmax[:, :, :, None] * coord_grid.flatten(2, 3) + proposal_for_loss = similarity_coord_grid.sum(dim=2, keepdim=False) # [bs, + # query, 2] + proposal_for_loss = proposal_for_loss / side_normalizer + + max_pos = torch.argmax(similarity.reshape(bs, nq, -1), dim=-1, keepdim=True) # (bs, nq, 1) + max_mask = F.one_hot(max_pos, num_classes=w * h) # (bs, nq, 1, w*h) + max_mask = max_mask.reshape(bs, nq, w, h).type(torch.float) # (bs, nq, w, h) + local_max_mask = F.max_pool2d(input=max_mask, + kernel_size=3, + stride=1, + padding=1).reshape(bs, nq, w * h, 1) # (bs, nq, w*h, 1) + + # first, extract the local probability map with the mask + local_similarity_softmax = similarity_softmax[:, :, :, None] * local_max_mask # (bs, nq, w*h, 1) + + # then, re-normalize the local probability map + local_similarity_softmax = local_similarity_softmax / ( + local_similarity_softmax.sum(dim=-2, keepdim=True) + 1e-10) # [bs, nq, w*h, 1] + + # point-wise mulplication of local probability map and coord grid + proposals = local_similarity_softmax * coord_grid.flatten(2, 3) # [bs, # nq, w*h, 2] + + # sum the mulplication to obtain the final coord proposals + proposals = proposals.sum(dim=2) / side_normalizer # [bs, nq, 2] + + return proposal_for_loss, similarity, proposals + + +@TRANSFORMER.register_module() +class TwoStageSupportRefineTransformer(nn.Module): + + def __init__(self, + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + similarity_proj_dim=256, + dynamic_proj_dim=128, + return_intermediate_dec=True, + attn_bias=False, + max_hops=5, + use_bias_attn_module=False, + masked_supervision=False, + recon_features=False, + + ): + super().__init__() + + if num_encoder_layers > 0: + encoder_layer = TransformerEncoderLayer(d_model, nhead, + dim_feedforward, dropout, + activation, + normalize_before) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = 
TransformerEncoder(encoder_layer, + num_encoder_layers, encoder_norm) + else: + self.encoder = None + decoder_norm = nn.LayerNorm(d_model) + decoder_layer = TransformerDecoderLayer(d_model, nhead, + dim_feedforward, dropout, + activation, normalize_before, + use_bias_attn_module=use_bias_attn_module, + attn_bias=attn_bias, + max_hops=max_hops, + ) + + self.decoder = TransformerDecoder( + d_model, + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + ) + + self.proposal_generator = ProposalGenerator( + hidden_dim=d_model, + proj_dim=similarity_proj_dim, + dynamic_proj_dim=dynamic_proj_dim) + + self.d_model = d_model + self.nhead = nhead + self.recon_features = recon_features + self.masked_supervision = masked_supervision + self.freeze = '' + + def init_weights(self): + # follow the official DETR to init parameters + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + + def forward(self, query_image_feat, query_mask, support_kp_feat, + query_image_pos_embed, kp_pos_embedding, support_kp_mask, + position_embedding, kpt_branch, adj, attn_adj, return_attn_map=False, random_mask=None): + # flatten NxCxHxW to HWxNxC + # src = query image features, + # support_embed = support image embedding + bs, c, h, w = query_image_feat.shape + + query_image_feat = query_image_feat.flatten(2).permute(2, 0, 1) + query_image_pos_embed = query_image_pos_embed.flatten(2).permute(2, 0, 1) + kp_pos_embedding = kp_pos_embedding.flatten(2).permute(2, 0, 1) + query_image_pos_embed = torch.cat((query_image_pos_embed, kp_pos_embedding)) + query_image_embed = support_kp_feat.transpose(0, 1) # [query, bs, c ] + query_mask = query_mask.flatten(1) + + query_image_embed, refined_kp_support_embed = self.encoder( + query_image_feat, + query_image_embed, + src_key_padding_mask=query_mask, + query_key_padding_mask=support_kp_mask, + pos=query_image_pos_embed) + + # generate initial proposals and corresponding positional embedding. 
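# Standalone sketch of the proposal step invoked below (illustrative sizes): the
# support-query similarity map is softmaxed over the spatial grid and the expected
# pixel-center coordinate is taken per keypoint, normalized to [0, 1] by (w, h).
bs, nq, h, w = 2, 5, 24, 24
similarity = torch.randn(bs, nq, h, w)
gy, gx = torch.meshgrid(torch.linspace(0.5, h - 0.5, h),
                        torch.linspace(0.5, w - 0.5, w), indexing="ij")
grid = torch.stack([gx, gy], dim=-1).reshape(1, 1, h * w, 2)     # pixel centers, (1, 1, hw, 2)
attn = similarity.flatten(2).softmax(dim=-1).unsqueeze(-1)       # (bs, nq, hw, 1)
proposals = (attn * grid).sum(dim=2) / torch.tensor([w, h])      # (bs, nq, 2), normalized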
+ initial_proposals_for_loss, similarity_map, initial_proposals = ( + self.proposal_generator( + query_image_embed, refined_kp_support_embed, + spatial_shape=[h, w])) # inital_proposals has been normalized + initial_position_embedding = position_embedding.forward_coordinates(initial_proposals) + k, bs, c = refined_kp_support_embed.shape + mask_decoder_cond = self.masked_supervision and self.decoder.training + if mask_decoder_cond: + support_gt_keypoint = refined_kp_support_embed.transpose(0, 1).detach().clone() + old_keypoints = support_gt_keypoint.detach().clone().contiguous() + new_keypoints = old_keypoints * random_mask + ( + (1 - random_mask) * ~support_kp_mask.unsqueeze(-1)) * self.mask_token + new_keypoints = new_keypoints.transpose(0, 1) + attn_adj_aux = attn_adj + change_requires_grad([self.decoder, position_embedding, kpt_branch], False) + recon_hs, recon_out_points, _, _ = self.decoder( + new_keypoints, + query_image_embed.detach(), + memory_key_padding_mask=query_mask.detach(), + pos=query_image_pos_embed.detach(), + query_pos=initial_position_embedding.detach(), + tgt_key_padding_mask=support_kp_mask.detach(), + position_embedding=position_embedding, + initial_proposals=initial_proposals.detach(), + kpt_branch=kpt_branch, + adj=adj, + attn_adj=attn_adj_aux, + return_attn_map=return_attn_map) + reconstructed_keypoints = recon_out_points[-1] + change_requires_grad([self.decoder, position_embedding, kpt_branch], True) + else: + reconstructed_keypoints = None + + adj_dec, attn_adj_dec = adj, attn_adj + hs, out_points, adjs, attn_maps = self.decoder( + refined_kp_support_embed, + query_image_embed, + memory_key_padding_mask=query_mask, + pos=query_image_pos_embed, + query_pos=initial_position_embedding, + tgt_key_padding_mask=support_kp_mask, + position_embedding=position_embedding, + initial_proposals=initial_proposals, + kpt_branch=kpt_branch, + adj=adj_dec, + attn_adj=attn_adj_dec, + return_attn_map=return_attn_map) + + return ( + hs.transpose(1, 2), + initial_proposals_for_loss, + out_points, + similarity_map, + reconstructed_keypoints + ) + + +def change_requires_grad(models, status=True): + for model in models: + model.requires_grad_(status) + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, + src, + query, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + query_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + # src: [hw, bs, c] + # query: [num_query, bs, c] + # mask: None by default + # src_key_padding_mask: [bs, hw] + # query_key_padding_mask: [bs, nq] + # pos: [hw, bs, c] + + n, bs, c = src.shape + src_cat = torch.cat((src, query), dim=0) # [hw + nq, bs, c] + mask_cat = torch.cat((src_key_padding_mask, query_key_padding_mask), dim=1) # [bs, hw+nq] + output = src_cat + + for layer in self.layers: + output = layer( + output, + query_length=n, + src_mask=mask, + src_key_padding_mask=mask_cat, + pos=pos) + + if self.norm is not None: + output = self.norm(output) + + # resplit the output into src and query + refined_query = output[n:, :, :] # [nq, bs, c] + output = output[:n, :, :] # [n, bs, c] + + return output, refined_query + + +class TransformerDecoder(nn.Module): + + def __init__(self, + d_model, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + ): + super().__init__() + self.layers = 
_get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + self.ref_point_head = MLP(d_model, d_model, d_model, + 2) # this MLP will process the positional + + def forward(self, + support_feat, + query_feat, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + position_embedding=None, + initial_proposals=None, + kpt_branch=None, + adj=None, + attn_adj=None, + return_attn_map=False): + """ + position_embedding: Class used to compute positional embedding + inital_proposals: [bs, nq, 2], normalized coordinates of inital + proposals + kpt_branch: MLP used to predict the offsets for each query. + """ + + refined_support_feat = support_feat + intermediate = [] + attn_maps = [] + adjs = [] + bi = initial_proposals.detach() + query_points = [initial_proposals.detach()] + + tgt_key_padding_mask_remove_all_true = tgt_key_padding_mask.clone().to(tgt_key_padding_mask.device) + tgt_key_padding_mask_remove_all_true[tgt_key_padding_mask.logical_not().sum(dim=-1) == 0, 0] = False + + for layer_idx, layer in enumerate(self.layers): + if layer_idx == 0: # use positional embedding form inital + # proposals + query_pos_embed = query_pos.transpose(0, 1) + else: + # recalculate the positional embedding + query_pos_embed = position_embedding.forward_coordinates(bi) + query_pos_embed = query_pos_embed.transpose(0, 1) + query_pos_embed = self.ref_point_head(query_pos_embed) + + (refined_support_feat, query_feat, adjs_layer, img_attn_map, + kpt_attention_map) = layer( + refined_support_feat, + query_feat, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask_remove_all_true, + memory_key_padding_mask=memory_key_padding_mask, + concat_pos_embed=pos, + init_pos_emb=query_pos_embed, + adj=adj, + attn_adj=attn_adj, ) + + adj_gt = adjs_layer[1] # same for all layers + + if self.return_intermediate: + intermediate.append(self.norm(refined_support_feat)) + adjs.append(adjs_layer[0]) + + if return_attn_map: + attn_maps.append(img_attn_map) + + # update the query coordinates + delta_bi = kpt_branch[layer_idx]( + refined_support_feat.transpose(0, 1)) + + # Prediction loss + bi_tag = self.update(bi, delta_bi) + bi_pred = bi_tag + + bi = bi_tag.detach() + query_points.append(bi_pred) + + if self.norm is not None: + refined_support_feat = self.norm(refined_support_feat) + if self.return_intermediate: + intermediate.pop() + intermediate.append(refined_support_feat) + + if self.return_intermediate: + if adjs[0] is None: + return (torch.stack(intermediate), + query_points, + [[], adj_gt], + attn_maps) + return (torch.stack(intermediate), + query_points, + [torch.stack(adjs), adj_gt], + attn_maps) + + return (refined_support_feat.unsqueeze(0), + query_points, + [adjs, adj_gt], + attn_maps) + + def update(self, query_coordinates, delta_unsig): + query_coordinates_unsigmoid = inverse_sigmoid(query_coordinates) + new_query_coordinates = query_coordinates_unsigmoid + delta_unsig + new_query_coordinates = new_query_coordinates.sigmoid() + return new_query_coordinates + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, 
dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward(self, + src, + query_length, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src = self.with_pos_embed(src, pos) + q = k = src + # NOTE: compared with original implementation, we add positional + # embedding into the VALUE. + src2 = self.self_attn( + q, + k, + value=src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask, + need_weights=False)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + +class GCNLayer(nn.Module): + def __init__(self, + in_features, + out_features, + kernel_size=2, + use_bias=True, + activation=nn.ReLU(inplace=True), + batch_first=True): + super(GCNLayer, self).__init__() + self.conv = nn.Conv1d( + in_features, + out_features * kernel_size, + kernel_size=1, + padding=0, + stride=1, + dilation=1, + bias=use_bias, ) + + self.kernel_size = kernel_size + self.activation = activation + self.batch_first = batch_first + + def forward(self, x, adj): + assert adj.size(1) == self.kernel_size + if not self.batch_first: + x = x.permute(1, 2, 0) + else: + x = x.transpose(1, 2) + x = self.conv(x) + b, kc, v = x.size() + x = x.view(b, self.kernel_size, kc // self.kernel_size, v) + x = torch.einsum('bkcv,bkwv->bcw', (x, adj)) + if self.activation is not None: + x = self.activation(x) + if not self.batch_first: + x = x.permute(2, 0, 1) + else: + x = x.transpose(1, 2) + return x + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + attn_bias=False, + max_hops=5, + use_bias_attn_module=False, + add_pos_emb=False, + learn_hops=False, + edge_features=False, + two_way_attn=False, + ): + super().__init__() + self.d_model = d_model + self.attn_bias = attn_bias + self.max_hops = max_hops + self.learn_hops = learn_hops + self.edge_features = edge_features + self.two_way_attn = two_way_attn + if attn_bias or use_bias_attn_module: + self.self_attn = BiasedMultiheadAttention(d_model, nhead, + self_attention=True, + dropout=dropout, + max_hops=self.max_hops, + bias_attn=attn_bias) + else: + self.self_attn = nn.MultiheadAttention(d_model, nhead, + dropout=dropout) + self.add_pos_emb = add_pos_emb + self.multihead_attn = nn.MultiheadAttention(d_model * 2, nhead, dropout=dropout, vdim=d_model) + self.choker = nn.Linear(in_features=2 * d_model, out_features=d_model) + self.ffn1 = GCNLayer(d_model, dim_feedforward, batch_first=False) + self.ffn2 = nn.Linear(dim_feedforward, d_model) + self.dropout = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + if self.two_way_attn: + 
self.cross_attn_image_to_token = nn.MultiheadAttention(d_model * 2, nhead, dropout=dropout, vdim=d_model) + self.cross_attn_image_to_token_choker = nn.Linear(in_features=2 * d_model, out_features=d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm4 = nn.LayerNorm(d_model) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward(self, + refined_support_feat, + refined_query_feat, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + concat_pos_embed: Optional[Tensor] = None, + init_pos_emb: Optional[Tensor] = None, + adj: Optional[Tensor] = None, + attn_adj: Optional[Tensor] = None): + + q = k = v = refined_support_feat + if self.attn_bias: + if self.learn_hops: + attn_adj = adj[:, 1] + tgt2, kpt_attention_map = self.self_attn( + q, + k, + v, + attn_bias=attn_adj, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask, + need_weights=False) + else: + tgt2, kpt_attention_map = self.self_attn( + q, + k, + v, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask, + need_weights=False) + + refined_support_feat = refined_support_feat + self.dropout1(tgt2) + refined_support_feat = self.norm1(refined_support_feat) + # concatenate the positional embedding with the content feature, instead of direct addition + cross_attn_q = torch.cat((refined_support_feat, init_pos_emb + concat_pos_embed[refined_query_feat.shape[0]:]), + dim=-1) + cross_attn_k = torch.cat((refined_query_feat, concat_pos_embed[:refined_query_feat.shape[0]]), dim=-1) + + tgt2, attn_map = self.multihead_attn( + query=cross_attn_q, + key=cross_attn_k, + value=refined_query_feat, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask) + + refined_support_feat = refined_support_feat + self.dropout2(self.choker(tgt2)) + refined_support_feat = self.norm2(refined_support_feat) + adj_loss = gt_adj_loss = None + tgt2 = self.ffn2(self.dropout( + self.activation(self.ffn1(refined_support_feat, adj)))) + refined_support_feat = refined_support_feat + self.dropout3(tgt2) + refined_support_feat = self.norm3(refined_support_feat) + if self.two_way_attn: + q = torch.cat((refined_query_feat, concat_pos_embed[:refined_query_feat.shape[0]]), dim=-1) + k = torch.cat((refined_support_feat, init_pos_emb + concat_pos_embed[refined_query_feat.shape[0]:]), dim=-1) + v = refined_support_feat + tgt4, _ = self.cross_attn_image_to_token( + q, + k, + v, + attn_mask=tgt_mask, + need_weights=False) + refined_query_feat = refined_query_feat + self.dropout4(self.cross_attn_image_to_token_choker(tgt4)) + refined_query_feat = self.norm4(refined_query_feat) + + return refined_support_feat, refined_query_feat, [adj_loss, gt_adj_loss], attn_map, kpt_attention_map + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") + + +def clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) diff --git a/EdgeCape/models/keypoint_heads/head.py b/EdgeCape/models/keypoint_heads/head.py new file mode 
100644 index 0000000000000000000000000000000000000000..375291f7ecba4c9a96e2e42b6c9ff4697d029dc3 --- /dev/null +++ b/EdgeCape/models/keypoint_heads/head.py @@ -0,0 +1,387 @@ +from copy import deepcopy + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import (Conv2d, Linear, xavier_init) +from mmcv.cnn.bricks.transformer import build_positional_encoding +from mmpose.core.evaluation import keypoint_pck_accuracy +from mmpose.core.post_processing import transform_preds +from mmpose.models import HEADS +from mmpose.models import builder +from mmpose.models.utils.ops import resize +from EdgeCape.models.utils import build_transformer + + +# From ControlNet Rep: https://github.com/lllyasviel/ControlNet-v1-1-nightly +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def inverse_sigmoid(x, eps=1e-3): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +class TokenDecodeMLP(nn.Module): + ''' + The MLP used to predict coordinates from the support keypoints tokens. + ''' + + def __init__(self, + in_channels, + hidden_channels, + out_channels=2, + num_layers=3): + super(TokenDecodeMLP, self).__init__() + layers = [] + for i in range(num_layers): + if i == 0: + layers.append(nn.Linear(in_channels, hidden_channels)) + layers.append(nn.GELU()) + else: + layers.append(nn.Linear(hidden_channels, hidden_channels)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_channels, out_channels)) + # TODO: what about tanh / 2 + center ? + self.mlp = nn.Sequential(*layers) + + def forward(self, x): + return self.mlp(x) + + +@HEADS.register_module() +class TwoStageHead(nn.Module): + ''' + In two stage regression A3, the proposal generator are moved into transformer. + All valid proposals will be added with an positional embedding to better regress the location + ''' + + def __init__(self, + in_channels, + transformer=None, + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + normalize=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1, + train_cfg=None, + test_cfg=None, + skeleton_head=None, + learn_skeleton=False, + masked_supervision=False, + freeze=None, + model_freeze=None, + masking_ratio=0.5, + ): + super().__init__() + + self.in_channels = in_channels + self.positional_encoding = build_positional_encoding(positional_encoding) + self.encoder_positional_encoding = build_positional_encoding(positional_encoding) + self.transformer = build_transformer(transformer) + self.embed_dims = self.transformer.d_model + self.with_heatmap_loss = with_heatmap_loss + self.heatmap_loss_weight = heatmap_loss_weight + self.skeleton_loss_weight = skeleton_loss_weight + assert 'num_feats' in positional_encoding + num_feats = positional_encoding['num_feats'] + assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ + f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ + f' and {num_feats}.' + + """Initialize layers of the transformer head.""" + self.input_proj = Conv2d(self.in_channels, self.embed_dims, kernel_size=1) + self.query_proj = Linear(self.in_channels, self.embed_dims) + # Instantiate the proposal generator and subsequent keypoint branch. 
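# --- Editor's sketch (illustrative, not part of the diff) ---
# The kpt_branch MLPs constructed below predict offsets in logit space: a
# proposal in [0, 1] is mapped through inverse_sigmoid (defined above), the
# predicted delta is added, and the result is squashed back with sigmoid
# (see TransformerDecoder.update and TwoStageHead.forward). A minimal
# restatement, assuming the same eps=1e-3 clamping:
import torch

def _inverse_sigmoid(x, eps=1e-3):
    x = x.clamp(min=0, max=1)
    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))

proposal = torch.tensor([[0.25, 0.75]])   # normalized (x, y) proposal
delta = torch.tensor([[0.10, -0.20]])     # hypothetical MLP output
refined = (_inverse_sigmoid(proposal) + delta).sigmoid()
# A zero delta reproduces the proposal exactly, which is why init_weights
# below zero-initializes the last layer of each TokenDecodeMLP.
# --- end of editor's sketch ---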
+ kpt_branch = TokenDecodeMLP( + in_channels=self.embed_dims, hidden_channels=self.embed_dims) + if share_kpt_branch: + self.kpt_branch = nn.ModuleList( + [kpt_branch for i in range(num_decoder_layer)]) + else: + self.kpt_branch = nn.ModuleList( + [deepcopy(kpt_branch) for i in range(num_decoder_layer)]) + + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + self.target_type = self.test_cfg.get('target_type', 'GaussianHeatMap') + + skeleton_head['max_hop'] = transformer.get('max_hops', 4) + self.skeleton_head = builder.build_head(skeleton_head) + self.skeleton_head.init_weights() + self.learn_skeleton = learn_skeleton + self.masking_ratio = masking_ratio + transformer_d_model = transformer.get('d_model', 256) + self.masked_supervision = masked_supervision + self.transformer.masked_supervision = self.masked_supervision + self.transformer.mask_token = nn.Parameter(torch.zeros(1, transformer_d_model)) + self.transformer.masking_ratio = self.masking_ratio + self.use_zero_conv = skeleton_head.get('use_zero_conv', False) + if freeze == "skeleton" or model_freeze == "skeleton": + self.skeleton_head.requires_grad_(False) + self.input_proj.requires_grad_(False) + self.query_proj.requires_grad_(False) + elif freeze == "prediction" or model_freeze == "prediction": + self.kpt_branch.requires_grad_(False) + self.transformer.requires_grad_(False) + self.encoder_positional_encoding.requires_grad_(False) + self.transformer.freeze = freeze + + def init_weights(self): + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + """Initialize weights of the transformer head.""" + # The initialization for transformer is important + self.transformer.init_weights() + # initialization for input_proj & prediction head + for mlp in self.kpt_branch: + nn.init.constant_(mlp.mlp[-1].weight.data, 0) + nn.init.constant_(mlp.mlp[-1].bias.data, 0) + nn.init.xavier_uniform_(self.input_proj.weight, gain=1) + nn.init.constant_(self.input_proj.bias, 0) + nn.init.xavier_uniform_(self.query_proj.weight, gain=1) + nn.init.constant_(self.query_proj.bias, 0) + if self.use_zero_conv: + self.skeleton_head.zero_conv = zero_module(self.skeleton_head.zero_conv) + + def forward(self, + feature_q, + feature_s, + target_s, + mask_s, + skeleton_lst, + return_attn_maps=False, + random_mask=None): + feature_q = self.input_proj(feature_q) # [bs, dim, h64, w64], /64 resolution. + bs, dim, h, w = feature_q.shape + # Feature map pos embedding + masks = feature_q.new_zeros((feature_q.shape[0], feature_q.shape[2], feature_q.shape[3])).to(torch.bool) + query_image_pos_embed = self.positional_encoding(masks) # [bs, embed_dim, h, w] + query_embed_list = [] + for i, (feature, target) in enumerate(zip(feature_s, target_s)): + # resize the support feature back to the heatmap sizes. + resized_feature = resize( + input=feature, + size=target.shape[-2:], + mode='bilinear', + align_corners=False) + target = target / (target.sum(dim=-1).sum(dim=-1)[:, :, None, None] + 1e-8) + support_keypoints = target.flatten(2) @ resized_feature.flatten(2).permute(0, 2, 1) + query_embed_list.append(support_keypoints) + # support_keypoints is the support keypoint features. + support_keypoints = torch.mean(torch.stack(query_embed_list, dim=0), 0) + support_keypoints = support_keypoints * mask_s + support_keypoints = self.query_proj(support_keypoints) + kp_mask = (~mask_s.to(torch.bool)).squeeze(-1) # True indicating this query matched no actual joints. 
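# --- Editor's sketch (illustrative, not part of the diff) ---
# The loop above pools one feature vector per support keypoint: each GT
# heatmap is normalized to sum to 1 and used as spatial weights over the
# (resized) support feature map. Shape walk-through with hypothetical sizes:
import torch

bs, K, C, h, w = 2, 17, 256, 64, 64
target = torch.rand(bs, K, h, w)           # per-keypoint support heatmaps
feature = torch.rand(bs, C, h, w)          # resized support feature map
target = target / (target.sum(dim=-1).sum(dim=-1)[:, :, None, None] + 1e-8)
support_kp = target.flatten(2) @ feature.flatten(2).permute(0, 2, 1)
assert support_kp.shape == (bs, K, C)      # one C-dim token per keypoint
# Invisible keypoints are then zeroed via `support_keypoints * mask_s`, and
# kp_mask flags them (True = no annotated joint) for the attention masks.
# --- end of editor's sketch ---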
+ kp_pos_embedding = feature_q.new_zeros((bs, self.embed_dims, 1, target_s[0].shape[1])).to(torch.bool) + + # Predict Skeleton + skeleton_kp_embed = kp_pos_embedding + support_image_features = feature_s + support_keypoints_skeleton, support_image_features_skeleton = support_keypoints, support_image_features + adj, attn_adj, unnormalized_adj = self.skeleton_head(skeleton_lst, + support_keypoints_skeleton, + support_image_features_skeleton, + kp_mask, + query_image_pos_embed) + + (outs_dec, initial_proposals, out_points, + similarity_map, reconstructed_keypoints) = self.transformer(feature_q, + masks, + support_keypoints, + query_image_pos_embed, + kp_pos_embedding, + kp_mask, + self.positional_encoding, + self.kpt_branch, + adj, + attn_adj, + return_attn_map=return_attn_maps, + random_mask=random_mask) + + output_kpts = [] + for idx in range(outs_dec.shape[0]): + layer_delta_unsig = self.kpt_branch[idx](outs_dec[idx]) + layer_outputs_unsig = layer_delta_unsig + inverse_sigmoid(out_points[idx]) + output_kpts.append(layer_outputs_unsig.sigmoid()) + + return torch.stack(output_kpts, dim=0), initial_proposals, similarity_map, reconstructed_keypoints + + def get_loss(self, output, initial_proposals, similarity_map, target, + target_heatmap, target_weight, target_sizes, reconstructed_keypoints): + """Calculate top-down keypoint loss.""" + + losses = dict() + if self.learn_skeleton: + num_dec_layer, bs, nq = output.shape[:3] + normalizer = target_weight.squeeze(dim=-1).sum(dim=-1) # [bs, ] + normalizer[normalizer == 0] = 1 + reconstructed_keypoints, mask_indices = reconstructed_keypoints + support_gt_keypoints = target / target_sizes.to(output.device) + pred_loss = F.l1_loss(reconstructed_keypoints, support_gt_keypoints, reduction="none") + pred_loss = pred_loss.sum(dim=-1, keepdim=False) * target_weight.squeeze(dim=-1) + pred_loss = pred_loss.sum(dim=-1, keepdim=False) / normalizer + pred_loss = pred_loss.sum(dim=-1, keepdim=False) / bs + pred_loss = pred_loss * self.skeleton_loss_weight + losses['adj_reconstruct_loss'] = pred_loss.sum() + + num_dec_layer, bs, nq = output.shape[:3] + target_sizes = target_sizes.to(output.device) # [bs, 1, 2] + target = target / target_sizes + target = target[None, :, :, :].repeat(num_dec_layer, 1, 1, 1) + # set the weight for unset query point to be zero + normalizer = target_weight.squeeze(dim=-1).sum(dim=-1) # [bs, ] + normalizer[normalizer == 0] = 1 + + # compute the heatmap loss + if self.with_heatmap_loss: + losses['heatmap_loss'] = self.heatmap_loss( + similarity_map, target_heatmap, target_weight, + normalizer) * self.heatmap_loss_weight + + # compute L1 loss for inital_proposals + proposal_l1_loss = F.l1_loss(initial_proposals, target[0], reduction="none") + proposal_l1_loss = proposal_l1_loss.sum(dim=-1, keepdim=False) * target_weight.squeeze(dim=-1) + proposal_l1_loss = proposal_l1_loss.sum(dim=-1, keepdim=False) / normalizer # [bs, ] + losses['proposal_loss'] = proposal_l1_loss.sum() / bs + + # compute L1 loss for each layer + for idx in range(num_dec_layer): + layer_output, layer_target = output[idx], target[idx] + l1_loss = F.l1_loss(layer_output, layer_target, reduction="none") # [bs, query, 2] + l1_loss = l1_loss.sum(dim=-1, keepdim=False) * target_weight.squeeze(dim=-1) # [bs, query] + # normalize the loss for each sample with the number of visible joints + l1_loss = l1_loss.sum(dim=-1, keepdim=False) / normalizer # [bs, ] + losses['l1_loss' + '_layer' + str(idx)] = l1_loss.sum() / bs + + return losses + + def heatmap_loss(self, similarity_map, 
target_heatmap, target_weight, + normalizer): + # similarity_map: [bs, num_query, h, w] + # target_heatmap: [bs, num_query, sh, sw] + # target_weight: [bs, num_query, 1] + + # preprocess the similarity_map + h, w = similarity_map.shape[-2:] + # similarity_map = torch.clamp(similarity_map, 0.0, None) + similarity_map = similarity_map.sigmoid() + + target_heatmap = F.interpolate( + target_heatmap, size=(h, w), mode='bilinear') + target_heatmap = (target_heatmap / + (target_heatmap.max(dim=-1)[0].max(dim=-1)[0] + 1e-10)[:, :, None, None]) + + l2_loss = F.mse_loss(similarity_map, target_heatmap, reduction="none") # bs, nq, h, w + l2_loss = l2_loss * target_weight[:, :, :, None] # bs, nq, h, w + l2_loss = l2_loss.flatten(2, 3).sum(-1) / (h * w) # bs, nq + l2_loss = l2_loss.sum(-1) / normalizer # bs, + + return l2_loss.mean() + + def get_accuracy(self, output, target, target_weight, target_sizes, height=256): + """Calculate accuracy for top-down keypoint loss. + + Args: + output (torch.Tensor[NxKx2]): estimated keypoints in ABSOLUTE coordinates. + target (torch.Tensor[NxKx2]): gt keypoints in ABSOLUTE coordinates. + target_weight (torch.Tensor[NxKx1]): Weights across different joint types. + target_sizes (torch.Tensor[Nx2): shapes of the image. + """ + # NOTE: In POMNet, PCK is estimated on 1/8 resolution, which is slightly different here. + + accuracy = dict() + output = output * float(height) + output, target, target_weight, target_sizes = ( + output.detach().cpu().numpy(), target.detach().cpu().numpy(), + target_weight.squeeze(-1).long().detach().cpu().numpy(), + target_sizes.squeeze(1).detach().cpu().numpy()) + + _, avg_acc, _ = keypoint_pck_accuracy( + output, + target, + target_weight.astype(np.bool8), + thr=0.2, + normalize=target_sizes) + accuracy['acc_pose'] = float(avg_acc) + + return accuracy + + def decode(self, img_metas, output, img_size, **kwargs): + """Decode the predicted keypoints from prediction. + + Args: + img_metas (list(dict)): Information about data augmentation + By default this includes: + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + output (np.ndarray[N, K, H, W]): model predicted heatmaps. + """ + batch_size = len(img_metas) + W, H = img_size + output = output * np.array([W, H])[None, None, :] # [bs, query, 2], coordinates with recovered shapes. 
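# --- Editor's sketch (illustrative, not part of the diff) ---
# get_accuracy above reports PCK@0.2: a keypoint counts as correct when its
# distance to the ground truth, normalized per axis by the size passed in
# `normalize`, falls below thr=0.2, averaged over visible joints. A rough
# NumPy restatement of what mmpose's keypoint_pck_accuracy computes (the
# library additionally averages per keypoint type):
import numpy as np

def pck(pred, gt, visible, normalize, thr=0.2):
    # pred, gt: [N, K, 2]; visible: [N, K] bool; normalize: [N, 2]
    dist = np.linalg.norm((pred - gt) / normalize[:, None, :], axis=-1)
    correct = (dist < thr) & visible
    return correct.sum() / max(visible.sum(), 1)
# --- end of editor's sketch ---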
+ + if 'bbox_id' or 'query_bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + c = np.zeros((batch_size, 2), dtype=np.float32) + s = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size) + for i in range(batch_size): + c[i, :] = img_metas[i]['query_center'] + s[i, :] = img_metas[i]['query_scale'] + image_paths.append(img_metas[i]['query_image_file']) + + if 'query_bbox_score' in img_metas[i]: + score[i] = np.array( + img_metas[i]['query_bbox_score']).reshape(-1) + if 'bbox_id' in img_metas[i]: + bbox_ids.append(img_metas[i]['bbox_id']) + elif 'query_bbox_id' in img_metas[i]: + bbox_ids.append(img_metas[i]['query_bbox_id']) + + preds = np.zeros(output.shape) + for idx in range(output.shape[0]): + preds[idx] = transform_preds( + output[idx], + c[idx], + s[idx], [W, H], + use_udp=self.test_cfg.get('use_udp', False)) + + all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32) + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_preds[:, :, 0:2] = preds[:, :, 0:2] + all_preds[:, :, 2:3] = 1.0 # NOTE: Currently, assume all predicted points are of 100% confidence. + all_boxes[:, 0:2] = c[:, 0:2] + all_boxes[:, 2:4] = s[:, 0:2] + all_boxes[:, 4] = np.prod(s * 200.0, axis=1) + all_boxes[:, 5] = score + + result = {} + + result['preds'] = all_preds + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + return result diff --git a/EdgeCape/models/keypoint_heads/skeleton.py b/EdgeCape/models/keypoint_heads/skeleton.py new file mode 100644 index 0000000000000000000000000000000000000000..ee530de319e88dd706c8f68770b436bdd9a3cac3 --- /dev/null +++ b/EdgeCape/models/keypoint_heads/skeleton.py @@ -0,0 +1,208 @@ +import random +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init +from mmpose.models import HEADS +from EdgeCape.models.keypoint_heads.encoder_decoder import (TransformerDecoderLayer, _get_clones) + + +@HEADS.register_module() +class SkeletonPredictor(nn.Module): + def __init__(self, + d_model=256, + nhead=8, + num_layers=3, + dim_feedforward=768, + dropout=0.1, + activation="relu", + normalize_before=False, + learn_skeleton: bool = False, + max_hop: int = 5, + adj_normalization: bool = True, + markov_bias: bool = True, + mask_res: bool = False, + use_zero_conv: bool = True, + max_hops: int = 4, + two_way_attn: bool = True, + gcn_norm: bool = False, ): + super(SkeletonPredictor, self).__init__() + if num_layers > 0: + decoder_layer = TransformerDecoderLayer(d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation='relu', + normalize_before=normalize_before, + max_hops=max_hops, + two_way_attn=two_way_attn) + self.skeleton_predictor = _get_clones(decoder_layer, num_layers) + self.gcn_norm = gcn_norm + self.image_project = nn.Conv2d(dim_feedforward, d_model, kernel_size=1) + self.learn_skeleton = learn_skeleton + self.max_hop = max_hop + if activation == "relu": + self.activation = nn.ReLU() + else: + self.activation = nn.Sigmoid() + self.adj_normalization = adj_normalization + self.markov_bias = markov_bias + self.k_proj = nn.Linear(d_model, d_model) + self.q_proj = nn.Linear(d_model, d_model) + self.mh_linear = nn.Conv2d(nhead, 1, kernel_size=1) + self.num_heads = nhead + self.mask_res = mask_res + self.use_zero_conv = use_zero_conv + if self.use_zero_conv: + self.zero_conv = nn.Conv2d(1, 1, kernel_size=1, stride=1, padding=0) + + def forward(self, + skeleton: list, + kp_features: torch.Tensor, + 
image_features: torch.Tensor, + kp_mask: torch.Tensor, + query_image_pos_embed: torch.Tensor, + ) -> [torch.Tensor, torch.Tensor]: + + assert skeleton is not None + b, num_pts, _ = kp_features.shape + gt_adj, _ = self.adj_mx_from_edges(num_pts=num_pts, + skeleton=skeleton, + mask=kp_mask, + device=kp_features.device) + binary_adj = gt_adj[:, 1] > 0 + if not self.learn_skeleton: + return gt_adj, None, binary_adj + adj, adj_for_attn, unnnormalized_adj = self.predict_adj(image_features=image_features, + kp_features=kp_features, + kp_mask=kp_mask, + query_image_pos_embed=query_image_pos_embed, + gt_adj=binary_adj) + return adj, adj_for_attn, unnnormalized_adj + + def refine_features(self, + image_features: torch.Tensor, + kp_features: torch.Tensor, + kp_mask: torch.Tensor, + query_image_pos_embed: torch.Tensor, + adj: torch.Tensor = None, + ): + + bs, num_pts, _ = kp_features.shape + adj = self.soft_normalize_adj(adj, kp_mask) + image_features = [self.image_project(image_feature) for image_feature in image_features] + zero_pos_embed = torch.zeros_like(kp_features).flatten(2).permute(1, 0, 2) + query_image_pos_embed = query_image_pos_embed.flatten(2).permute(2, 0, 1) + concat_pos_embed = torch.cat((query_image_pos_embed, zero_pos_embed)) + kp_features = kp_features.flatten(2).permute(1, 0, 2) + image_features = [image_feature.flatten(2).permute(2, 0, 1) for image_feature in image_features] + tgt_key_padding_mask_remove_all_true = kp_mask.clone().to(kp_mask.device) + tgt_key_padding_mask_remove_all_true[kp_mask.logical_not().sum(dim=-1) == 0, 0] = False + kp_feat_lst = [] + for s, image_feature in enumerate(image_features): + s_kp_features = kp_features.clone() + for i, layer in enumerate(self.skeleton_predictor): + s_kp_features, image_feature, _, _, _ = layer( + s_kp_features, + image_feature, + tgt_key_padding_mask=tgt_key_padding_mask_remove_all_true, + concat_pos_embed=concat_pos_embed, + init_pos_emb=zero_pos_embed, + adj=adj, + ) + kp_feat_lst.append(s_kp_features.permute(1, 0, 2)) + kp_features = torch.mean(torch.stack(kp_feat_lst, dim=0), 0) + + return kp_features + + def predict_adj(self, + image_features: torch.Tensor, + kp_features: torch.Tensor, + kp_mask: torch.Tensor, + query_image_pos_embed: torch.Tensor, + gt_adj: torch.Tensor = None): + + kp_features = self.refine_features(image_features, + kp_features, + kp_mask, + query_image_pos_embed, + gt_adj) + + normalized_adj, unnnormalized_adj = self.predict_skeleton(kp_features, kp_mask, gt_adj) + attn_bias_matrix = self.markov_transition_matrix(normalized_adj[:, 1]) + return normalized_adj, attn_bias_matrix, unnnormalized_adj + + def predict_skeleton(self, kp_features, kp_mask, gt_adj): + bs, num_pts, _ = kp_features.shape + # Self-attention matrix from kp_features + kp_features = kp_features.permute(1, 0, 2) * ~kp_mask.transpose(0, 1).unsqueeze(-1) + q_kp = self.q_proj(kp_features).contiguous().view(num_pts, bs * self.num_heads, -1).transpose(0, 1) + k_kp = self.k_proj(kp_features).contiguous().view(num_pts, bs * self.num_heads, -1).transpose(0, 1) + attn = torch.bmm(q_kp, k_kp.transpose(1, 2)).view(bs, self.num_heads, num_pts, num_pts) + unnormalized_adj_matrix = self.mh_linear(attn).squeeze(1) + unnormalized_adj_matrix = (unnormalized_adj_matrix + unnormalized_adj_matrix.transpose(1, 2)) / 2 + unnormalized_adj_matrix = self.combine_adj(gt_adj, unnormalized_adj_matrix) + unnormalized_adj_matrix = self.activation(unnormalized_adj_matrix) + normalized_adj = self.soft_normalize_adj(unnormalized_adj_matrix, kp_mask, gt_adj) + 
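# --- Editor's sketch (illustrative, not part of the diff) ---
# predict_skeleton turns pairwise keypoint similarities into a learned
# adjacency: the single-channel logits are symmetrized, added to the binary
# GT skeleton (optionally through the zero-initialized 1x1 conv,
# ControlNet-style), passed through the activation, then row-normalized with
# padded keypoints masked out. A minimal restatement with made-up numbers:
import torch

logits = torch.rand(1, 5, 5)                                 # hypothetical pairwise scores
logits = (logits + logits.transpose(1, 2)) / 2               # enforce symmetry
gt_adj = torch.eye(5)[None]                                  # stand-in GT skeleton
adj = torch.relu(gt_adj + logits)                            # combine_adj + activation
kp_mask = torch.tensor([[False, False, False, True, True]])  # last two joints padded
adj = adj * ~kp_mask[..., None] * ~kp_mask[:, None]          # zero padded rows/columns
adj = adj / (adj.sum(dim=-1, keepdim=True) + 1e-8)           # row-stochastic, as in soft_normalize_adj
# --- end of editor's sketch ---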
unnormalized_adj_matrix = unnormalized_adj_matrix * ~kp_mask.unsqueeze(-1) * ~kp_mask.unsqueeze(-2) + return normalized_adj, unnormalized_adj_matrix + + def combine_adj(self, gt_adj, predicted_adj): + if self.use_zero_conv: + predicted_adj = self.zero_conv(predicted_adj.unsqueeze(1)).squeeze(1) + adj = gt_adj + predicted_adj + return adj + + def markov_transition_matrix(self, adj): + """ + Compute the Markov transition matrix from the adjacency matrix. + :param adj: (bs, num_pts, num_pts) + :return: (bs, num_pts, num_pts) + """ + adj = adj / (adj.sum(dim=-1, keepdim=True) + 1e-8) + transfer_mat = [torch.matrix_power(adj.float(), d) for d in range(self.max_hop + 1)] + arrive_mat = torch.stack(transfer_mat) + return arrive_mat + + def init_weights(self): + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + """Initialize weights of the transformer head.""" + # nn.init.xavier_uniform_(self.input_proj.weight, gain=1) + # nn.init.constant_(self.input_proj.bias, 0) + + def adj_mx_from_edges(self, num_pts, skeleton, mask=None, device='cuda'): + binary_adj_mx = torch.empty(0, device=device) + batch_size = len(skeleton) + for b in range(batch_size): + edges = torch.tensor(skeleton[b]) + adj = torch.zeros(num_pts, num_pts, device=device) + if len(edges.shape) > 1: + adj[edges[:, 0], edges[:, 1]] = 1 + adj[edges[:, 1], edges[:, 0]] = 1 + binary_adj_mx = torch.concatenate((binary_adj_mx, adj.unsqueeze(0)), dim=0) + if mask is not None: + adj = self.normalize_adj(binary_adj_mx, mask) + else: + adj = None + return adj, binary_adj_mx + + def normalize_adj(self, binary_adj_mx, mask): + trans_adj_mx = torch.transpose(binary_adj_mx, 1, 2) + cond = (trans_adj_mx > binary_adj_mx).float() + adj_unnormalized = binary_adj_mx + trans_adj_mx * cond - binary_adj_mx * cond + adj = adj_unnormalized * ~mask[..., None] * ~mask[:, None] + adj = torch.nan_to_num(adj / adj.sum(dim=-1, keepdim=True)) + adj = torch.stack((torch.diag_embed(~mask), adj), dim=1) + return adj + + def soft_normalize_adj(self, adj_mx, mask, gt_adj=None): + adj_mask = ~mask[..., None] * ~mask[:, None] + if self.mask_res and gt_adj is not None: + adj_mask = gt_adj + adj = adj_mx * adj_mask + if self.adj_normalization: + adj = adj / (adj.sum(dim=-1, keepdim=True) + 1e-8) + if not self.gcn_norm: + adj = torch.stack((torch.diag_embed(~mask), adj), dim=1) + return adj diff --git a/EdgeCape/models/utils/__init__.py b/EdgeCape/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..af4e06657cc4b8a6e0c1b306cd4011aebead30dc --- /dev/null +++ b/EdgeCape/models/utils/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
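# --- Editor's sketch (illustrative, not part of the diff) ---
# markov_transition_matrix above stacks the powers A^0 ... A^max_hop of the
# row-normalized adjacency, i.e. the probability of reaching joint j from
# joint i in exactly d random-walk steps; BiasedMultiheadAttention later
# projects these per-hop maps into per-head attention biases. Tiny example
# on a hypothetical 3-joint chain 0-1-2:
import torch

edges = torch.tensor([[0, 1], [1, 2]])
A = torch.zeros(3, 3)
A[edges[:, 0], edges[:, 1]] = 1
A[edges[:, 1], edges[:, 0]] = 1                  # symmetric binary adjacency
A = A / (A.sum(dim=-1, keepdim=True) + 1e-8)     # row-normalize
hops = torch.stack([torch.matrix_power(A, d) for d in range(3)])
# hops[0] is the identity, hops[1] one-step transitions, hops[2] two-step, ...
# --- end of editor's sketch ---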
+from .builder import build_linear_layer, build_transformer, build_backbone +from .transformer import (DetrTransformerDecoderLayer, DetrTransformerDecoder, + DetrTransformerEncoder, DynamicConv) +from .positional_encoding import (LearnedPositionalEncoding, + SinePositionalEncoding) + +from EdgeCape.models.keypoint_heads.encoder_decoder import TwoStageSupportRefineTransformer + +__all__ = [ + 'build_transformer', 'build_backbone', 'build_linear_layer', 'DetrTransformerDecoderLayer', + 'DetrTransformerDecoder', 'DetrTransformerEncoder', + 'LearnedPositionalEncoding', 'SinePositionalEncoding', + 'TwoStageSupportRefineTransformer', +] diff --git a/EdgeCape/models/utils/__pycache__/__init__.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77cd369ac355297ee71f071ba0d2be918d29ed00 Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/__pycache__/bias_attn.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/bias_attn.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b88361a03ac2931ef6ef21981fdb4d09aea6a9dd Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/bias_attn.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/__pycache__/builder.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/builder.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74a056f3a44266f13d32cef67348a9f240847f08 Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/builder.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/__pycache__/positional_encoding.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/positional_encoding.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2daeaaefd66ba34f5b43b324fecc52bbc6bb9ff5 Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/positional_encoding.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/__pycache__/transformer.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/transformer.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7eb92254f1a20bf6f818903f5b0b9b16096f9d2d Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/transformer.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/__pycache__/visualization.cpython-39.pyc b/EdgeCape/models/utils/__pycache__/visualization.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7c16e1384c7c2e46f71536f0ae3c0050361d029 Binary files /dev/null and b/EdgeCape/models/utils/__pycache__/visualization.cpython-39.pyc differ diff --git a/EdgeCape/models/utils/backbone.py b/EdgeCape/models/utils/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..65c3a0d2eb30060ad3c456382747ae27d3805422 --- /dev/null +++ b/EdgeCape/models/utils/backbone.py @@ -0,0 +1,116 @@ +# -------------------------------------------------------- +# SimMIM +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# Modified by Zhenda Xie +# -------------------------------------------------------- + +import os +import torch +import torch.distributed as dist +import numpy as np +from scipy import interpolate + +def load_pretrained(config, model, logger): + checkpoint = torch.load(config, map_location='cpu') + checkpoint_model = checkpoint['model'] + + if any([True if 'encoder.' 
in k else False for k in checkpoint_model.keys()]): + checkpoint_model = {k.replace('encoder.', ''): v for k, v in checkpoint_model.items() if k.startswith('encoder.')} + print('Detect pre-trained model, remove [encoder.] prefix.') + else: + print('Detect non-pre-trained model, pass without doing anything.') + + checkpoint = remap_pretrained_keys_swin(model, checkpoint_model, logger) + msg = model.load_state_dict(checkpoint_model, strict=False) + print(msg) + + del checkpoint + torch.cuda.empty_cache() + + +def remap_pretrained_keys_swin(model, checkpoint_model, logger): + state_dict = model.state_dict() + + # Geometric interpolation when pre-trained patch size mismatch with fine-tuned patch size + all_keys = list(checkpoint_model.keys()) + for key in all_keys: + if "relative_position_bias_table" in key: + relative_position_bias_table_pretrained = checkpoint_model[key] + relative_position_bias_table_current = state_dict[key] + L1, nH1 = relative_position_bias_table_pretrained.size() + L2, nH2 = relative_position_bias_table_current.size() + if nH1 != nH2: + print(f"Error in loading {key}, passing......") + else: + if L1 != L2: + print(f"{key}: Interpolate relative_position_bias_table using geo.") + src_size = int(L1 ** 0.5) + dst_size = int(L2 ** 0.5) + + def geometric_progression(a, r, n): + return a * (1.0 - r ** n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + # if q > 1.090307: + # q = 1.090307 + + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q ** (i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + + print("Original positions = %s" % str(x)) + print("Target positions = %s" % str(dx)) + + all_rel_pos_bias = [] + + for i in range(nH1): + z = relative_position_bias_table_pretrained[:, i].view(src_size, src_size).float().numpy() + f_cubic = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append(torch.Tensor(f_cubic(dx, dy)).contiguous().view(-1, 1).to( + relative_position_bias_table_pretrained.device)) + + new_rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) + checkpoint_model[key] = new_rel_pos_bias + + # delete relative_position_index since we always re-init it + relative_position_index_keys = [k for k in checkpoint_model.keys() if "relative_position_index" in k] + for k in relative_position_index_keys: + del checkpoint_model[k] + + # delete relative_coords_table since we always re-init it + relative_coords_table_keys = [k for k in checkpoint_model.keys() if "relative_coords_table" in k] + for k in relative_coords_table_keys: + del checkpoint_model[k] + + # re-map keys due to name change + rpe_mlp_keys = [k for k in checkpoint_model.keys() if "rpe_mlp" in k] + for k in rpe_mlp_keys: + checkpoint_model[k.replace('rpe_mlp', 'cpb_mlp')] = checkpoint_model.pop(k) + + # delete attn_mask since we always re-init it + attn_mask_keys = [k for k in checkpoint_model.keys() if "attn_mask" in k] + for k in attn_mask_keys: + del checkpoint_model[k] + + return checkpoint_model \ No newline at end of file diff --git a/EdgeCape/models/utils/bias_attn.py b/EdgeCape/models/utils/bias_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..1dbe51e203dd3267dfe027d45a5adfa0ecc7ef02 --- /dev/null +++ b/EdgeCape/models/utils/bias_attn.py @@ -0,0 
+1,265 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Optional, Tuple + +import torch +import torchvision +from fairseq import utils +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.quant_noise import quant_noise +from torch import Tensor, nn +from torchvision.ops import MLP + + +class BiasedMultiheadAttention(nn.Module): + """Multi-headed attention. + + See "Attention Is All You Need" for more details. + """ + + def __init__( + self, + embed_dim, + num_heads, + kdim=None, + vdim=None, + dropout=0.0, + bias_attn=False, + bias=True, + self_attention=False, + q_noise=0.0, + qn_block_size=8, + max_hops=5, + ): + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout_module = FairseqDropout( + dropout, module_name=self.__class__.__name__ + ) + + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim ** -0.5 + + self.self_attention = self_attention + + assert self.self_attention, "Only support self attention" + + assert not self.self_attention or self.qkv_same_dim, ( + "Self-attention requires query, key and " "value to be of the same size" + ) + + self.k_proj = quant_noise( + nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size + ) + self.v_proj = quant_noise( + nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size + ) + self.q_proj = quant_noise( + nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size + ) + + self.out_proj = quant_noise( + nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size + ) + self.max_hops = max_hops + self.bias_attn = bias_attn + if bias_attn: + self.markov_structural_mlp = MLP(self.max_hops + 1, + [self.max_hops + num_heads, num_heads]) + self.reset_parameters() + self.onnx_trace = False + + def prepare_for_onnx_export_(self): + raise NotImplementedError + + def reset_parameters(self): + if self.qkv_same_dim: + # Empirically observed the convergence to be much better with + # the scaled initialization + nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) + nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) + nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) + else: + nn.init.xavier_uniform_(self.k_proj.weight) + nn.init.xavier_uniform_(self.v_proj.weight) + nn.init.xavier_uniform_(self.q_proj.weight) + + nn.init.xavier_uniform_(self.out_proj.weight) + if self.out_proj.bias is not None: + nn.init.constant_(self.out_proj.bias, 0.0) + + def forward( + self, + query, + key: Optional[Tensor], + value: Optional[Tensor], + attn_bias: Optional[Tensor] = None, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + before_softmax: bool = False, + need_head_weights: bool = False, + ) -> Tuple[Tensor, Optional[Tensor]]: + """Input shape: Time x Batch x Channel + + Args: + key_padding_mask (ByteTensor, optional): mask to exclude + keys that are pads, of shape `(batch, src_len)`, where + padding elements are 
indicated by 1s. + need_weights (bool, optional): return the attention weights, + averaged over heads (default: False). + attn_mask (ByteTensor, optional): typically used to + implement causal attention, where the mask prevents the + attention from looking forward in time (default: None). + before_softmax (bool, optional): return the raw attention + weights and values before the attention softmax. + need_head_weights (bool, optional): return the attention + weights for each head. Implies *need_weights*. Default: + return the average attention weights over all heads. + """ + if need_head_weights: + need_weights = True + + tgt_len, bsz, embed_dim = query.size() + src_len = tgt_len + assert embed_dim == self.embed_dim, f"query dim {embed_dim} != {self.embed_dim}" + assert list(query.size()) == [tgt_len, bsz, embed_dim] + if key is not None: + src_len, key_bsz, _ = key.size() + if not torch.jit.is_scripting(): + assert key_bsz == bsz + assert value is not None + assert src_len, bsz == value.shape[:2] + + q = self.q_proj(query) + k = self.k_proj(query) + v = self.v_proj(query) + q *= self.scaling + + q = ( + q.contiguous() + .view(tgt_len, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + if k is not None: + k = ( + k.contiguous() + .view(-1, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + if v is not None: + v = ( + v.contiguous() + .view(-1, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + assert k is not None + assert k.size(1) == src_len + + # This is part of a workaround to get around fork/join parallelism + # not supporting Optional types. + if key_padding_mask is not None and key_padding_mask.dim() == 0: + key_padding_mask = None + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + attn_weights = torch.bmm(q, k.transpose(1, 2)) + attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) + + assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] + + if self.bias_attn and attn_bias is not None: + attn_bias_val = self.markov_structural_mlp(attn_bias.permute(1, 2, 3, 0)) + attn_bias_val = attn_bias_val.permute(0, 3, 1, 2).reshape(bsz * self.num_heads, tgt_len, src_len) + attn_weights += attn_bias_val + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(0) + attn_weights += attn_mask + + if key_padding_mask is not None: + # don't attend to padding symbols + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), + float("-inf"), + ) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if before_softmax: + return attn_weights, v + + attn_weights_float = utils.softmax( + attn_weights, dim=-1, onnx_trace=self.onnx_trace + ) + attn_weights = attn_weights_float.type_as(attn_weights) + attn_probs = self.dropout_module(attn_weights) + + assert v is not None + attn = torch.bmm(attn_probs, v) + assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] + + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) + attn = self.out_proj(attn) + + attn_weights: Optional[Tensor] = None + if need_weights: + attn_weights = attn_weights_float.view( + bsz, self.num_heads, tgt_len, src_len + ).transpose(1, 0) + if not need_head_weights: + # average attention weights over heads + attn_weights = attn_weights.mean(dim=0) + + return attn, attn_weights + + def 
apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int): + return attn_weights + + def rename_state_dict(self, state_dict, prefix): + # prefix = name + "." if name != "" else "" + items_to_add = {} + keys_to_remove = [] + for k in state_dict.keys(): + if k.startswith(prefix): + if k.endswith(prefix + "in_proj_weight"): + # in_proj_weight used to be q + k + v with same dimensions + dim = int(state_dict[k].shape[0] / 3) + items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim] + items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim: 2 * dim] + items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim:] + keys_to_remove.append(k) + + if k.endswith(prefix + "in_proj_bias"): + dim = int(state_dict[k].shape[0] / 3) + items_to_add[prefix + "q_proj.bias"] = state_dict[k][:dim] + items_to_add[prefix + "k_proj.bias"] = state_dict[k][dim: 2 * dim] + items_to_add[prefix + "v_proj.bias"] = state_dict[k][2 * dim:] + keys_to_remove.append(prefix + "in_proj_bias") + + for k in keys_to_remove: + del state_dict[k] + + for key, value in items_to_add.items(): + state_dict[key] = value + + def _load_from_state_dict(self, state_dict, name, *args, **kwargs): + self.rename_state_dict(state_dict, name) + super()._load_from_state_dict(state_dict, name, *args, **kwargs) diff --git a/EdgeCape/models/utils/builder.py b/EdgeCape/models/utils/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..f695813d0a6f83d25568997169870a2801f511d1 --- /dev/null +++ b/EdgeCape/models/utils/builder.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.utils import Registry, build_from_cfg + +TRANSFORMER = Registry('Transformer') +BACKBONES = Registry('BACKBONES') +LINEAR_LAYERS = Registry('linear layers') + + +def build_backbone(cfg, default_args=None): + """Build backbone.""" + return build_from_cfg(cfg, BACKBONES, default_args) + +def build_transformer(cfg, default_args=None): + """Builder for Transformer.""" + return build_from_cfg(cfg, TRANSFORMER, default_args) + + +LINEAR_LAYERS.register_module('Linear', module=nn.Linear) + + +def build_linear_layer(cfg, *args, **kwargs): + """Build linear layer. + Args: + cfg (None or dict): The linear layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate an linear layer. + args (argument list): Arguments passed to the `__init__` + method of the corresponding linear layer. + kwargs (keyword arguments): Keyword arguments passed to the `__init__` + method of the corresponding linear layer. + Returns: + nn.Module: Created linear layer. + """ + if cfg is None: + cfg_ = dict(type='Linear') + else: + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in LINEAR_LAYERS: + raise KeyError(f'Unrecognized linear type {layer_type}') + else: + linear_layer = LINEAR_LAYERS.get(layer_type) + + layer = linear_layer(*args, **kwargs, **cfg_) + + return layer diff --git a/EdgeCape/models/utils/positional_encoding.py b/EdgeCape/models/utils/positional_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..62b6e4dfb57b740e364d78665b63582a9fc66cd4 --- /dev/null +++ b/EdgeCape/models/utils/positional_encoding.py @@ -0,0 +1,219 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
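# --- Editor's note (illustrative usage, not part of the diff) ---
# builder.py above follows the usual mmcv Registry pattern: modules register
# themselves by name and are later instantiated from a config dict via
# build_from_cfg. build_linear_layer falls back to a plain nn.Linear when the
# cfg is None, forwarding positional arguments to the layer's __init__:
import torch.nn as nn
from EdgeCape.models.utils.builder import build_linear_layer

proj = build_linear_layer(None, 256, 128)
assert isinstance(proj, nn.Linear)
# Likewise, a transformer decorated with @TRANSFORMER.register_module() can be
# created with build_transformer(dict(type='<registered name>', ...)).
# --- end of editor's note ---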
+import math + +import torch +import torch.nn as nn +from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING +from mmcv.runner import BaseModule + +#TODO: add an SinePositionalEncoding for coordinates input + +@POSITIONAL_ENCODING.register_module() +class SinePositionalEncoding(BaseModule): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + num_feats, + temperature=10000, + normalize=False, + scale=2 * math.pi, + eps=1e-6, + offset=0., + init_cfg=None): + super(SinePositionalEncoding, self).__init__(init_cfg) + if normalize: + assert isinstance(scale, (float, int)), 'when normalize is set,' \ + 'scale should be provided and in float or int type, ' \ + f'found {type(scale)}' + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + self.offset = offset + + def forward(self, mask): + """Forward function for `SinePositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. 
+ mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + y_embed = not_mask.cumsum(1, dtype=torch.float32) # [bs, h, w], recording the y coordinate ot each pixel + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: # default True + y_embed = (y_embed + self.offset) / \ + (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + pos_x = x_embed[:, :, :, None] / dim_t # [bs, h, w, num_feats] + pos_y = y_embed[:, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + B, H, W = mask.size() + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) # [bs, h, w, num_feats] + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def forward_coordinates(self, coord): + """ + Forward funtion for normalized coordinates input with the shape of [bs, kpt, 2] + return: + pos (Tensor): position embedding with the shape of [bs, kpt, num_feats*2] + """ + x_embed, y_embed = coord[:,:,0], coord[:,:,1] # [bs, kpt] + x_embed = x_embed * self.scale # [bs, kpt] + y_embed = y_embed * self.scale + + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=coord.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + + pos_x = x_embed[:, :, None] / dim_t # [bs, kpt, num_feats] + pos_y = y_embed[:, :, None] / dim_t # [bs, kpt, num_feats] + bs, kpt, _ = pos_x.shape + + pos_x = torch.stack( + (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), + dim=3).view(bs, kpt, -1) # [bs, kpt, num_feats] + pos_y = torch.stack( + (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), + dim=3).view(bs, kpt, -1) # [bs, kpt, num_feats] + pos = torch.cat((pos_y, pos_x), dim=2) # [bs, kpt, num_feats * 2] + + return pos + + def forward_3d_coordinates(self, coord): + """ + Forward funtion for normalized coordinates input with the shape of [bs, 3, kpt] + return: + pos (Tensor): position embedding with the shape of [bs, kpt, num_feats*2] + """ + bs, _, H, W = coord.shape + + x_embed, y_embed, z_embed = coord[:, 0], coord[:, 1], coord[:, 2] # [bs, kpt] + x_embed = x_embed.flatten(1) * self.scale # [bs, kpt] + y_embed = y_embed.flatten(1) * self.scale + z_embed = z_embed.flatten(1) * self.scale + + dim_t = torch.arange(self.num_feats, dtype=torch.float32, device=coord.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats) + + pos_x = x_embed[:, :, None] / dim_t # [bs, HW, num_feats] + pos_y = y_embed[:, :, None] / dim_t # [bs, HW, num_feats] + pos_z = z_embed[:, :, None] / dim_t # [bs, HW, num_feats] + + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()),dim=3).view(bs, H, W, -1) # [bs, H, W, num_feats] + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()),dim=3).view(bs, H, W, -1) # [bs, H, W, num_feats] + pos_z = torch.stack((pos_z[:, :, 0::2].sin(), pos_z[:, :, 1::2].cos()),dim=3).view(bs, H, W, -1) # [bs, H, W, num_feats] + pos = torch.cat((pos_y, pos_x, pos_z), dim=3).permute(0, 3, 1, 2) # [bs, H, W, num_feats * 3] + + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + 
repr_str += f'temperature={self.temperature}, ' + repr_str += f'normalize={self.normalize}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'eps={self.eps})' + return repr_str + + +@POSITIONAL_ENCODING.register_module() +class LearnedPositionalEncoding(BaseModule): + """Position embedding with learnable embedding weights. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. The final returned dimension for + each position is 2 times of this value. + row_num_embed (int, optional): The dictionary size of row embeddings. + Default 50. + col_num_embed (int, optional): The dictionary size of col embeddings. + Default 50. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + num_feats, + row_num_embed=50, + col_num_embed=50, + init_cfg=dict(type='Uniform', layer='Embedding')): + super(LearnedPositionalEncoding, self).__init__(init_cfg) + self.row_embed = nn.Embedding(row_num_embed, num_feats) + self.col_embed = nn.Embedding(col_num_embed, num_feats) + self.num_feats = num_feats + self.row_num_embed = row_num_embed + self.col_num_embed = col_num_embed + + def forward(self, mask): + """Forward function for `LearnedPositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + h, w = mask.shape[-2:] + x = torch.arange(w, device=mask.device) + y = torch.arange(h, device=mask.device) + x_embed = self.col_embed(x) + y_embed = self.row_embed(y) + pos = torch.cat( + (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat( + 1, w, 1)), + dim=-1).permute(2, 0, + 1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'row_num_embed={self.row_num_embed}, ' + repr_str += f'col_num_embed={self.col_num_embed})' + return repr_str diff --git a/EdgeCape/models/utils/post_processing/__init__.py b/EdgeCape/models/utils/post_processing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..47980293fc1bd49ac1f74fbbfa6e54f25d62fcfa --- /dev/null +++ b/EdgeCape/models/utils/post_processing/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
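# --- Editor's sketch (illustrative usage, not part of the diff) ---
# forward_coordinates above is what the decoder uses to re-embed the current
# keypoint proposals after each layer: normalized (x, y) coordinates go in,
# a [bs, K, 2 * num_feats] sine/cosine embedding comes out. Assuming the same
# config the head uses (num_feats=128, normalize=True):
import torch
from EdgeCape.models.utils.positional_encoding import SinePositionalEncoding

pos_enc = SinePositionalEncoding(num_feats=128, normalize=True)
coords = torch.rand(2, 17, 2)             # [bs, K, 2], normalized proposals
emb = pos_enc.forward_coordinates(coords)
assert emb.shape == (2, 17, 256)          # matches the transformer d_model
# --- end of editor's sketch ---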
+ +from .nms import nearby_joints_nms, oks_iou, oks_nms, soft_oks_nms +from .one_euro_filter import OneEuroFilter +from .post_transforms import (affine_transform, flip_back, fliplr_joints, + fliplr_regression, get_affine_transform, + get_warp_matrix, rotate_point, transform_preds, + warp_affine_joints) +from .smoother import Smoother + +__all__ = [ + 'oks_nms', 'soft_oks_nms', 'nearby_joints_nms', 'affine_transform', + 'rotate_point', 'flip_back', 'fliplr_joints', 'fliplr_regression', + 'transform_preds', 'get_affine_transform', 'get_warp_matrix', + 'warp_affine_joints', 'oks_iou', 'OneEuroFilter', 'Smoother' +] \ No newline at end of file diff --git a/EdgeCape/models/utils/post_processing/group.py b/EdgeCape/models/utils/post_processing/group.py new file mode 100644 index 0000000000000000000000000000000000000000..24077c7a6e8da0ad8e503680f79f6301077d1a95 --- /dev/null +++ b/EdgeCape/models/utils/post_processing/group.py @@ -0,0 +1,557 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/princeton-vl/pose-ae-train/ +# Original licence: Copyright (c) 2017, umich-vl, under BSD 3-Clause License. +# ------------------------------------------------------------------------------ + +import numpy as np +import torch +from munkres import Munkres + +from mmpose.core.evaluation import post_dark_udp + + +def _py_max_match(scores): + """Apply munkres algorithm to get the best match. + + Args: + scores(np.ndarray): cost matrix. + + Returns: + np.ndarray: best match. + """ + m = Munkres() + tmp = m.compute(scores) + tmp = np.array(tmp).astype(int) + return tmp + + +def _match_by_tag(inp, params): + """Match joints by tags. Use Munkres algorithm to calculate the best match + for keypoints grouping. + + Note: + number of keypoints: K + max number of people in an image: M (M=30 by default) + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + inp(tuple): + tag_k (np.ndarray[KxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[KxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[KxM]): top k value of the + feature maps per keypoint. + params(Params): class Params(). + + Returns: + np.ndarray: result of pose groups. 
+ """ + assert isinstance(params, _Params), 'params should be class _Params()' + + tag_k, loc_k, val_k = inp + + default_ = np.zeros((params.num_joints, 3 + tag_k.shape[2]), + dtype=np.float32) + + joint_dict = {} + tag_dict = {} + for i in range(params.num_joints): + idx = params.joint_order[i] + + tags = tag_k[idx] + joints = np.concatenate((loc_k[idx], val_k[idx, :, None], tags), 1) + mask = joints[:, 2] > params.detection_threshold + tags = tags[mask] # shape: [M, L] + joints = joints[mask] # shape: [M, 3 + L], 3: x, y, val + + if joints.shape[0] == 0: + continue + + if i == 0 or len(joint_dict) == 0: + for tag, joint in zip(tags, joints): + key = tag[0] + joint_dict.setdefault(key, np.copy(default_))[idx] = joint + tag_dict[key] = [tag] + else: + # shape: [M] + grouped_keys = list(joint_dict.keys()) + if params.ignore_too_much: + grouped_keys = grouped_keys[:params.max_num_people] + # shape: [M, L] + grouped_tags = [np.mean(tag_dict[i], axis=0) for i in grouped_keys] + + # shape: [M, M, L] + diff = joints[:, None, 3:] - np.array(grouped_tags)[None, :, :] + # shape: [M, M] + diff_normed = np.linalg.norm(diff, ord=2, axis=2) + diff_saved = np.copy(diff_normed) + + if params.use_detection_val: + diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3] + + num_added = diff.shape[0] + num_grouped = diff.shape[1] + + if num_added > num_grouped: + diff_normed = np.concatenate( + (diff_normed, + np.zeros((num_added, num_added - num_grouped), + dtype=np.float32) + 1e10), + axis=1) + + pairs = _py_max_match(diff_normed) + for row, col in pairs: + if (row < num_added and col < num_grouped + and diff_saved[row][col] < params.tag_threshold): + key = grouped_keys[col] + joint_dict[key][idx] = joints[row] + tag_dict[key].append(tags[row]) + else: + key = tags[row][0] + joint_dict.setdefault(key, np.copy(default_))[idx] = \ + joints[row] + tag_dict[key] = [tags[row]] + + joint_dict_keys = list(joint_dict.keys()) + if params.ignore_too_much: + # The new person joints beyond the params.max_num_people will be + # ignored, for the dict is in ordered when python > 3.6 version. + joint_dict_keys = joint_dict_keys[:params.max_num_people] + results = np.array([joint_dict[i] + for i in joint_dict_keys]).astype(np.float32) + return results + + +class _Params: + """A class of parameter. + + Args: + cfg(Config): config. + """ + + def __init__(self, cfg): + self.num_joints = cfg['num_joints'] + self.max_num_people = cfg['max_num_people'] + + self.detection_threshold = cfg['detection_threshold'] + self.tag_threshold = cfg['tag_threshold'] + self.use_detection_val = cfg['use_detection_val'] + self.ignore_too_much = cfg['ignore_too_much'] + + if self.num_joints == 17: + self.joint_order = [ + i - 1 for i in + [1, 2, 3, 4, 5, 6, 7, 12, 13, 8, 9, 10, 11, 14, 15, 16, 17] + ] + else: + self.joint_order = list(np.arange(self.num_joints)) + + +class HeatmapParser: + """The heatmap parser for post processing.""" + + def __init__(self, cfg): + self.params = _Params(cfg) + self.tag_per_joint = cfg['tag_per_joint'] + self.pool = torch.nn.MaxPool2d(cfg['nms_kernel'], 1, + cfg['nms_padding']) + self.use_udp = cfg.get('use_udp', False) + self.score_per_joint = cfg.get('score_per_joint', False) + + def nms(self, heatmaps): + """Non-Maximum Suppression for heatmaps. + + Args: + heatmap(torch.Tensor): Heatmaps before nms. + + Returns: + torch.Tensor: Heatmaps after nms. 
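+
+        Only local maxima are kept: responses that do not equal the
+        max-pooled value within the ``nms_kernel`` window are zeroed out.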
+ """ + + maxm = self.pool(heatmaps) + maxm = torch.eq(maxm, heatmaps).float() + heatmaps = heatmaps * maxm + + return heatmaps + + def match(self, tag_k, loc_k, val_k): + """Group keypoints to human poses in a batch. + + Args: + tag_k (np.ndarray[NxKxMxL]): tag corresponding to the + top k values of feature map per keypoint. + loc_k (np.ndarray[NxKxMx2]): top k locations of the + feature maps for keypoint. + val_k (np.ndarray[NxKxM]): top k value of the + feature maps per keypoint. + + Returns: + list + """ + + def _match(x): + return _match_by_tag(x, self.params) + + return list(map(_match, zip(tag_k, loc_k, val_k))) + + def top_k(self, heatmaps, tags): + """Find top_k values in an image. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + max number of people: M + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]) + tags (torch.Tensor[NxKxHxWxL]) + + Returns: + dict: A dict containing top_k values. + + - tag_k (np.ndarray[NxKxMxL]): + tag corresponding to the top k values of + feature map per keypoint. + - loc_k (np.ndarray[NxKxMx2]): + top k location of feature map per keypoint. + - val_k (np.ndarray[NxKxM]): + top k value of feature map per keypoint. + """ + heatmaps = self.nms(heatmaps) + N, K, H, W = heatmaps.size() + heatmaps = heatmaps.view(N, K, -1) + val_k, ind = heatmaps.topk(self.params.max_num_people, dim=2) + + tags = tags.view(tags.size(0), tags.size(1), W * H, -1) + if not self.tag_per_joint: + tags = tags.expand(-1, self.params.num_joints, -1, -1) + + tag_k = torch.stack( + [torch.gather(tags[..., i], 2, ind) for i in range(tags.size(3))], + dim=3) + + x = ind % W + y = ind // W + + ind_k = torch.stack((x, y), dim=3) + + results = { + 'tag_k': tag_k.cpu().numpy(), + 'loc_k': ind_k.cpu().numpy(), + 'val_k': val_k.cpu().numpy() + } + + return results + + @staticmethod + def adjust(results, heatmaps): + """Adjust the coordinates for better accuracy. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + + Args: + results (list(np.ndarray)): Keypoint predictions. + heatmaps (torch.Tensor[NxKxHxW]): Heatmaps. + """ + _, _, H, W = heatmaps.shape + for batch_id, people in enumerate(results): + for people_id, people_i in enumerate(people): + for joint_id, joint in enumerate(people_i): + if joint[2] > 0: + x, y = joint[0:2] + xx, yy = int(x), int(y) + tmp = heatmaps[batch_id][joint_id] + if tmp[min(H - 1, yy + 1), xx] > tmp[max(0, yy - 1), + xx]: + y += 0.25 + else: + y -= 0.25 + + if tmp[yy, min(W - 1, xx + 1)] > tmp[yy, + max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + results[batch_id][people_id, joint_id, + 0:2] = (x + 0.5, y + 0.5) + return results + + @staticmethod + def refine(heatmap, tag, keypoints, use_udp=False): + """Given initial keypoint predictions, we identify missing joints. + + Note: + number of keypoints: K + heatmap height: H + heatmap width: W + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmap: np.ndarray(K, H, W). + tag: np.ndarray(K, H, W) | np.ndarray(K, H, W, L) + keypoints: np.ndarray of size (K, 3 + L) + last dim is (x, y, score, tag). + use_udp: bool-unbiased data processing + + Returns: + np.ndarray: The refined keypoints. 
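+
+        A joint that was not detected is recovered by penalising its heatmap
+        with the distance between its tag map and the mean tag of the joints
+        already detected for this person, then taking the penalised maximum.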
+ """ + + K, H, W = heatmap.shape + if len(tag.shape) == 3: + tag = tag[..., None] + + tags = [] + for i in range(K): + if keypoints[i, 2] > 0: + # save tag value of detected keypoint + x, y = keypoints[i][:2].astype(int) + x = np.clip(x, 0, W - 1) + y = np.clip(y, 0, H - 1) + tags.append(tag[i, y, x]) + + # mean tag of current detected people + prev_tag = np.mean(tags, axis=0) + results = [] + + for _heatmap, _tag in zip(heatmap, tag): + # distance of all tag values with mean tag of + # current detected people + distance_tag = (((_tag - + prev_tag[None, None, :])**2).sum(axis=2)**0.5) + norm_heatmap = _heatmap - np.round(distance_tag) + + # find maximum position + y, x = np.unravel_index(np.argmax(norm_heatmap), _heatmap.shape) + xx = x.copy() + yy = y.copy() + # detection score at maximum position + val = _heatmap[y, x] + if not use_udp: + # offset by 0.5 + x += 0.5 + y += 0.5 + + # add a quarter offset + if _heatmap[yy, min(W - 1, xx + 1)] > _heatmap[yy, max(0, xx - 1)]: + x += 0.25 + else: + x -= 0.25 + + if _heatmap[min(H - 1, yy + 1), xx] > _heatmap[max(0, yy - 1), xx]: + y += 0.25 + else: + y -= 0.25 + + results.append((x, y, val)) + results = np.array(results) + + if results is not None: + for i in range(K): + # add keypoint if it is not detected + if results[i, 2] > 0 and keypoints[i, 2] == 0: + keypoints[i, :3] = results[i, :3] + + return keypoints + + def parse(self, heatmaps, tags, adjust=True, refine=True): + """Group keypoints into poses given heatmap and tag. + + Note: + batch size: N + number of keypoints: K + heatmap height: H + heatmap width: W + dim of tags: L + If use flip testing, L=2; else L=1. + + Args: + heatmaps (torch.Tensor[NxKxHxW]): model output heatmaps. + tags (torch.Tensor[NxKxHxWxL]): model output tagmaps. + + Returns: + tuple: A tuple containing keypoint grouping results. + + - results (list(np.ndarray)): Pose results. + - scores (list/list(np.ndarray)): Score of people. + """ + results = self.match(**self.top_k(heatmaps, tags)) + + if adjust: + if self.use_udp: + for i in range(len(results)): + if results[i].shape[0] > 0: + results[i][..., :2] = post_dark_udp( + results[i][..., :2].copy(), heatmaps[i:i + 1, :]) + else: + results = self.adjust(results, heatmaps) + + if self.score_per_joint: + scores = [i[:, 2] for i in results[0]] + else: + scores = [i[:, 2].mean() for i in results[0]] + + if refine: + results = results[0] + # for every detected person + for i in range(len(results)): + heatmap_numpy = heatmaps[0].cpu().numpy() + tag_numpy = tags[0].cpu().numpy() + if not self.tag_per_joint: + tag_numpy = np.tile(tag_numpy, + (self.params.num_joints, 1, 1, 1)) + results[i] = self.refine( + heatmap_numpy, tag_numpy, results[i], use_udp=self.use_udp) + results = [results] + + return results, scores + + +class HeatmapOffsetParser: + """The heatmap&offset parser for post processing.""" + + def __init__(self, cfg): + super(HeatmapOffsetParser, self).__init__() + + self.num_joints = cfg['num_joints'] + self.keypoint_threshold = cfg['keypoint_threshold'] + self.max_num_people = cfg['max_num_people'] + + # init pooling layer + kernel_size = cfg.get('max_pool_kernel', 5) + self.pool = torch.nn.MaxPool2d(kernel_size, 1, kernel_size // 2) + + def _offset_to_pose(self, offsets): + """Convert offset maps to pose maps. + + Note: + batch size: N + number of keypoints: K + offset maps height: H + offset maps width: W + + Args: + offsets (torch.Tensor[NxKxHxW]): model output offset maps. + + Returns: + torch.Tensor[NxKxHxW]: A tensor containing pose for each pixel. 
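+
+        Each spatial location stores the (x, y) coordinates it votes for,
+        obtained by subtracting the predicted offsets from a regular
+        coordinate grid of the same resolution.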
+ """ + h, w = offsets.shape[-2:] + offsets = offsets.view(self.num_joints, -1, h, w) + + # generate regular coordinates + x = torch.arange(0, offsets.shape[-1]).float() + y = torch.arange(0, offsets.shape[-2]).float() + y, x = torch.meshgrid(y, x) + regular_coords = torch.stack((x, y), dim=0).unsqueeze(0) + + posemaps = regular_coords.to(offsets) - offsets + posemaps = posemaps.view(1, -1, h, w) + return posemaps + + def _get_maximum_from_heatmap(self, heatmap): + """Find local maximum of heatmap to localize instances. + + Note: + batch size: N + heatmap height: H + heatmap width: W + + Args: + heatmap (torch.Tensor[Nx1xHxW]): model output center heatmap. + + Returns: + tuple: A tuple containing instances detection results. + + - pos_idx (torch.Tensor): Index of pixels which have detected + instances. + - score (torch.Tensor): Score of detected instances. + """ + assert heatmap.size(0) == 1 and heatmap.size(1) == 1 + max_map = torch.eq(heatmap, self.pool(heatmap)).float() + heatmap = heatmap * max_map + score = heatmap.view(-1) + + score, pos_idx = score.topk(self.max_num_people) + mask = score > self.keypoint_threshold + score = score[mask] + pos_idx = pos_idx[mask] + return pos_idx, score + + def decode(self, heatmaps, offsets): + """Convert center heatmaps and offset maps to poses. + + Note: + batch size: N + number of keypoints: K + offset maps height: H + offset maps width: W + + Args: + heatmaps (torch.Tensor[Nx(1+K)xHxW]): model output heatmaps. + offsets (torch.Tensor[NxKxHxW]): model output offset maps. + + Returns: + torch.Tensor[NxKx4]: A tensor containing predicted pose and + score for each instance. + """ + + posemap = self._offset_to_pose(offsets) + inst_indexes, inst_scores = self._get_maximum_from_heatmap( + heatmaps[:, :1]) + + poses = posemap.view(posemap.size(1), -1)[..., inst_indexes] + poses = poses.view(self.num_joints, 2, -1).permute(2, 0, + 1).contiguous() + inst_scores = inst_scores.unsqueeze(1).unsqueeze(2).expand( + poses.size()) + poses = torch.cat((poses, inst_scores), dim=2) + return poses.clone() + + def refine_score(self, heatmaps, poses): + """Refine instance scores with keypoint heatmaps. + + Note: + batch size: N + number of keypoints: K + offset maps height: H + offset maps width: W + + Args: + heatmaps (torch.Tensor[Nx(1+K)xHxW]): model output heatmaps. + poses (torch.Tensor[NxKx4]): decoded pose and score for each + instance. + + Returns: + torch.Tensor[NxKx4]: poses with refined scores. 
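+
+        Pose coordinates are normalised to [-1, 1] and used to sample the
+        per-keypoint heatmaps with ``grid_sample``; the sampled confidences
+        are written to the last channel and multiplied into the centre-based
+        instance scores.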
+ """ + normed_poses = poses.unsqueeze(0).permute(2, 0, 1, 3).contiguous() + normed_poses = torch.cat(( + normed_poses.narrow(3, 0, 1) / (heatmaps.size(3) - 1) * 2 - 1, + normed_poses.narrow(3, 1, 1) / (heatmaps.size(2) - 1) * 2 - 1, + ), + dim=3) + kpt_scores = torch.nn.functional.grid_sample( + heatmaps[:, 1:].view(self.num_joints, 1, heatmaps.size(2), + heatmaps.size(3)), + normed_poses, + padding_mode='border').view(self.num_joints, -1) + kpt_scores = kpt_scores.transpose(0, 1).contiguous() + + # scores only from keypoint heatmaps + poses[..., 3] = kpt_scores + # combine center and keypoint heatmaps + poses[..., 2] = poses[..., 2] * kpt_scores + + return poses \ No newline at end of file diff --git a/EdgeCape/models/utils/post_processing/nms.py b/EdgeCape/models/utils/post_processing/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..f4db52f95ed7c287ff7663327bb14a9a1955f87d --- /dev/null +++ b/EdgeCape/models/utils/post_processing/nms.py @@ -0,0 +1,279 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import numpy as np + + +def nms(dets, thr): + """Greedily select boxes with high confidence and overlap <= thr. + + Args: + dets: [[x1, y1, x2, y2, score]]. + thr: Retain overlap < thr. + + Returns: + list: Indexes to keep. + """ + if len(dets) == 0: + return [] + + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thr)[0] + order = order[inds + 1] + + return keep + + +def oks_iou(g, d, a_g, a_d, sigmas=None, vis_thr=None): + """Calculate oks ious. + + Args: + g: Ground truth keypoints. + d: Detected keypoints. + a_g: Area of the ground truth object. + a_d: Area of the detected object. + sigmas: standard deviation of keypoint labelling. + vis_thr: threshold of the keypoint visibility. + + Returns: + list: The oks ious. + """ + if sigmas is None: + sigmas = np.array([ + .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, + .87, .87, .89, .89 + ]) / 10.0 + vars = (sigmas * 2)**2 + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + ious = np.zeros(len(d), dtype=np.float32) + for n_d in range(0, len(d)): + xd = d[n_d, 0::3] + yd = d[n_d, 1::3] + vd = d[n_d, 2::3] + dx = xd - xg + dy = yd - yg + e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 + if vis_thr is not None: + ind = list(vg > vis_thr) and list(vd > vis_thr) + e = e[ind] + ious[n_d] = np.sum(np.exp(-e)) / len(e) if len(e) != 0 else 0.0 + return ious + + +def oks_nms(kpts_db, thr, sigmas=None, vis_thr=None, score_per_joint=False): + """OKS NMS implementations. + + Args: + kpts_db: keypoints. + thr: Retain overlap < thr. + sigmas: standard deviation of keypoint labelling. + vis_thr: threshold of the keypoint visibility. 
+ score_per_joint: the input scores (in kpts_db) are per joint scores + + Returns: + np.ndarray: indexes to keep. + """ + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) + areas = np.array([k['area'] for k in kpts_db]) + + order = scores.argsort()[::-1] + + keep = [] + while len(order) > 0: + i = order[0] + keep.append(i) + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, vis_thr) + + inds = np.where(oks_ovr <= thr)[0] + order = order[inds + 1] + + keep = np.array(keep) + + return keep + + +def _rescore(overlap, scores, thr, type='gaussian'): + """Rescoring mechanism gaussian or linear. + + Args: + overlap: calculated ious + scores: target scores. + thr: retain oks overlap < thr. + type: 'gaussian' or 'linear' + + Returns: + np.ndarray: indexes to keep + """ + assert len(overlap) == len(scores) + assert type in ['gaussian', 'linear'] + + if type == 'linear': + inds = np.where(overlap >= thr)[0] + scores[inds] = scores[inds] * (1 - overlap[inds]) + else: + scores = scores * np.exp(-overlap**2 / thr) + + return scores + + +def soft_oks_nms(kpts_db, + thr, + max_dets=20, + sigmas=None, + vis_thr=None, + score_per_joint=False): + """Soft OKS NMS implementations. + + Args: + kpts_db: keypoints and scores. + thr: retain oks overlap < thr. + max_dets: max number of detections to keep. + sigmas: Keypoint labelling uncertainty. + score_per_joint: the input scores (in kpts_db) are per joint scores + + Returns: + np.ndarray: indexes to keep. + """ + if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'].flatten() for k in kpts_db]) + areas = np.array([k['area'] for k in kpts_db]) + + order = scores.argsort()[::-1] + scores = scores[order] + + keep = np.zeros(max_dets, dtype=np.intp) + keep_cnt = 0 + while len(order) > 0 and keep_cnt < max_dets: + i = order[0] + + oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]], + sigmas, vis_thr) + + order = order[1:] + scores = _rescore(oks_ovr, scores[1:], thr) + + tmp = scores.argsort()[::-1] + order = order[tmp] + scores = scores[tmp] + + keep[keep_cnt] = i + keep_cnt += 1 + + keep = keep[:keep_cnt] + + return keep + + +def nearby_joints_nms( + kpts_db, + dist_thr, + num_nearby_joints_thr=None, + score_per_joint=False, + max_dets=-1, +): + """Nearby joints NMS implementations. + + Args: + kpts_db (list[dict]): keypoints and scores. + dist_thr (float): threshold for judging whether two joints are close. + num_nearby_joints_thr (int): threshold for judging whether two + instances are close. + max_dets (int): max number of detections to keep. + score_per_joint (bool): the input scores (in kpts_db) are per joint + scores. + + Returns: + np.ndarray: indexes to keep. + """ + + assert dist_thr > 0, '`dist_thr` must be greater than 0.' 
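+
+    # Overview of the steps below: (1) derive a per-instance distance
+    # threshold from each pose's spatial extent, (2) count how many joints of
+    # every other instance fall within that threshold, and (3) among mutually
+    # close instances keep only the highest-scoring one.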
+ if len(kpts_db) == 0: + return [] + + if score_per_joint: + scores = np.array([k['score'].mean() for k in kpts_db]) + else: + scores = np.array([k['score'] for k in kpts_db]) + + kpts = np.array([k['keypoints'] for k in kpts_db]) + + num_people, num_joints, _ = kpts.shape + if num_nearby_joints_thr is None: + num_nearby_joints_thr = num_joints // 2 + assert num_nearby_joints_thr < num_joints, '`num_nearby_joints_thr` must '\ + 'be less than the number of joints.' + + # compute distance threshold + pose_area = kpts.max(axis=1) - kpts.min(axis=1) + pose_area = np.sqrt(np.power(pose_area, 2).sum(axis=1)) + pose_area = pose_area.reshape(num_people, 1, 1) + pose_area = np.tile(pose_area, (num_people, num_joints)) + close_dist_thr = pose_area * dist_thr + + # count nearby joints between instances + instance_dist = kpts[:, None] - kpts + instance_dist = np.sqrt(np.power(instance_dist, 2).sum(axis=3)) + close_instance_num = (instance_dist < close_dist_thr).sum(2) + close_instance = close_instance_num > num_nearby_joints_thr + + # apply nms + ignored_pose_inds, keep_pose_inds = set(), list() + indexes = np.argsort(scores)[::-1] + for i in indexes: + if i in ignored_pose_inds: + continue + keep_inds = close_instance[i].nonzero()[0] + keep_ind = keep_inds[np.argmax(scores[keep_inds])] + if keep_ind not in ignored_pose_inds: + keep_pose_inds.append(keep_ind) + ignored_pose_inds = ignored_pose_inds.union(set(keep_inds)) + + # limit the number of output instances + if max_dets > 0 and len(keep_pose_inds) > max_dets: + sub_inds = np.argsort(scores[keep_pose_inds])[-1:-max_dets - 1:-1] + keep_pose_inds = [keep_pose_inds[i] for i in sub_inds] + + return keep_pose_inds \ No newline at end of file diff --git a/EdgeCape/models/utils/post_processing/one_euro_filter.py b/EdgeCape/models/utils/post_processing/one_euro_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..4a1a834ab7520b2c6f051ccf5710a9c4fa15dc05 --- /dev/null +++ b/EdgeCape/models/utils/post_processing/one_euro_filter.py @@ -0,0 +1,113 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/HoBeom/OneEuroFilter-Numpy +# Original licence: Copyright (c) HoBeom Jeon, under the MIT License. +# ------------------------------------------------------------------------------ +import warnings +from time import time + +import numpy as np + + +def smoothing_factor(t_e, cutoff): + r = 2 * np.pi * cutoff * t_e + return r / (r + 1) + + +def exponential_smoothing(a, x, x_prev): + return a * x + (1 - a) * x_prev + + +class OneEuroFilter: + + def __init__(self, + x0, + dx0=0.0, + min_cutoff=1.7, + beta=0.3, + d_cutoff=30.0, + fps=None): + """One Euro Filter for keypoints smoothing. + + Args: + x0 (np.ndarray[K, 2]): Initialize keypoints value + dx0 (float): 0.0 + min_cutoff (float): parameter for one euro filter + beta (float): parameter for one euro filter + d_cutoff (float): Input data FPS + fps (float): Video FPS for video inference + """ + warnings.warn( + 'OneEuroFilter from ' + '`mmpose/core/post_processing/one_euro_filter.py` will ' + 'be deprecated in the future. Please use Smoother' + '(`mmpose/core/post_processing/smoother.py`) with ' + 'OneEuroFilter (`mmpose/core/post_processing/temporal_' + 'filters/one_euro_filter.py`).', DeprecationWarning) + + # The parameters. + self.data_shape = x0.shape + self.min_cutoff = np.full(x0.shape, min_cutoff) + self.beta = np.full(x0.shape, beta) + self.d_cutoff = np.full(x0.shape, d_cutoff) + # Previous values. 
+ self.x_prev = x0.astype(np.float32) + self.dx_prev = np.full(x0.shape, dx0) + self.mask_prev = np.ma.masked_where(x0 <= 0, x0) + self.realtime = True + if fps is None: + # Using in realtime inference + self.t_e = None + self.skip_frame_factor = d_cutoff + self.fps = d_cutoff + else: + # fps using video inference + self.realtime = False + self.fps = float(fps) + self.d_cutoff = np.full(x0.shape, self.fps) + + self.t_prev = time() + + def __call__(self, x, t_e=1.0): + """Compute the filtered signal. + + Hyper-parameters (cutoff, beta) are from `VNect + `__ . + + Realtime Camera fps (d_cutoff) default 30.0 + + Args: + x (np.ndarray[K, 2]): keypoints results in frame + t_e (Optional): video skip frame count for posetrack + evaluation + """ + assert x.shape == self.data_shape + + t = 0 + if self.realtime: + t = time() + t_e = (t - self.t_prev) * self.skip_frame_factor + t_e = np.full(x.shape, t_e) + + # missing keypoints mask + mask = np.ma.masked_where(x <= 0, x) + + # The filtered derivative of the signal. + a_d = smoothing_factor(t_e / self.fps, self.d_cutoff) + dx = (x - self.x_prev) / t_e + dx_hat = exponential_smoothing(a_d, dx, self.dx_prev) + + # The filtered signal. + cutoff = self.min_cutoff + self.beta * np.abs(dx_hat) + a = smoothing_factor(t_e / self.fps, cutoff) + x_hat = exponential_smoothing(a, x, self.x_prev) + + # missing keypoints remove + np.copyto(x_hat, -10, where=mask.mask) + + # Memorize the previous values. + self.x_prev = x_hat + self.dx_prev = dx_hat + self.t_prev = t + self.mask_prev = mask + + return x_hat \ No newline at end of file diff --git a/EdgeCape/models/utils/post_processing/post_transforms.py b/EdgeCape/models/utils/post_processing/post_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..ca0b22aac2515bb332742642715b9d4de9e45629 --- /dev/null +++ b/EdgeCape/models/utils/post_processing/post_transforms.py @@ -0,0 +1,366 @@ +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/leoxiaobin/deep-high-resolution-net.pytorch +# Original licence: Copyright (c) Microsoft, under the MIT License. +# ------------------------------------------------------------------------------ + +import math + +import cv2 +import numpy as np +import torch + + +def fliplr_joints(joints_3d, joints_3d_visible, img_width, flip_pairs): + """Flip human joints horizontally. + + Note: + - num_keypoints: K + + Args: + joints_3d (np.ndarray([K, 3])): Coordinates of keypoints. + joints_3d_visible (np.ndarray([K, 1])): Visibility of keypoints. + img_width (int): Image width. + flip_pairs (list[tuple]): Pairs of keypoints which are mirrored + (for example, left ear and right ear). + + Returns: + tuple: Flipped human joints. + + - joints_3d_flipped (np.ndarray([K, 3])): Flipped joints. + - joints_3d_visible_flipped (np.ndarray([K, 1])): Joint visibility. 
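+
+    Example (illustrative values; the flip pair below is hypothetical):
+        >>> import numpy as np
+        >>> joints = np.array([[10., 20., 0.], [30., 40., 0.]])
+        >>> visible = np.ones((2, 1))
+        >>> flipped, flipped_vis = fliplr_joints(
+        >>>     joints, visible, img_width=100, flip_pairs=[(0, 1)])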
+ """ + + assert len(joints_3d) == len(joints_3d_visible) + assert img_width > 0 + + joints_3d_flipped = joints_3d.copy() + joints_3d_visible_flipped = joints_3d_visible.copy() + + # Swap left-right parts + for left, right in flip_pairs: + joints_3d_flipped[left, :] = joints_3d[right, :] + joints_3d_flipped[right, :] = joints_3d[left, :] + + joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :] + joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :] + + # Flip horizontally + joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0] + joints_3d_flipped = joints_3d_flipped * (joints_3d_visible_flipped > 0) + + return joints_3d_flipped, joints_3d_visible_flipped + + +def fliplr_regression(regression, + flip_pairs, + center_mode='static', + center_x=0.5, + center_index=0): + """Flip human joints horizontally. + + Note: + - batch_size: N + - num_keypoint: K + + Args: + regression (np.ndarray([..., K, C])): Coordinates of keypoints, where K + is the joint number and C is the dimension. Example shapes are: + + - [N, K, C]: a batch of keypoints where N is the batch size. + - [N, T, K, C]: a batch of pose sequences, where T is the frame + number. + flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + center_mode (str): The mode to set the center location on the x-axis + to flip around. Options are: + + - static: use a static x value (see center_x also) + - root: use a root joint (see center_index also) + center_x (float): Set the x-axis location of the flip center. Only used + when center_mode=static. + center_index (int): Set the index of the root joint, whose x location + will be used as the flip center. Only used when center_mode=root. + + Returns: + np.ndarray([..., K, C]): Flipped joints. + """ + assert regression.ndim >= 2, f'Invalid pose shape {regression.shape}' + + allowed_center_mode = {'static', 'root'} + assert center_mode in allowed_center_mode, 'Get invalid center_mode ' \ + f'{center_mode}, allowed choices are {allowed_center_mode}' + + if center_mode == 'static': + x_c = center_x + elif center_mode == 'root': + assert regression.shape[-2] > center_index + x_c = regression[..., center_index:center_index + 1, 0] + + regression_flipped = regression.copy() + # Swap left-right parts + for left, right in flip_pairs: + regression_flipped[..., left, :] = regression[..., right, :] + regression_flipped[..., right, :] = regression[..., left, :] + + # Flip horizontally + regression_flipped[..., 0] = x_c * 2 - regression_flipped[..., 0] + return regression_flipped + + +def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'): + """Flip the flipped heatmaps back to the original form. + + Note: + - batch_size: N + - num_keypoints: K + - heatmap height: H + - heatmap width: W + + Args: + output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained + from the flipped images. + flip_pairs (list[tuple()): Pairs of keypoints which are mirrored + (for example, left ear -- right ear). + target_type (str): GaussianHeatmap or CombinedTarget + + Returns: + np.ndarray: heatmaps that flipped back to the original image + """ + assert output_flipped.ndim == 4, \ + 'output_flipped should be [batch_size, num_keypoints, height, width]' + shape_ori = output_flipped.shape + channels = 1 + if target_type.lower() == 'CombinedTarget'.lower(): + channels = 3 + output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...] 
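+    # Group the channels per keypoint (1 for GaussianHeatmap, 3 for
+    # CombinedTarget) so that left/right keypoints can be swapped along the
+    # keypoint axis.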
+ output_flipped = output_flipped.reshape(shape_ori[0], -1, channels, + shape_ori[2], shape_ori[3]) + output_flipped_back = output_flipped.copy() + + # Swap left-right parts + for left, right in flip_pairs: + output_flipped_back[:, left, ...] = output_flipped[:, right, ...] + output_flipped_back[:, right, ...] = output_flipped[:, left, ...] + output_flipped_back = output_flipped_back.reshape(shape_ori) + # Flip horizontally + output_flipped_back = output_flipped_back[..., ::-1] + return output_flipped_back + + +def transform_preds(coords, center, scale, output_size, use_udp=False): + """Get final keypoint predictions from heatmaps and apply scaling and + translation to map them back to the image. + + Note: + num_keypoints: K + + Args: + coords (np.ndarray[K, ndims]): + + * If ndims=2, corrds are predicted keypoint location. + * If ndims=4, corrds are composed of (x, y, scores, tags) + * If ndims=5, corrds are composed of (x, y, scores, tags, + flipped_tags) + + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + use_udp (bool): Use unbiased data processing + + Returns: + np.ndarray: Predicted coordinates in the images. + """ + assert coords.shape[1] in (2, 4, 5) + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + + # Recover the scale which is normalized by a factor of 200. + scale = scale * 200.0 + + if use_udp: + scale_x = scale[0] / (output_size[0] - 1.0) + scale_y = scale[1] / (output_size[1] - 1.0) + else: + scale_x = scale[0] / output_size[0] + scale_y = scale[1] / output_size[1] + + target_coords = coords.copy() + target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5 + target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5 + + return target_coords + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=(0., 0.), + inv=False): + """Get the affine transform matrix, given the center/scale/rot/output_size. + + Args: + center (np.ndarray[2, ]): Center of the bounding box (x, y). + scale (np.ndarray[2, ]): Scale of the bounding box + wrt [width, height]. + rot (float): Rotation angle (degree). + output_size (np.ndarray[2, ] | list(2,)): Size of the + destination heatmaps. + shift (0-100%): Shift translation ratio wrt the width/height. + Default (0., 0.). + inv (bool): Option to inverse the affine transform direction. + (inv=False: src->dst or inv=True: dst->src) + + Returns: + np.ndarray: The transform matrix. + """ + assert len(center) == 2 + assert len(scale) == 2 + assert len(output_size) == 2 + assert len(shift) == 2 + + # pixel_std is 200. 
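+    # `scale` is stored normalised by this pixel standard, so multiply it back
+    # to obtain the box size in pixels before constructing the transform.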
+ scale_tmp = scale * 200.0 + + shift = np.array(shift) + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = rotate_point([0., src_w * -0.5], rot_rad) + dst_dir = np.array([0., dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + src[2, :] = _get_3rd_point(src[0, :], src[1, :]) + + dst = np.zeros((3, 2), dtype=np.float32) + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + + return trans + + +def affine_transform(pt, trans_mat): + """Apply an affine transformation to the points. + + Args: + pt (np.ndarray): a 2 dimensional point to be transformed + trans_mat (np.ndarray): 2x3 matrix of an affine transform + + Returns: + np.ndarray: Transformed points. + """ + assert len(pt) == 2 + new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.]) + + return new_pt + + +def _get_3rd_point(a, b): + """To calculate the affine matrix, three pairs of points are required. This + function is used to get the 3rd point, given 2D points a & b. + + The 3rd point is defined by rotating vector `a - b` by 90 degrees + anticlockwise, using b as the rotation center. + + Args: + a (np.ndarray): point(x,y) + b (np.ndarray): point(x,y) + + Returns: + np.ndarray: The 3rd point. + """ + assert len(a) == 2 + assert len(b) == 2 + direction = a - b + third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) + + return third_pt + + +def rotate_point(pt, angle_rad): + """Rotate a point by an angle. + + Args: + pt (list[float]): 2 dimensional point to be rotated + angle_rad (float): rotation angle by radian + + Returns: + list[float]: Rotated point. + """ + assert len(pt) == 2 + sn, cs = np.sin(angle_rad), np.cos(angle_rad) + new_x = pt[0] * cs - pt[1] * sn + new_y = pt[0] * sn + pt[1] * cs + rotated_pt = [new_x, new_y] + + return rotated_pt + + +def get_warp_matrix(theta, size_input, size_dst, size_target): + """Calculate the transformation matrix under the constraint of unbiased. + Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased + Data Processing for Human Pose Estimation (CVPR 2020). + + Args: + theta (float): Rotation angle in degrees. + size_input (np.ndarray): Size of input image [w, h]. + size_dst (np.ndarray): Size of output image [w, h]. + size_target (np.ndarray): Size of ROI in input plane [w, h]. + + Returns: + np.ndarray: A matrix for transformation. + """ + theta = np.deg2rad(theta) + matrix = np.zeros((2, 3), dtype=np.float32) + scale_x = size_dst[0] / size_target[0] + scale_y = size_dst[1] / size_target[1] + matrix[0, 0] = math.cos(theta) * scale_x + matrix[0, 1] = -math.sin(theta) * scale_x + matrix[0, 2] = scale_x * (-0.5 * size_input[0] * math.cos(theta) + + 0.5 * size_input[1] * math.sin(theta) + + 0.5 * size_target[0]) + matrix[1, 0] = math.sin(theta) * scale_y + matrix[1, 1] = math.cos(theta) * scale_y + matrix[1, 2] = scale_y * (-0.5 * size_input[0] * math.sin(theta) - + 0.5 * size_input[1] * math.cos(theta) + + 0.5 * size_target[1]) + return matrix + + +def warp_affine_joints(joints, mat): + """Apply affine transformation defined by the transform matrix on the + joints. + + Args: + joints (np.ndarray[..., 2]): Origin coordinate of joints. 
+ mat (np.ndarray[3, 2]): The affine matrix. + + Returns: + np.ndarray[..., 2]: Result coordinate of joints. + """ + joints = np.array(joints) + shape = joints.shape + joints = joints.reshape(-1, 2) + return np.dot( + np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1), + mat.T).reshape(shape) + + +def affine_transform_torch(pts, t): + npts = pts.shape[0] + pts_homo = torch.cat([pts, torch.ones(npts, 1, device=pts.device)], dim=1) + out = torch.mm(t, torch.t(pts_homo)) + return torch.t(out[:2, :]) \ No newline at end of file diff --git a/EdgeCape/models/utils/post_processing/smoother.py b/EdgeCape/models/utils/post_processing/smoother.py new file mode 100644 index 0000000000000000000000000000000000000000..d15db14ca98cc052da477db50c4d6e22df3b73d4 --- /dev/null +++ b/EdgeCape/models/utils/post_processing/smoother.py @@ -0,0 +1,227 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings +from typing import Dict, Union + +import numpy as np +from mmcv import Config, is_seq_of + +from mmpose.core.post_processing.temporal_filters import build_filter + + +class Smoother(): + """Smoother to apply temporal smoothing on pose estimation results with a + filter. + + Note: + T: The temporal length of the pose sequence + K: The keypoint number of each target + C: The keypoint coordinate dimension + + Args: + filter_cfg (dict | str): The filter config. See example config files in + `configs/_base_/filters/` for details. Alternatively a config file + path can be accepted and the config will be loaded. + keypoint_dim (int): The keypoint coordinate dimension, which is + also indicated as C. Default: 2 + keypoint_key (str): The dict key of the keypoints in the pose results. + Default: 'keypoints' + Example: + >>> import numpy as np + >>> # Build dummy pose result + >>> results = [] + >>> for t in range(10): + >>> results_t = [] + >>> for track_id in range(2): + >>> result = { + >>> 'track_id': track_id, + >>> 'keypoints': np.random.rand(17, 3) + >>> } + >>> results_t.append(result) + >>> results.append(results_t) + >>> # Example 1: Smooth multi-frame pose results offline. + >>> filter_cfg = dict(type='GaussianFilter', window_size=3) + >>> smoother = Smoother(filter_cfg, keypoint_dim=2) + >>> smoothed_results = smoother.smooth(results) + >>> # Example 2: Smooth pose results online frame-by-frame + >>> filter_cfg = dict(type='GaussianFilter', window_size=3) + >>> smoother = Smoother(filter_cfg, keypoint_dim=2) + >>> for result_t in results: + >>> smoothed_result_t = smoother.smooth(result_t) + """ + + def __init__(self, + filter_cfg: Union[Dict, str], + keypoint_dim: int = 2, + keypoint_key: str = 'keypoints'): + if isinstance(filter_cfg, str): + filter_cfg = Config.fromfile(filter_cfg).filter_cfg + self.filter_cfg = filter_cfg + self._filter = build_filter(filter_cfg) + self.keypoint_dim = keypoint_dim + self.key = keypoint_key + self.padding_size = self._filter.window_size - 1 + self.history = {} + + def _get_filter(self): + fltr = self._filter + if not fltr.shareable: + # If the filter is not shareable, build a new filter for the next + # requires + self._filter = build_filter(self.filter_cfg) + return fltr + + def _collate_pose(self, results): + """Collate the pose results to pose sequences. + + Args: + results (list[list[dict]]): The pose results of multiple frames. 
+ + Returns: + dict[str, np.ndarray]: A dict of collated pose sequences, where + the key is the track_id (in untracked scenario, the target index + will be used as the track_id), and the value is the pose sequence + in an array of shape [T, K, C] + """ + + if self._has_track_id(results): + # If the results have track_id, use it as the target indicator + results = [{res['track_id']: res + for res in results_t} for results_t in results] + track_ids = results[0].keys() + + for t, results_t in enumerate(results[1:]): + if results_t.keys() != track_ids: + raise ValueError(f'Inconsistent track ids in frame {t+1}') + + collated = { + id: np.stack([ + results_t[id][self.key][:, :self.keypoint_dim] + for results_t in results + ]) + for id in track_ids + } + else: + # If the results don't have track_id, use the target index + # as the target indicator + n_target = len(results[0]) + for t, results_t in enumerate(results[1:]): + if len(results_t) != n_target: + raise ValueError( + f'Inconsistent target number in frame {t+1}: ' + f'{len(results_t)} vs {n_target}') + + collated = { + id: np.stack([ + results_t[id][self.key][:, :self.keypoint_dim] + for results_t in results + ]) + for id in range(n_target) + } + + return collated + + def _scatter_pose(self, results, poses): + """Scatter the smoothed pose sequences and use them to update the pose + results. + + Args: + results (list[list[dict]]): The original pose results + poses (dict[str, np.ndarray]): The smoothed pose sequences + + Returns: + list[list[dict]]: The updated pose results + """ + updated_results = [] + for t, results_t in enumerate(results): + updated_results_t = [] + if self._has_track_id(results): + id2result = ((result['track_id'], result) + for result in results_t) + else: + id2result = enumerate(results_t) + + for track_id, result in id2result: + result = copy.deepcopy(result) + result[self.key][:, :self.keypoint_dim] = poses[track_id][t] + updated_results_t.append(result) + + updated_results.append(updated_results_t) + return updated_results + + @staticmethod + def _has_track_id(results): + """Check if the pose results contain track_id.""" + return 'track_id' in results[0][0] + + def smooth(self, results): + """Apply temporal smoothing on pose estimation sequences. + + Args: + results (list[dict] | list[list[dict]]): The pose results of a + single frame (non-nested list) or multiple frames (nested + list). The result of each target is a dict, which should + contains: + + - track_id (optional, Any): The track ID of the target + - keypoints (np.ndarray): The keypoint coordinates in [K, C] + + Returns: + (list[dict] | list[list[dict]]): Temporal smoothed pose results, + which has the same data structure as the input's. 
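+
+        When called frame by frame, a pose history of ``window_size - 1``
+        frames and the associated filter are kept per target between calls,
+        so the temporal context carries over to the next invocation.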
+ """ + + # Check if input is empty + if not (results) or not (results[0]): + warnings.warn('Smoother received empty result.') + return results + + # Check input is single frame or sequence + if is_seq_of(results, dict): + single_frame = True + results = [results] + else: + assert is_seq_of(results, list) + single_frame = False + + # Get temporal length of input + T = len(results) + + # Collate the input results to pose sequences + poses = self._collate_pose(results) + + # Smooth the pose sequence of each target + smoothed_poses = {} + update_history = {} + for track_id, pose in poses.items(): + if track_id in self.history: + # For tracked target, get its filter and pose history + pose_history, pose_filter = self.history[track_id] + if self.padding_size > 0: + # Pad the pose sequence with pose history + pose = np.concatenate((pose_history, pose), axis=0) + else: + # For new target, build a new filter + pose_filter = self._get_filter() + + # Update the history information + if self.padding_size > 0: + pose_history = pose[-self.padding_size:].copy() + else: + pose_history = None + update_history[track_id] = (pose_history, pose_filter) + + # Smooth the pose sequence with the filter + smoothed_pose = pose_filter(pose) + smoothed_poses[track_id] = smoothed_pose[-T:] + + self.history = update_history + + # Scatter the pose sequences back to the format of results + smoothed_results = self._scatter_pose(results, smoothed_poses) + + # If the input is single frame, remove the nested list to keep the + # output structure consistent with the input's + if single_frame: + smoothed_results = smoothed_results[0] + return smoothed_results \ No newline at end of file diff --git a/EdgeCape/models/utils/transformer.py b/EdgeCape/models/utils/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..de56b404646363fefd47ac495ba01503c394d5e8 --- /dev/null +++ b/EdgeCape/models/utils/transformer.py @@ -0,0 +1,329 @@ +import torch +import torch.nn as nn +from mmcv.cnn import (build_activation_layer, build_conv_layer, + build_norm_layer, xavier_init) +from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, + TransformerLayerSequence, + build_transformer_layer_sequence) +from mmcv.runner.base_module import BaseModule +from EdgeCape.models.utils.builder import TRANSFORMER + +@TRANSFORMER.register_module() +class Transformer(BaseModule): + """Implements the DETR transformer. + Following the official DETR implementation, this module copy-paste + from torch.nn.Transformer with modifications: + * positional encodings are passed in MultiheadAttention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers + See `paper: End-to-End Object Detection with Transformers + `_ for details. + Args: + encoder (`mmcv.ConfigDict` | Dict): Config of + TransformerEncoder. Defaults to None. + decoder ((`mmcv.ConfigDict` | Dict)): Config of + TransformerDecoder. Defaults to None + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Defaults to None. 
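+
+    Example (illustrative config sketch; the layer hyper-parameters below are
+    assumptions for demonstration, not values taken from EdgeCape configs):
+        >>> encoder_cfg = dict(
+        >>>     type='DetrTransformerEncoder',
+        >>>     num_layers=3,
+        >>>     transformerlayers=dict(
+        >>>         type='BaseTransformerLayer',
+        >>>         attn_cfgs=dict(
+        >>>             type='MultiheadAttention', embed_dims=256, num_heads=8),
+        >>>         feedforward_channels=1024,
+        >>>         operation_order=('self_attn', 'norm', 'ffn', 'norm')))
+        >>> decoder_cfg = dict(
+        >>>     type='DetrTransformerDecoder',
+        >>>     num_layers=3,
+        >>>     return_intermediate=True,
+        >>>     transformerlayers=dict(
+        >>>         type='DetrTransformerDecoderLayer',
+        >>>         attn_cfgs=dict(
+        >>>             type='MultiheadAttention', embed_dims=256, num_heads=8),
+        >>>         feedforward_channels=1024,
+        >>>         operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
+        >>>                          'ffn', 'norm')))
+        >>> transformer = Transformer(encoder=encoder_cfg, decoder=decoder_cfg)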
+ """ + + def __init__(self, encoder=None, decoder=None, init_cfg=None): + super(Transformer, self).__init__(init_cfg=init_cfg) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = self.encoder.embed_dims + + def init_weights(self): + # follow the official DETR to init parameters + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + self._is_init = True + + def forward(self, x, mask, query_embed, pos_embed, mask_query): + """Forward function for `Transformer`. + Args: + x (Tensor): Input query with shape [bs, c, h, w] where + c = embed_dims. + mask (Tensor): The key_padding_mask used for encoder and decoder, + with shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, with shape + [num_query, c]. + pos_embed (Tensor): The positional encoding for encoder and + decoder, with the same shape as `x`. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - out_dec: Output from decoder. If return_intermediate_dec \ + is True output has shape [num_dec_layers, bs, + num_query, embed_dims], else has shape [1, bs, \ + num_query, embed_dims]. + - memory: Output results from encoder, with shape \ + [bs, embed_dims, h, w]. + + Notes: + x: query image features with shape [bs, c, h, w] + mask: mask for x with shape [bs, h, w] + pos_embed: positional embedding for x with shape [bs, c, h, w] + query_embed: sample keypoint features with shape [bs, num_query, c] + mask_query: mask for query_embed with shape [bs, num_query] + Outputs: + out_dec: [num_layers, bs, num_query, c] + memory: [bs, c, h, w] + + """ + bs, c, h, w = x.shape + # use `view` instead of `flatten` for dynamically exporting to ONNX + x = x.view(bs, c, -1).permute(2, 0, 1) # [bs, c, h, w] -> [h*w, bs, c] + mask = mask.view(bs, -1) # [bs, h, w] -> [bs, h*w] Note: this mask should be filled with False, since all images are with the same shape. + pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1) # positional embeding for memory, i.e., the query. + memory = self.encoder( + query=x, + key=None, + value=None, + query_pos=pos_embed, + query_key_padding_mask=mask) # output memory: [hw, bs, c] + + query_embed = query_embed.permute(1, 0, 2) # [bs, num_query, c] -> [num_query, bs, c] + # target = torch.zeros_like(query_embed) + # out_dec: [num_layers, num_query, bs, c] + out_dec = self.decoder( + query=query_embed, + key=memory, + value=memory, + key_pos=pos_embed, + # query_pos=query_embed, + query_key_padding_mask=mask_query, + key_padding_mask=mask) + out_dec = out_dec.transpose(1, 2) # [decoder_layer, bs, num_query, c] + memory = memory.permute(1, 2, 0).reshape(bs, c, h, w) + return out_dec, memory + +@TRANSFORMER_LAYER.register_module() +class DetrTransformerDecoderLayer(BaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. 
Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. + """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(DetrTransformerDecoderLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + # assert len(operation_order) == 6 + # assert set(operation_order) == set( + # ['self_attn', 'norm', 'cross_attn', 'ffn']) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetrTransformerEncoder(TransformerLayerSequence): + """TransformerEncoder of DETR. + Args: + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. Only used when `self.pre_norm` is `True` + """ + + def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs): + super(DetrTransformerEncoder, self).__init__(*args, **kwargs) + if post_norm_cfg is not None: + self.post_norm = build_norm_layer( + post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None + else: + # assert not self.pre_norm, f'Use prenorm in ' \ + # f'{self.__class__.__name__},' \ + # f'Please specify post_norm_cfg' + self.post_norm = None + + def forward(self, *args, **kwargs): + """Forward function for `TransformerCoder`. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + x = super(DetrTransformerEncoder, self).forward(*args, **kwargs) + if self.post_norm is not None: + x = self.post_norm(x) + return x + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetrTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, + *args, + post_norm_cfg=dict(type='LN'), + return_intermediate=False, + **kwargs): + + super(DetrTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + if post_norm_cfg is not None: + self.post_norm = build_norm_layer(post_norm_cfg, + self.embed_dims)[1] + else: + self.post_norm = None + + def forward(self, query, *args, **kwargs): + """Forward function for `TransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + if not self.return_intermediate: + x = super().forward(query, *args, **kwargs) + if self.post_norm: + x = self.post_norm(x)[None] + return x + + intermediate = [] + for layer in self.layers: + query = layer(query, *args, **kwargs) + if self.return_intermediate: + if self.post_norm is not None: + intermediate.append(self.post_norm(query)) + else: + intermediate.append(query) + return torch.stack(intermediate) + + +@TRANSFORMER.register_module() +class DynamicConv(BaseModule): + """Implements Dynamic Convolution. + This module generate parameters for each sample and + use bmm to implement 1*1 convolution. Code is modified + from the `official github repo `_ . + Args: + in_channels (int): The input feature channel. + Defaults to 256. 
+ feat_channels (int): The inner feature channel. + Defaults to 64. + out_channels (int, optional): The output feature channel. + When not specified, it will be set to `in_channels` + by default + input_feat_shape (int): The shape of input feature. + Defaults to 7. + with_proj (bool): Project two-dimentional feature to + one-dimentional feature. Default to True. + act_cfg (dict): The activation config for DynamicConv. + norm_cfg (dict): Config dict for normalization layer. Default + layer normalization. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + in_channels=256, + feat_channels=64, + out_channels=None, + input_feat_shape=7, + with_proj=True, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + init_cfg=None): + super(DynamicConv, self).__init__(init_cfg) + self.in_channels = in_channels + self.feat_channels = feat_channels + self.out_channels_raw = out_channels + self.input_feat_shape = input_feat_shape + self.with_proj = with_proj + self.act_cfg = act_cfg + self.norm_cfg = norm_cfg + self.out_channels = out_channels if out_channels else in_channels + + self.num_params_in = self.in_channels * self.feat_channels + self.num_params_out = self.out_channels * self.feat_channels + self.dynamic_layer = nn.Linear( + self.in_channels, self.num_params_in + self.num_params_out) + + self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1] + self.norm_out = build_norm_layer(norm_cfg, self.out_channels)[1] + + self.activation = build_activation_layer(act_cfg) + + num_output = self.out_channels * input_feat_shape**2 + if self.with_proj: + self.fc_layer = nn.Linear(num_output, self.out_channels) + self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1] + + def forward(self, param_feature, input_feature): + """Forward function for `DynamicConv`. + Args: + param_feature (Tensor): The feature can be used + to generate the parameter, has shape + (num_all_proposals, in_channels). + input_feature (Tensor): Feature that + interact with parameters, has shape + (num_all_proposals, in_channels, H, W). + Returns: + Tensor: The output feature has shape + (num_all_proposals, out_channels). 
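+
+        The parameters predicted from ``param_feature`` are split into an
+        input projection (``in_channels -> feat_channels``) and an output
+        projection (``feat_channels -> out_channels``); both are applied to
+        the flattened spatial features with batched matrix multiplication.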
+ """ + input_feature = input_feature.flatten(2).permute(2, 0, 1) + + input_feature = input_feature.permute(1, 0, 2) + parameters = self.dynamic_layer(param_feature) + + param_in = parameters[:, :self.num_params_in].view( + -1, self.in_channels, self.feat_channels) + param_out = parameters[:, -self.num_params_out:].view( + -1, self.feat_channels, self.out_channels) + + # input_feature has shape (num_all_proposals, H*W, in_channels) + # param_in has shape (num_all_proposals, in_channels, feat_channels) + # feature has shape (num_all_proposals, H*W, feat_channels) + features = torch.bmm(input_feature, param_in) + features = self.norm_in(features) + features = self.activation(features) + + # param_out has shape (batch_size, feat_channels, out_channels) + features = torch.bmm(features, param_out) + features = self.norm_out(features) + features = self.activation(features) + + if self.with_proj: + features = features.flatten(1) + features = self.fc_layer(features) + features = self.fc_norm(features) + features = self.activation(features) + + return features \ No newline at end of file diff --git a/EdgeCape/models/utils/visualization.py b/EdgeCape/models/utils/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..a400051e0b2fdb413d6ef0ebd44049071bc927fb --- /dev/null +++ b/EdgeCape/models/utils/visualization.py @@ -0,0 +1,629 @@ +import collections +import os +import random + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import torch +import numpy as np +import torch.nn.functional as F +import uuid + +from matplotlib.colors import BoundaryNorm +import matplotlib.patheffects as mpe +from itertools import cycle + +colors = [ + [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], + [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255], + [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255], + [255, 0, 255], [255, 0, 170], [255, 0, 85]] + + +def plot_heatmap(support_img, query_img, heatmaps, support_kp, support_w, query_kp, query_w, skeleton, + initial_proposals, prediction, radius=6, n_heatmaps=5): + h, w, c = support_img.shape + fig, axes = plt.subplots(n_heatmaps + 1, 4, gridspec_kw={'wspace': 0, 'hspace': 0}) + fig.set_size_inches(40, 10 * (n_heatmaps - 1), forward=True) + [axi.set_axis_off() for axi in axes.ravel()] + plt.subplots_adjust(wspace=0, hspace=0) + # Plot Skeleton + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img)) + axes[0, 0].imshow(support_img) + axes[0, 1].imshow(query_img) + axes[0, 2].imshow(support_img) + axes[0, 3].imshow(query_img) + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = support_kp[k, :2] + c = (1, 0, 0, 0.75) if support_w[k] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius, color=c) + axes[0, 0].add_patch(patch) + axes[0, 0].text(kp[0], kp[1], k) + kp = query_kp[k, :2] + c = (1, 0, 0, 0.75) if query_kp[k, 2] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius, color=c) + axes[0, 1].add_patch(patch) + axes[0, 1].text(kp[0], kp[1], k) + plt.draw() + for l, limb in enumerate(skeleton): + if l > len(colors) - 1: + c = [x / 255 for x in random.sample(range(0, 255), 3)] + else: + c = [x / 255 for x in colors[l]] + if support_w[limb[0]] > 0 and support_w[limb[1]] > 0 and query_w[limb[0]] > 0 and query_w[limb[1]] > 0: + patch = plt.Line2D([support_kp[limb[0], 0], support_kp[limb[1], 0]], + 
[support_kp[limb[0], 1], support_kp[limb[1], 1]], + linewidth=2, color=c, alpha=0.5) + axes[0, 2].add_artist(patch) + patch = plt.Line2D([query_kp[limb[0], 0], query_kp[limb[1], 0]], + [query_kp[limb[0], 1], query_kp[limb[1], 1]], + linewidth=2, color=c, alpha=0.5) + axes[0, 3].add_artist(patch) + # Plot heatmap + prediction = prediction[-1] * h + initial_proposals = initial_proposals[0] * h + # similarity_map = F.interpolate(heatmaps[:, None], size=(h, w), mode='bilinear').squeeze() + similarity_map = heatmaps + # similarity_map_shape = similarity_map.shape + # similarity_map = similarity_map.reshape(*similarity_map_shape[:2], -1) + # similarity_map = (similarity_map - torch.min( + # similarity_map, dim=2)[0].unsqueeze(2)) / ( + # torch.max(similarity_map, dim=2)[0].unsqueeze(2) - + # torch.min(similarity_map, dim=2)[0].unsqueeze(2) + 1e-10) + j = 0 + for i in range(n_heatmaps): + if support_w[j] > 0 and query_w[j] > 0: + if i > len(colors) - 1: + c = [x / 255 for x in random.sample(range(0, 255), 3)] + else: + c = [x / 255 for x in colors[i]] + kp = support_kp[j, :2] + patch = plt.Circle(kp, radius, color=c, alpha=0.6) + axes[i + 1, 0].add_patch(patch) + axes[i + 1, 0].text(kp[0], kp[1], j) + axes[i + 1, 0].imshow(support_img) + axes[i + 1, 1].imshow(similarity_map[j].cpu().numpy(), alpha=0.6, cmap='jet') + axes[i + 1, 2].imshow(query_img) + patch = plt.Circle(initial_proposals[j], 0.2 * h, color=c, alpha=0.6) + axes[i + 1, 2].add_patch(patch) + patch = plt.Circle(query_kp[j], radius, color=(1, 0, 0), alpha=0.8) + axes[i + 1, 2].add_patch(patch) + axes[i + 1, 2].text(initial_proposals[j][0], initial_proposals[j][1], j) + axes[i + 1, 3].imshow(query_img) + patch = plt.Circle(prediction[j], 0.2 * h, color=c, alpha=0.6) + axes[i + 1, 3].add_patch(patch) + patch = plt.Circle(query_kp[j], radius, color=(1, 0, 0), alpha=0.8) + axes[i + 1, 3].add_patch(patch) + axes[i + 1, 3].text(initial_proposals[j][0], initial_proposals[j][1], j) + j += 1 + if j > 99: + break + img_names = [img.split(".")[0] for img in os.listdir('./heatmaps') if img.endswith('.png')] + if len(img_names) > 0: + name_idx = max([int(img_name) for img_name in img_names]) + 1 + else: + name_idx = 0 + plt.savefig(f'./heatmaps/{str(name_idx)}.png') + plt.clf() + + +def plot_attn(support_img, query_img, similarity_map, support_kp, support_w, query_kp, query_w, skeleton, + attn_map, adjs, prediction, radius=14, n_heatmaps=1): + h, w, c = support_img.shape + plt.rc('xtick', labelsize=18) + plt.rc('ytick', labelsize=18) + fig, axes = plt.subplots(4, 4, gridspec_kw={'wspace': 0.2, 'hspace': 0.2}) + fig.set_size_inches(50, 50, forward=True) + axes[0, 0].set_axis_off() + axes[0, 1].set_axis_off() + axes[0, 2].set_axis_off() + axes[0, 3].set_axis_off() + plt.subplots_adjust(wspace=0.2, hspace=0.2) + # Plot Skeleton + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img)) + axes[0, 0].imshow(support_img) + axes[0, 1].imshow(query_img) + axes[0, 2].imshow(support_img) + axes[0, 3].imshow(query_img) + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = support_kp[k, :2] + c = (1, 0, 0, 0.75) if support_w[k] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius, color=c) + axes[0, 0].add_patch(patch) + axes[0, 0].text(kp[0], kp[1], k) + kp = query_kp[k, :2] + c = (1, 0, 0, 0.75) if query_kp[k, 2] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius, color=c) + axes[0, 1].add_patch(patch) + 
axes[0, 1].text(kp[0], kp[1], k) + plt.draw() + for l, limb in enumerate(skeleton): + if l > len(colors) - 1: + c = [x / 255 for x in random.sample(range(0, 255), 3)] + else: + c = [x / 255 for x in colors[l]] + if support_w[limb[0]] > 0 and support_w[limb[1]]: + patch = plt.Line2D([support_kp[limb[0], 0], support_kp[limb[1], 0]], + [support_kp[limb[0], 1], support_kp[limb[1], 1]], + linewidth=8, color=c, alpha=0.5) + axes[0, 2].add_artist(patch) + if query_w[limb[0]] > 0 and query_w[limb[1]]: + patch = plt.Line2D([query_kp[limb[0], 0], query_kp[limb[1], 0]], + [query_kp[limb[0], 1], query_kp[limb[1], 1]], + linewidth=8, color=c, alpha=0.5) + axes[0, 3].add_artist(patch) + # Plot heatmap + axes[1, 0].set_title("GT") + axes[1, 1].set_title("L1") + axes[1, 2].set_title("L2") + axes[1, 3].set_title("L3") + min_kp_pos = np.argmax(np.cumsum(query_w)) + 1 + mask = torch.from_numpy(query_w).bool()[None] + gt_A = adj_mx_from_edges(num_pts=100, skeleton=[skeleton], device=mask.device).cpu().numpy() + gt_A = gt_A[:min_kp_pos, :min_kp_pos] + axes[1, 0].imshow(gt_A, alpha=0.6, cmap='Reds') + for i in range(min_kp_pos): + for j in range(min_kp_pos): + text = axes[1, 0].text(j, i, np.round(gt_A[i, j], 2), ha="center", va="center") + np.fill_diagonal(gt_A, 0) + axes[2, 0].imshow(gt_A, alpha=0.6, cmap='Reds') + for i in range(min_kp_pos): + for j in range(min_kp_pos): + text = axes[2, 0].text(j, i, np.round(gt_A[i, j], 2), ha="center", va="center") + for col, attn in enumerate(attn_map): + heatmap = attn[:, :min_kp_pos, :min_kp_pos].squeeze().cpu().numpy() + axes[1, col+1].imshow(heatmap, alpha=0.6, cmap='Reds') + for i in range(min_kp_pos): + for j in range(min_kp_pos): + text = axes[1, col+1].text(j, i, np.round(heatmap[i, j], 2), ha="center", va="center") + # np.fill_diagonal(heatmap, 0) + # heatmap = heatmap / heatmap.sum(1, keepdims=True) + axes[2, col+1].imshow(heatmap, alpha=0.6, cmap='Reds') + for i in range(min_kp_pos): + for j in range(min_kp_pos): + text = axes[2, col+1].text(j, i, np.round(heatmap[i, j], 2), ha="center", va="center") + + # Plot self-attention on image + self_attention_skeleton = [] + for i in range(min_kp_pos): + topk = np.argsort(heatmap[i])[::-1] + for m in range(5): + self_attention_skeleton.append([i, topk[m], heatmap[i, topk[m]]]) + axes[3, col+1].imshow(query_img) + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = query_kp[k, :2] + c = (1, 0, 0, 0.75) if query_kp[k, 2] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius//2, color=c) + axes[3, col+1].add_patch(patch) + axes[3, col+1].text(kp[0], kp[1], k, fontsize=12) + plt.draw() + for l, limb in enumerate(self_attention_skeleton): + if query_w[limb[0]] > 0 and query_w[limb[1]]: + patch = plt.Line2D( + [query_kp[limb[0], 0], query_kp[limb[1], 0]], + [query_kp[limb[0], 1], query_kp[limb[1], 1]], + linewidth=30*limb[2], color='red', alpha=limb[2]) + axes[3, col+1].add_artist(patch) + # cur_adj = torch.nn.functional.sigmoid(adjs[col])[:, :min_kp_pos, :min_kp_pos].squeeze().cpu().numpy() + # axes[3, col + 1].imshow(cur_adj, alpha=0.6, cmap='Reds') + # for i in range(min_kp_pos): + # for j in range(min_kp_pos): + # text = axes[3, col + 1].text(j, i, np.round(cur_adj[i, j], 2), ha="center", va="center") + img_names = [img.split(".")[0] for img in os.listdir('./heatmaps') if str_is_int(img.split(".")[0])] + if len(img_names) > 0: + name_idx = max([int(img_name) for img_name in img_names]) + 1 + else: + name_idx = 0 + # crete dir + # if not os.path.isdir(f'./heatmaps/{str(name_idx)}'): + # 
os.mkdir(f'./heatmaps/{str(name_idx)}') + plt.savefig(f'./heatmaps/{str(name_idx)}.png') + extent = axes[3,3].get_window_extent().transformed( + fig.dpi_scale_trans.inverted()) + fig.savefig(f'./heatmaps/layer_{str(name_idx)}.png', bbox_inches=extent) + # for k, row in enumerate(axes): + # for i, ax in enumerate(row): + # extent = ax.get_window_extent().transformed(fig.dpi_scale_trans.inverted()) + # plt.savefig(f'./heatmaps/{str(name_idx)}/{str(k)}_{str(i)}.png', bbox_inches=extent) + + plt.clf() + + +def plot_results(support_img, query_img, support_kp, support_w, query_kp, query_w, + skeleton=None, initial_proposals=None, prediction=None, + radius=6, out_dir='./heatmaps', file_name=None, in_color=None, + original_skeleton=None, img_alpha=0.6, target_keypoints=None): + img_names = [img.split("_")[0] for img in os.listdir(out_dir) if str_is_int(img.split("_")[0])] + if file_name is None: + if len(img_names) > 0: + name_idx = str(max([int(img_name) for img_name in img_names]) + 1) + else: + name_idx = '0' + else: + name_idx = file_name + # crete dir + # if not os.path.isdir(f'./heatmaps/{str(name_idx)}'): + # os.mkdir(f'./heatmaps/{str(name_idx)}') + + h, w, c = support_img.shape + prediction = prediction[-1] * h + if isinstance(prediction, torch.Tensor): + prediction = prediction.cpu().numpy() + if isinstance(skeleton, list): + skeleton = adj_mx_from_edges(num_pts=100, skeleton=[skeleton]).cpu().numpy()[0] + original_skeleton = skeleton + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img)) + error_mask = None + for id, (img, w, keypoint, adj) in enumerate(zip([support_img, support_img, query_img], + [support_w, support_w, query_w], + # [support_kp, query_kp])): + [support_kp, support_kp, prediction], + [original_skeleton, skeleton, skeleton])): + color = in_color + f, axes = plt.subplots() + plt.imshow(img, alpha=img_alpha) + + # On qeury image plot + if id == 2 and target_keypoints is not None: + error = np.linalg.norm(keypoint - target_keypoints, axis=-1) + error_mask = error > (256 * 0.05) + + for k in range(keypoint.shape[0]): + if w[k] > 0: + kp = keypoint[k, :2] + c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6) + if error_mask is not None and error_mask[k]: + c = (1, 1, 0, 0.75) + patch = plt.Circle(kp, + radius, + color=c, + path_effects=[mpe.withStroke(linewidth=8, foreground='black'), + mpe.withStroke(linewidth=4, foreground='white'), + mpe.withStroke(linewidth=2, foreground='black'), + ], + zorder=260) + axes.add_patch(patch) + axes.text(kp[0], kp[1], k, fontsize=10, color='black', ha="center", va="center", zorder=320,) + else: + patch = plt.Circle(kp, + radius, + color=c, + path_effects=[mpe.withStroke(linewidth=2, foreground='black')], + zorder=200) + axes.add_patch(patch) + axes.text(kp[0], kp[1], k, fontsize=(radius+4), color='white', ha="center", va="center", zorder=300, + path_effects=[mpe.withStroke(linewidth=max(1, int((radius+4)/5)), foreground='black')]) + # axes.text(kp[0], kp[1], k) + plt.draw() + # Create keypoint pairs index list + # color_hack = { + # (0, 1): '3000ff', + # (1, 2): 'ff008a', + # (2, 3): 'ff00de', + # (3, 4): 'd200ff', + # (4, 5): '8400ff', + # (5, 0): '003cff', + # } + # reverse_key_color_hack = {(k[1], k[0]): v for k, v in color_hack.items()} + # color_hack = {**color_hack, **reverse_key_color_hack} + if adj is not None: + # Make max value 6 + draw_skeleton = adj ** 1 + max_skel_val = np.max(draw_skeleton) + 
draw_skeleton = draw_skeleton / max_skel_val * 6 + for i in range(1, keypoint.shape[0]): + for j in range(0, i): + # if c_index > len(colors) - 1: + # c = [x / 255 for x in random.sample(range(0, 255), 3)] + # else: + # c = [x / 255 for x in colors[c_index]] + # if (i, j) in color_hack: + # c = color_hack[(i, j)] + # c = [int(c[i:i + 2], 16) / 255 for i in (0, 2, 4)] + # c_index -= 1 + if w[i] > 0 and w[j] > 0 and original_skeleton[i][j] > 0: + if color is None: + num_colors = int((skeleton > 0.05).sum() / 2) + color = iter(plt.cm.rainbow(np.linspace(0, 1, num_colors+1))) + c = next(color) + elif isinstance(color, str): + c = color + elif isinstance(color, collections.Iterable): + c = next(color) + else: + raise ValueError("Color must be a string or an iterable") + if w[i] > 0 and w[j] > 0 and skeleton[i][j] > 0: + width = draw_skeleton[i][j] + stroke_width = width + (width / 3) + patch = plt.Line2D([keypoint[i, 0], keypoint[j, 0]], + [keypoint[i, 1], keypoint[j, 1]], + linewidth=width, color=c, alpha=0.6, + path_effects=[mpe.withStroke(linewidth=stroke_width, foreground='black')], zorder=1) + axes.add_artist(patch) + + plt.axis('off') # command for hiding the axis. + plt.savefig(f'./{out_dir}/{str(name_idx)}_{str(id)}.png', bbox_inches='tight', pad_inches=0) + plt.clf() + # plt.close('all') + + +def old_plot_results(support_img, query_img, support_kp, support_w, query_kp, query_w, skeleton, + initial_proposals, prediction, radius=6, out_dir='./heatmaps', + file_name=None): + img_names = [img.split("_")[0] for img in os.listdir(out_dir) if str_is_int(img.split("_")[0])] + if file_name is None: + if len(img_names) > 0: + name_idx = str(max([int(img_name) for img_name in img_names]) + 1) + else: + name_idx = '0' + else: + name_idx = file_name + # crete dir + # if not os.path.isdir(f'./heatmaps/{str(name_idx)}'): + # os.mkdir(f'./heatmaps/{str(name_idx)}') + + h, w, c = support_img.shape + prediction = prediction[-1].cpu().numpy() * h + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img)) + + for id, (img, w, keypoint) in enumerate(zip([support_img, query_img], + [support_w, query_w], + # [support_kp, query_kp])): + [support_kp, prediction])): + f, axes = plt.subplots() + plt.imshow(img) + for k in range(keypoint.shape[0]): + if w[k] > 0: + kp = keypoint[k, :2] + c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6) + patch = plt.Circle(kp, radius, color=c) + axes.add_patch(patch) + axes.text(kp[0], kp[1], k, fontsize=20) + # axes.text(kp[0], kp[1], k) + plt.draw() + # Create keypoint pairs index list + # color_hack = { + # (0, 1): '3000ff', + # (1, 2): 'ff008a', + # (2, 3): 'ff00de', + # (3, 4): 'd200ff', + # (4, 5): '8400ff', + # (5, 0): '003cff', + # } + # reverse_key_color_hack = {(k[1], k[0]): v for k, v in color_hack.items()} + # color_hack = {**color_hack, **reverse_key_color_hack} + # c_index = 0 + # for i in range(1, keypoint.shape[0]): + # for j in range(0, i): + # if c_index > len(colors) - 1: + # c = [x / 255 for x in random.sample(range(0, 255), 3)] + # else: + # c = [x / 255 for x in colors[c_index]] + # if (i, j) in color_hack: + # c = color_hack[(i, j)] + # c = [int(c[i:i + 2], 16) / 255 for i in (0, 2, 4)] + # c_index -= 1 + # if w[i] > 0 and w[j] > 0 and skeleton[i][j] > 0: + # patch = plt.Line2D([keypoint[i, 0], keypoint[j, 0]], + # [keypoint[i, 1], keypoint[j, 1]], + # # linewidth=skeleton[i][j]*20, color=c, alpha=0.6) + # linewidth=5, color=c, 
alpha=0.6) + # axes.add_artist(patch) + # c_index += 1 + + for l, limb in enumerate(skeleton): + kp = keypoint[:, :2] + if l > len(colors) - 1: + c = [x / 255 for x in random.sample(range(0, 255), 3)] + else: + c = [x / 255 for x in colors[l]] + if w[limb[0]] > 0 and w[limb[1]] > 0: + patch = plt.Line2D([kp[limb[0], 0], kp[limb[1], 0]], + [kp[limb[0], 1], kp[limb[1], 1]], + linewidth=6, color=c, alpha=0.6) + axes.add_artist(patch) + plt.axis('off') # command for hiding the axis. + plt.savefig(f'./{out_dir}/{str(name_idx)}_{str(id)}.png', bbox_inches='tight', pad_inches=0) + plt.clf() + +def str_is_int(s): + try: + int(s) + return True + except ValueError: + return False + + +def adj_mx_from_edges(num_pts, skeleton, device='cuda', normalization_fix=True): + adj_mx = torch.empty(0, device=device) + batch_size = len(skeleton) + for b in range(batch_size): + edges = torch.tensor(skeleton[b]) + adj = torch.zeros(num_pts, num_pts, device=device) + adj[edges[:, 0], edges[:, 1]] = 1 + adj_mx = torch.concatenate((adj_mx, adj.unsqueeze(0)), dim=0) + trans_adj_mx = torch.transpose(adj_mx, 1, 2) + cond = (trans_adj_mx > adj_mx).float() + adj = adj_mx + trans_adj_mx * cond - adj_mx * cond + # if normalization_fix: + # adj = adj * ~mask[..., None] * ~mask[:, None] + # adj = torch.nan_to_num(adj / adj.sum(dim=-1, keepdim=True)) + # else: + # adj = torch.nan_to_num(adj / adj.sum(dim=2, keepdim=True)) * ~mask[..., None] * ~mask[:, None] + # adj = torch.stack((torch.diag_embed(~mask), adj), dim=1) + return adj + +def vis_skeleton(support_img, support_kp, support_w, a_pred, a_gt, file_name=None, radius=3, line_width=6, alpha=0.8): + h, w, c = support_img.shape + # Normalize the support image + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + # Create figure + fig, axes = plt.subplots(2, 2, figsize=(20, 20), gridspec_kw={'height_ratios': [1, 1]}) + axes[0, 0].imshow(support_img, alpha=0.6) + axes[0, 1].imshow(support_img, alpha=0.6) + axes[1, 0].imshow(support_img, alpha=0.6) + axes[1, 1].axis('off') # Hide the unused subplot + + a_pred = (a_pred + a_pred.T) / 2 + scaled_a_pred = a_pred ** 1 * line_width + max_val = np.max(scaled_a_pred) + for i in range(a_pred.shape[0]): + for j in range(i + 1, a_pred.shape[1]): + if support_w[i] > 0 and support_w[j] > 0 and a_pred[i, j] > 0: + kp1 = support_kp[i, :2] + kp2 = support_kp[j, :2] + width = scaled_a_pred[i, j] + stroke_width = width + (width / 3) + outline = mpe.withStroke(linewidth=stroke_width, foreground='black') + patch = plt.Line2D([kp1[0], kp2[0]], [kp1[1], kp2[1]], + path_effects=[outline], + linewidth=width, + color='blue', + alpha=alpha) + axes[0, 0].add_artist(patch) + + # Plot keypoints and skeleton for predicted adjacency matrix + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = support_kp[k, :2] + outline = mpe.withStroke(linewidth=2, foreground='black') + patch = plt.Circle(kp, radius, color=(1, 0, 0, 1), path_effects=[outline], zorder=200) + axes[0, 0].add_patch(patch) + + a_gt = (a_gt + a_gt.T) / 2 + for i in range(a_gt.shape[0]): + for j in range(i + 1, a_gt.shape[1]): + if support_w[i] > 0 and support_w[j] > 0 and a_gt[i, j] > 0: + kp1 = support_kp[i, :2] + kp2 = support_kp[j, :2] + width = a_gt[i, j] * max_val + outline = mpe.withStroke(linewidth=width+2, foreground='black') + patch = plt.Line2D([kp1[0], kp2[0]], [kp1[1], kp2[1]], + path_effects=[outline], + linewidth=width, + color='green', + alpha=alpha) + axes[0, 1].add_artist(patch) + + # Plot keypoints and skeleton for 
predicted adjacency matrix + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = support_kp[k, :2] + outline = mpe.withStroke(linewidth=3, foreground='black') + patch = plt.Circle(kp, radius, color=(1, 0, 0, 1), path_effects=[outline], zorder=200) + axes[0, 1].add_patch(patch) + # axes[0, 0].text(kp[0], kp[1], + # k, + # path_effects=[mpe.Stroke(linewidth=2, foreground='black'), mpe.Normal()], + # fontsize=12, + # color='white', + # ha="center", + # va="center", + # zorder=300) + + # Calculate the difference and plot the skeleton with color based on the difference + diff = (a_pred - a_gt) / (a_gt + 1e-10) + for k in range(support_kp.shape[0]): + if support_w[k] > 0: + kp = support_kp[k, :2] + patch = plt.Circle(kp, radius, color=(1, 0, 0, 0.75)) + axes[1, 0].add_patch(patch) + axes[1, 0].text(kp[0], kp[1], k, fontsize=8) + + cmap = shiftedColorMap(plt.cm.Spectral, midpoint=0.34) + norm = plt.Normalize(vmin=-1., vmax=2.) + + for i in range(diff.shape[0]): + for j in range(i + 1, diff.shape[1]): + if support_w[i] > 0 and support_w[j] > 0 and diff[i, j] != 0: + kp1 = support_kp[i, :2] + kp2 = support_kp[j, :2] + color = cmap(norm(diff[i, j])) + patch = plt.Line2D([kp1[0], kp2[0]], [kp1[1], kp2[1]], + linewidth=line_width/2, + color=color, + alpha=alpha) + axes[1, 0].add_artist(patch) + + # axes[0, 0].set_title('Predicted Adjacency Matrix') + # axes[0, 1].set_title('Ground-Truth Adjacency Matrix') + # axes[1, 0].set_title(r'$\frac{(a_{pred} - a_{gt})}{a_{gt}}$') + + for ax in axes[0, :]: + ax.axis('off') + for ax in axes[1, :]: + ax.axis('off') + + cbar = fig.colorbar(plt.cm.ScalarMappable(cmap=cmap, norm=norm), ax=axes[1, 0], orientation='vertical') + cbar.set_label('Difference') + + if file_name: + path = f'./heatmaps/{file_name}' + plt.savefig(f'{path}_pred.png', bbox_inches='tight', pad_inches=0) + extent = axes[0, 0].get_window_extent().transformed(fig.dpi_scale_trans.inverted()) + fig.savefig(f'{path}_prediction.png', bbox_inches=extent) + extent = axes[0, 1].get_window_extent().transformed(fig.dpi_scale_trans.inverted()) + fig.savefig(f'{path}_gt.png', bbox_inches=extent) + extent = axes[1, 0].get_window_extent().transformed(fig.dpi_scale_trans.inverted()) + fig.savefig(f'{path}_diff.png', bbox_inches=extent.expanded(1.6, 1.3)) + plt.cla() + + +def shiftedColorMap(cmap, start=0, midpoint=0.5, stop=1.0, name='shiftedcmap'): + ''' + Function to offset the "center" of a colormap. Useful for + data with a negative min and positive max and you want the + middle of the colormap's dynamic range to be at zero. + + Input + ----- + cmap : The matplotlib colormap to be altered + start : Offset from lowest point in the colormap's range. + Defaults to 0.0 (no lower offset). Should be between + 0.0 and `midpoint`. + midpoint : The new center of the colormap. Defaults to + 0.5 (no shift). Should be between 0.0 and 1.0. In + general, this should be 1 - vmax / (vmax + abs(vmin)) + For example if your data range from -15.0 to +5.0 and + you want the center of the colormap at 0.0, `midpoint` + should be set to 1 - 5/(5 + 15)) or 0.75 + stop : Offset from highest point in the colormap's range. + Defaults to 1.0 (no upper offset). Should be between + `midpoint` and 1.0. 
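+
+    Example (an illustrative sketch; ``data`` stands in for any 2-D array
+    you want to display)::
+
+        new_cmap = shiftedColorMap(matplotlib.cm.RdBu, midpoint=0.75)
+        plt.imshow(data, cmap=new_cmap)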
+ ''' + cdict = { + 'red': [], + 'green': [], + 'blue': [], + 'alpha': [] + } + + # regular index to compute the colors + reg_index = np.linspace(start, stop, 257) + + # shifted index to match the data + shift_index = np.hstack([ + np.linspace(0.0, midpoint, 128, endpoint=False), + np.linspace(midpoint, 1.0, 129, endpoint=True) + ]) + + for ri, si in zip(reg_index, shift_index): + r, g, b, a = cmap(ri) + + cdict['red'].append((si, r, r)) + cdict['green'].append((si, g, g)) + cdict['blue'].append((si, b, b)) + cdict['alpha'].append((si, a, a)) + + newcmap = matplotlib.colors.LinearSegmentedColormap(name, cdict) + # plt.register_cmap(cmap=newcmap) + + return newcmap \ No newline at end of file diff --git a/EdgeCape/version.py b/EdgeCape/version.py new file mode 100644 index 0000000000000000000000000000000000000000..11e2ca150513d566dca35e045548b914b1b6bfa7 --- /dev/null +++ b/EdgeCape/version.py @@ -0,0 +1,5 @@ +# GENERATED VERSION FILE +# TIME: Wed May 31 16:07:32 2023 +__version__ = '0.2.0+818517e' +short_version = '0.2.0' +version_info = (0, 2, 0) diff --git a/README.md b/README.md index f39d3b782cd5df9a0bbc7cdd5a0e0ec662fb53fa..a214aa3e6322693b09c7989cfa7dc8a400b8e080 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,107 @@ ---- -title: EdgeCape -emoji: 🏆 -colorFrom: yellow -colorTo: blue -sdk: gradio -sdk_version: 5.7.1 -app_file: app.py -pinned: false ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# Edge Weight Prediction For Category-Agnostic Pose Estimation + + + + + + +By [Or Hirschorn](https://scholar.google.co.il/citations?user=GgFuT_QAAAAJ&hl=iw&oi=ao) and [Shai Avidan](https://scholar.google.co.il/citations?hl=iw&user=hpItE1QAAAAJ) + +This repo is the official implementation of "[Edge Weight Prediction For Category-Agnostic Pose Estimation +](https://arxiv.org/abs/2411.16665)". + +# Hugging Face Demo Coming Soon! +### Stay tuned for the upcoming demo release! + + +## 🔔 News +- **`25 November 2024`** Initial Code Release + + +## Introduction +Given only one example image and skeleton, our method refines the skeleton to enhance pose estimation on unseen categories. + +Using our method, given a support image and skeleton we can refine the structure for better pose estimation on images from unseen categories. + +## Citation +Please consider citing our paper and GraphCape if you found our work useful: +```bibtex +@misc{hirschorn2024edgeweightpredictioncategoryagnostic, + title={Edge Weight Prediction For Category-Agnostic Pose Estimation}, + author={Or Hirschorn and Shai Avidan}, + year={2024}, + eprint={2411.16665}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2411.16665}, +} + +@misc{hirschorn2023pose, + title={A Graph-Based Approach for Category-Agnostic Pose Estimation}, + author={Or Hirschorn and Shai Avidan}, + year={2024}, + eprint={2311.17891}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2311.17891}, +} +``` + +## Getting Started + +### Docker [Recommended] +We provide a docker image for easy use. +You can simply pull the docker image from docker hub, containing all the required libraries and packages: + +``` +docker pull orhir/edgecape +docker run --name edgecape -v {DATA_DIR}:/workspace/EdgeCape/EdgeCape/data/mp100 -it orhir/edgecape /bin/bash +``` +### Conda Environment +We train and evaluate our model on Python 3.8 and Pytorch 2.0.1 with CUDA 12.1. + +Please first install pytorch and torchvision following official documentation Pytorch. 
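+For reference, a minimal environment setup might look like the sketch below (illustrative only; the environment name is arbitrary, and the exact CUDA-specific install command should come from the official PyTorch installation selector):
+```
+conda create -n edgecape python=3.8 -y
+conda activate edgecape
+pip install torch==2.0.1 torchvision==0.15.2
+```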
+Then, follow [MMPose](https://mmpose.readthedocs.io/en/latest/installation.html) to install the following packages: +``` +mmcv-full=1.7.2 +mmpose=0.29.0 +``` +Having installed these packages, run: +``` +python setup.py develop +``` + +## MP-100 Dataset +Please follow the [official guide](https://github.com/orhir/PoseAnything) to prepare the MP-100 dataset for training and evaluation, and organize the data structure properly. + +## Training + +### Training +To train the model, run: +``` +python run.py --config [path_to_config_file] --work_dir [path_to_work_dir] +``` + +## Evaluation and Pretrained Models + +### Evaluation +The evaluation on a single GPU will take approximately 30 min. + +To evaluate the pretrained model, run: +``` +python test.py [path_to_config_file] [path_to_pretrained_ckpt] +``` + +### Pretrained Models + +You can download the pretrained models from following [link](https://drive.google.com/drive/folders/1gbeeVQ-Y8Dj2FrsDatf5ZLWpzv5u8HyL?usp=sharing). + +## Acknowledgement + +Our code is based on code from: + - [MMPose](https://github.com/open-mmlab/mmpose) + - [PoseAnything](https://github.com/orhir/PoseAnything) + + +## License +This project is released under the Apache 2.0 license. \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..e0827e8b712b895edd76ceb408ce337b060fc56b --- /dev/null +++ b/app.py @@ -0,0 +1,148 @@ +import argparse +from pathlib import Path + +import gradio as gr +import matplotlib + +from gradio_utils.utils import (process_img, get_select_coords, select_skeleton, + reset_skeleton, reset_kp, process) + +LENGTH = 480 # Length of the square area displaying/editing images + +matplotlib.use('agg') +model_dir = Path('./checkpoints') +parser = argparse.ArgumentParser(description='EdgeCape Demo') +parser.add_argument('--checkpoint', + help='checkpoint path', + default='ckpt/1shot_split1.pth') +args = parser.parse_args() +checkpoint_path = args.checkpoint +device = 'cuda' +TIMEOUT = 80 + +with gr.Blocks() as demo: + gr.Markdown(''' + # We introduce EdgeCape, a novel framework that overcomes these limitations by predicting the graph's edge weights which optimizes localization. + To further leverage structural priors, we propose integrating Markovian Structural Bias, which modulates the self-attention interaction between nodes based on the number of hops between them. + We show that this improves the model’s ability to capture global spatial dependencies. + Evaluated on the MP-100 benchmark, which includes 100 categories and over 20K images, + EdgeCape achieves state-of-the-art results in the 1-shot setting and leads among similar-sized methods in the 5-shot setting, significantly improving keypoint localization accuracy. + ### [Paper](https://arxiv.org/pdf/2411.16665) | [Project Page](https://orhir.github.io/edge_cape/) + ## Instructions + 1. Upload an image of the object you want to pose. + 2. Mark keypoints on the image. + 3. Mark limbs on the image. + 4. Upload an image of the object you want to pose to the query image (**bottom**). + 5. Click **Evaluate** to pose the query image. + ''') + + global_state = gr.State({ + "images": {}, + "points": [], + "skeleton": [], + "prev_point": None, + "curr_type_point": "start", + }) + with gr.Row(): + # Upload & Preprocess Image Column + with gr.Column(): + gr.Markdown( + """

+                Upload & Preprocess Image

""" + ) + support_image = gr.Image( + height=LENGTH, + width=LENGTH, + type="pil", + image_mode="RGB", + label="Preprocess Image", + show_label=True, + interactive=True, + ) + + # Click Points Column + with gr.Column(): + gr.Markdown( + """

+                Click Points

""" + ) + kp_support_image = gr.Image( + type="pil", + label="Keypoints Image", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + show_fullscreen_button=False, + ) + with gr.Row(): + confirm_kp_button = gr.Button("Confirm Clicked Points", scale=3) + with gr.Row(): + undo_kp_button = gr.Button("Undo Clicked Points", scale=3) + + # Editing Results Column + with gr.Column(): + gr.Markdown( + """

+                Click Skeleton

""" + ) + skel_support_image = gr.Image( + type="pil", + label="Skeleton Image", + show_label=True, + height=LENGTH, + width=LENGTH, + interactive=False, + show_fullscreen_button=False, + ) + with gr.Row(): + pass + with gr.Row(): + undo_skel_button = gr.Button("Undo Skeleton") + + with gr.Row(): + with gr.Column(): + gr.Markdown( + """

+                Query Image

""" + ) + query_image = gr.Image( + type="pil", + image_mode="RGB", + label="Query Image", + show_label=True, + interactive=True, + ) + with gr.Column(): + gr.Markdown( + """

+                Output

""" + ) + output_img = gr.Plot(label="Output Image",) + with gr.Row(): + eval_btn = gr.Button(value="Evaluate") + with gr.Row(): + gr.Markdown("## Examples") + + support_image.change(process_img, + inputs=[support_image, global_state], + outputs=[kp_support_image, global_state]) + kp_support_image.select(get_select_coords, + [global_state], + [global_state, kp_support_image], + queue=False,) + confirm_kp_button.click(reset_skeleton, + inputs=global_state, + outputs=skel_support_image) + undo_kp_button.click(reset_kp, + inputs=global_state, + outputs=[kp_support_image, skel_support_image]) + undo_skel_button.click(reset_skeleton, + inputs=global_state, + outputs=skel_support_image) + skel_support_image.select(select_skeleton, + inputs=[global_state], + outputs=[global_state, skel_support_image]) + eval_btn.click(fn=process, + inputs=[query_image, global_state], + outputs=[output_img, global_state]) + +if __name__ == "__main__": + print("Start app", parser.parse_args()) + gr.close_all() + demo.launch(show_api=False) diff --git a/ckpt/1shot_split1.pth b/ckpt/1shot_split1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d28320cd9a5366a9ed809ec4b267341572f20be3 --- /dev/null +++ b/ckpt/1shot_split1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99691d78a2145d0b4892c8d6cf53505929fedfd569a488f171de37f56971829 +size 480718083 diff --git a/ckpt/testing_log.txt b/ckpt/testing_log.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7489b7377d43f951c66af4754417161a5aa6732 --- /dev/null +++ b/ckpt/testing_log.txt @@ -0,0 +1,11 @@ +** config_file: configs/test/1shot_split1.py checkpoint: ckpt/1shot_split1.pth + AUC: 0.9088144381031391 + EPE: 31.381722289753604 + NME: 0.06428522136244384 + PCK@0.05: 0.6425443164685964 + PCK@0.1: 0.8391598907887715 + PCK@0.15: 0.9041310635767583 + PCK@0.2: 0.9369288485314081 + PCK@0.25: 0.9571147947802892 + mPCK: 0.8559757828291646 +******************************************************************** diff --git a/configs/test/1shot_split1.py b/configs/test/1shot_split1.py new file mode 100644 index 0000000000000000000000000000000000000000..6d77ba0ed8248a4486bfd806edffb82e6b0bbf67 --- /dev/null +++ b/configs/test/1shot_split1.py @@ -0,0 +1,254 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict(), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, 
+ num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), 
+ valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/1shot_split2.py b/configs/test/1shot_split2.py new file mode 100644 index 0000000000000000000000000000000000000000..dc05bdaf0e6fe6c65bbcc0c513100d31a52d82b6 --- /dev/null +++ b/configs/test/1shot_split2.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + 
skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + 
num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/1shot_split3.py b/configs/test/1shot_split3.py new file mode 100644 index 0000000000000000000000000000000000000000..150d2f80af1b2d1997fcc375468f4f46c843a9c6 --- /dev/null +++ b/configs/test/1shot_split3.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + 
type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=[ + 
dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/1shot_split4.py b/configs/test/1shot_split4.py new file mode 100644 index 0000000000000000000000000000000000000000..1ad63c1843f4edb5e62b9e11a2e6a23bf1fca98e --- /dev/null +++ b/configs/test/1shot_split4.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, 
normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + 
dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/1shot_split5.py b/configs/test/1shot_split5.py new file mode 100644 index 0000000000000000000000000000000000000000..d460c1847a9f210fde4c172feb0c08b10e9e10e1 --- /dev/null +++ b/configs/test/1shot_split5.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', 
learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + 
mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/5shot_split1.py b/configs/test/5shot_split1.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b870fa5bc198a0316abec38a63258126d8914f --- /dev/null +++ b/configs/test/5shot_split1.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + 
masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + 
dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split1_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/5shot_split2.py b/configs/test/5shot_split2.py new file mode 100644 index 0000000000000000000000000000000000000000..91671ff0ef7c9a0209678e167d1112a7ee9e593a --- /dev/null +++ b/configs/test/5shot_split2.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + 
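Note on the test settings above: these test configs evaluate episodically, and with num_shots=5, num_queries=15 and num_episodes=200 a split appears to be scored on 200 x 15 = 3000 query predictions, reported as PCK at thresholds 0.05-0.25 of the bounding-box size. The helper below is only a generic PCK sketch for reference; the exact normalisation used by TestPoseDataset may differ, and pred, gt, visible and bbox_size are illustrative names, not identifiers from this repository.

import numpy as np

def pck(pred, gt, visible, bbox_size, thr=0.2):
    """Fraction of visible keypoints predicted within thr * bbox_size of the
    ground truth. pred/gt: (K, 2) arrays, visible: (K,) bool, bbox_size: scalar.
    Generic sketch only -- not the repository's evaluation code."""
    dist = np.linalg.norm(pred - gt, axis=-1)
    return float((dist[visible] <= thr * bbox_size).mean())

# Episode budget implied by the settings above:
# 200 episodes x 15 queries = 3000 scored query images per split,
# at thresholds [0.05, 0.1, 0.15, 0.2, 0.25].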
train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + 
type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split2_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/5shot_split3.py b/configs/test/5shot_split3.py new file mode 100644 index 0000000000000000000000000000000000000000..07b4fc76c6a3e625ff1176b86abbe43e0feb6ade --- /dev/null +++ b/configs/test/5shot_split3.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + 
post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + 
meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split3_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/5shot_split4.py b/configs/test/5shot_split4.py new file mode 100644 index 0000000000000000000000000000000000000000..c43ebcbc1b11bf9ed57c60473dbd25a76c9546b1 --- /dev/null +++ b/configs/test/5shot_split4.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + 
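Note on the encoder/head dimensions used above: they are mutually consistent under the usual Swin channel-doubling rule. The snippet below is only a sanity check of that arithmetic, assuming standard SwinTransformerV2 behaviour (channels double at each of the three downsampling stages, overall stride 32); it is not code from this repository.

# Assumed standard Swin behaviour; illustrative check only.
embed_dim, depths = 96, [2, 2, 6, 2]
img_size, stride = 256, 32                      # stride 32 after four stages
out_channels = embed_dim * 2 ** (len(depths) - 1)
feat_hw = img_size // stride
print(out_channels)  # 768 -> matches keypoint_head.in_channels
print(feat_hw)       # 8   -> 8x8 feature tokens fed to the transformer
# SinePositionalEncoding with num_feats=128 yields 2 * 128 = 256 dims = d_model.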
freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 
+ 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split4_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/test/5shot_split5.py b/configs/test/5shot_split5.py new file mode 100644 index 0000000000000000000000000000000000000000..ffefae18e2d1a36bed143e3aba42051c94b44b60 --- /dev/null +++ b/configs/test/5shot_split5.py @@ -0,0 +1,261 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict(type='Adam', lr=1e-05) +optimizer_config = dict(grad_clip=None) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')]) +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[[0]], + inference_channel=[0], + max_kpt_num=100) +model = dict( + type='EdgeCape', + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation='relu', + normalize_before=False, + return_intermediate_dec=True, + use_bias_attn_module=True, + attn_bias=True, + max_hops=4), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor', learn_skeleton=True), + learn_skeleton=True, + masked_supervision=True, + masking_ratio=0.5, + model_freeze='skeleton'), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11), + freeze_backbone=True) +data_cfg = dict( + image_size=[256, 256], + 
heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton' + ]) +] +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_train.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + pipeline=[ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', + rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ]), + val=dict( + type='TransformerPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_val.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 
'category_id', 'skeleton' + ]) + ]), + test=dict( + type='TestPoseDataset', + ann_file='data/mp100/annotations/mp100_split5_test.json', + img_prefix='data/mp100/images/', + data_cfg=dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=1, + num_joints=1, + dataset_channel=[[0]], + inference_channel=[0]), + valid_class_ids=None, + max_kpt_num=100, + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.1, 0.15, 0.2, 0.25], + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', + 'scale', 'rotation', 'bbox_score', 'flip_pairs', + 'category_id', 'skeleton' + ]) + ])) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') +] +visualizer = dict( + type='PoseLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend') + ], + name='visualizer') +shuffle_cfg = dict(interval=1) diff --git a/configs/train/1shot_split1.py b/configs/train/1shot_split1.py new file mode 100644 index 0000000000000000000000000000000000000000..c110649abffc39685e4b97d6868f4bd50eebfef3 --- /dev/null +++ b/configs/train/1shot_split1.py @@ -0,0 +1,180 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + encoder_config=dict(), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict(type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict(type='SkeletonPredictor') + ), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + 
type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/1shot_split2.py b/configs/train/1shot_split2.py new file mode 100644 index 0000000000000000000000000000000000000000..255d380e3de2a42bd55265e3d04aa114dab24e20 --- /dev/null +++ b/configs/train/1shot_split2.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + 
type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + 
dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/1shot_split3.py b/configs/train/1shot_split3.py new file mode 100644 index 0000000000000000000000000000000000000000..dfbe95f01fc1e2c0418e8e355a9ee05b804d25d6 --- /dev/null +++ b/configs/train/1shot_split3.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 
'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/1shot_split4.py b/configs/train/1shot_split4.py new file mode 100644 index 0000000000000000000000000000000000000000..385f13d689d1277d2ead696cd197cfd1c80e720b --- /dev/null +++ b/configs/train/1shot_split4.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + 
shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/1shot_split5.py b/configs/train/1shot_split5.py new file mode 100644 index 0000000000000000000000000000000000000000..bc76e509ac3c26bd40eaf702a8e4d902de01a7b3 --- /dev/null +++ b/configs/train/1shot_split5.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) 
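# Illustrative sketch (not part of the config): the step milestones above
# (160, 180) lie beyond total_epochs = 100, so under the usual MMCV step
# policy (assumed StepLrUpdaterHook semantics, with gamma defaulting to 0.1)
# the learning rate never decays after the linear warmup and stays at the
# base value of 1e-5 for the whole run.
def _sketch_step_lr(epoch, iters_done, base_lr=1e-5, milestones=(160, 180),
                    gamma=0.1, warmup_iters=1000, warmup_ratio=0.001):
    # Linear warmup over the first `warmup_iters` iterations.
    if iters_done < warmup_iters:
        k = (1 - iters_done / warmup_iters) * (1 - warmup_ratio)
        return base_lr * (1 - k)
    # Step decay: multiply by gamma once per milestone already passed.
    passed = sum(epoch >= m for m in milestones)
    return base_lr * gamma ** passed
# _sketch_step_lr(epoch=99, iters_done=10_000) -> 1e-05 (no decay within 100 epochs)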
+total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split5_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split5_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + 
ann_file=f'{data_root}/annotations/mp100_split5_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=1, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/5shot_split1.py b/configs/train/5shot_split1.py new file mode 100644 index 0000000000000000000000000000000000000000..6ffa78c699c5294b5e8c556f7d3ab09796bfbca5 --- /dev/null +++ b/configs/train/5shot_split1.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + 
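# Illustrative sketch (not part of the config): TopDownGenerateTargetFewShot
# above renders each keypoint as a Gaussian heatmap at heatmap_size (64x64
# here) with the configured sigma. The helper below is only an assumed,
# minimal MSRA-style version of that target for a single keypoint.
def _sketch_gaussian_target(kpt_xy, image_size=(256, 256),
                            heatmap_size=(64, 64), sigma=1.0):
    import numpy as np
    w, h = heatmap_size
    stride = image_size[0] / w                 # 4 for 256 -> 64
    mu_x, mu_y = kpt_xy[0] / stride, kpt_xy[1] / stride
    xs = np.arange(w)[None, :]                 # (1, W)
    ys = np.arange(h)[:, None]                 # (H, 1)
    # Peak of ~1.0 at the keypoint, falling off with the configured sigma.
    return np.exp(-((xs - mu_x) ** 2 + (ys - mu_y) ** 2) / (2 * sigma ** 2))
# _sketch_gaussian_target((128.0, 64.0)).shape -> (64, 64)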
+valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split1_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/5shot_split2.py b/configs/train/5shot_split2.py new file mode 100644 index 0000000000000000000000000000000000000000..e4801b3e4330d8cb66badddd6c302a440cb152ae --- /dev/null +++ b/configs/train/5shot_split2.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + 
share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split2_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/5shot_split3.py b/configs/train/5shot_split3.py new file mode 100644 index 0000000000000000000000000000000000000000..31c8032d77b8c9b48e871521504bbfecd8535dc5 --- /dev/null +++ b/configs/train/5shot_split3.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] 
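# Illustrative sketch (not part of the config): the evaluation settings below
# report PCK (alongside NME, AUC and EPE), and the test sets use the PCK
# thresholds 0.05-0.25 listed in pck_threshold_list. Assumed semantics: a
# keypoint counts as correct when its error is below threshold * bbox size.
def _sketch_pck(pred, gt, bbox_size, thr=0.2, visible=None):
    import numpy as np
    dist = np.linalg.norm(np.asarray(pred, float) - np.asarray(gt, float), axis=-1)
    if visible is None:
        visible = np.ones(len(dist), dtype=bool)
    return float((dist[visible] < thr * bbox_size).mean())
# _sketch_pck([[10, 10], [50, 40]], [[12, 11], [65, 40]], bbox_size=100) -> 1.0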
+checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + pipeline=train_pipeline), + val=dict( + 
type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split3_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/5shot_split4.py b/configs/train/5shot_split4.py new file mode 100644 index 0000000000000000000000000000000000000000..fa9edfadfad061d840268791555657b64e351d8b --- /dev/null +++ b/configs/train/5shot_split4.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + 
type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split4_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff --git a/configs/train/5shot_split5.py b/configs/train/5shot_split5.py new file mode 100644 index 0000000000000000000000000000000000000000..9956481e5db3b7cce9447e5593ad5720dc29e0f4 --- /dev/null +++ b/configs/train/5shot_split5.py @@ -0,0 +1,193 @@ +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=20) +evaluation = dict( + interval=25, + metric=['PCK', 'NME', 'AUC', 'EPE'], + key_indicator='PCK', + gpu_collect=True, + res_folder='') +optimizer = dict( + type='Adam', + lr=1e-5, +) + +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[160, 180]) +total_epochs = 100 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=1, + dataset_joints=1, + dataset_channel=[ + [ + 0, + ], + ], + inference_channel=[ + 0, + ], + max_kpt_num=100) + +# model settings +model = dict( + type='EdgeCape', + + encoder_config=dict( + type='SwinTransformerV2', + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + img_size=256 + ), + 
keypoint_head=dict( + type='TwoStageHead', + in_channels=768, + transformer=dict( + type='TwoStageSupportRefineTransformer', + d_model=256, + nhead=8, + num_encoder_layers=3, + num_decoder_layers=3, + + dim_feedforward=768, + dropout=0.1, + similarity_proj_dim=256, + dynamic_proj_dim=128, + activation="relu", + normalize_before=False, + return_intermediate_dec=True), + share_kpt_branch=False, + num_decoder_layer=3, + with_heatmap_loss=False, + + heatmap_loss_weight=2.0, + skeleton_loss_weight=1.0, + positional_encoding=dict( + type='SinePositionalEncoding', num_feats=128, normalize=True), + skeleton_head=dict( + type='SkeletonPredictor' + )), + # training and testing settings + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[64, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=15, + scale_factor=0.15), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton', + ]), +] + +valid_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffineFewShot'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTargetFewShot', sigma=1), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs', 'category_id', + 'skeleton', + ]), +] + +test_pipeline = valid_pipeline + +data_root = 'data/mp100' +data = dict( + samples_per_gpu=16, + workers_per_gpu=8, + train=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split5_train.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + pipeline=train_pipeline), + val=dict( + type='TransformerPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split5_val.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=100, + pipeline=valid_pipeline), + test=dict( + type='TestPoseDataset', + ann_file=f'{data_root}/annotations/mp100_split5_test.json', + img_prefix=f'{data_root}/images/', + # img_prefix=f'{data_root}', + data_cfg=data_cfg, + valid_class_ids=None, + max_kpt_num=channel_cfg['max_kpt_num'], + num_shots=5, + num_queries=15, + num_episodes=200, + pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25], + pipeline=test_pipeline), +) +vis_backends = [ + dict(type='LocalVisBackend'), + dict(type='TensorboardVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +shuffle_cfg = dict(interval=1) diff 
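The eight training configs in configs/train/ are identical apart from the MP-100 split index (1-5) and the number of support shots (1 or 5); validation and testing sample num_episodes episodes of num_shots support images plus num_queries query images per category. A small helper along these lines (illustrative only, but matching the file layout above and the train.py entry point used by run.py in this repo) picks the right config and launches training for one setting:

import subprocess

def train_config_path(num_shots: int, split: int) -> str:
    # Mirrors the naming convention of the config files above.
    assert num_shots in (1, 5) and 1 <= split <= 5
    return f'configs/train/{num_shots}shot_split{split}.py'

def launch_training(num_shots: int, split: int, work_root: str = 'work_dirs'):
    cfg = train_config_path(num_shots, split)
    work_dir = f'{work_root}/{num_shots}shot_split{split}'
    # train.py is the training entry point used elsewhere in this repo (see run.py).
    subprocess.run(['python', 'train.py', '--config', cfg,
                    '--work-dir', work_dir], check=True)

# Example: launch_training(5, 3) trains configs/train/5shot_split3.py.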
--git a/demo.py b/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..1a58cfbef254808f6a166bcf338beed52dd29d0a --- /dev/null +++ b/demo.py @@ -0,0 +1,273 @@ +import argparse +import copy +import pickle +import random +import cv2 +import numpy as np +import torch +from mmcv import Config, DictAction +from mmcv.cnn import fuse_conv_bn +from mmcv.runner import load_checkpoint +from mmpose.core import wrap_fp16_model +from mmpose.models import build_posenet +from torchvision import transforms +from EdgeCape import * # noqa +import torchvision.transforms.functional as F +from EdgeCape.models.utils.visualization import old_plot_results, plot_results + +COLORS = [ + [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], + [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255], + [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255], + [255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]] + + +class Resize_Pad: + def __init__(self, w=256, h=256): + self.w = w + self.h = h + + def __call__(self, image): + _, w_1, h_1 = image.shape + ratio_1 = w_1 / h_1 + # check if the original and final aspect ratios are the same within a margin + if round(ratio_1, 2) != 1: + # padding to preserve aspect ratio + if ratio_1 > 1: # Make the image higher + hp = int(w_1 - h_1) + hp = hp // 2 + image = F.pad(image, (hp, 0, hp, 0), 0, "constant") + return F.resize(image, [self.h, self.w]) + else: + wp = int(h_1 - w_1) + wp = wp // 2 + image = F.pad(image, (0, wp, 0, wp), 0, "constant") + return F.resize(image, [self.h, self.w]) + else: + return F.resize(image, [self.h, self.w]) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Pose Anything Demo') + parser.add_argument('--support', help='Image file') + parser.add_argument('--query', help='Image file') + parser.add_argument('--config', default=None, help='test config file path') + parser.add_argument('--checkpoint', default=None, help='checkpoint file') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + args = parser.parse_args() + return args + + +def merge_configs(cfg1, cfg2): + # Merge cfg2 into cfg1 + # Overwrite cfg1 if repeated, ignore if value is None. 
+ cfg1 = {} if cfg1 is None else cfg1.copy() + cfg2 = {} if cfg2 is None else cfg2 + for k, v in cfg2.items(): + if v: + cfg1[k] = v + return cfg1 + + +def main(): + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + + args = parse_args() + cfg = Config.fromfile(args.config) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.data.test.test_mode = True + + # Load data + support_img = cv2.imread(args.support) + query_img = cv2.imread(args.query) + if support_img is None or query_img is None: + raise ValueError('Fail to read images') + + preprocess = transforms.Compose([ + transforms.ToTensor(), + Resize_Pad(cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size)]) + + # frame = copy.deepcopy(support_img) + padded_support_img = preprocess(support_img).cpu().numpy().transpose(1, 2, 0) * 255 + frame = copy.deepcopy(padded_support_img.astype(np.uint8).copy()) + kp_src = [] + skeleton = [] + count = 0 + prev_pt = None + prev_pt_idx = None + color_idx = 0 + + def selectKP(event, x, y, flags, param): + nonlocal kp_src, frame + # if we are in points selection mode, the mouse was clicked, + # list of points with the (x, y) location of the click + # and draw the circle + + if event == cv2.EVENT_LBUTTONDOWN: + kp_src.append((x, y)) + cv2.circle(frame, (x, y), 2, (0, 0, 255), 1) + cv2.imshow("Source", frame) + + if event == cv2.EVENT_RBUTTONDOWN: + kp_src = [] + frame = copy.deepcopy(support_img) + cv2.imshow("Source", frame) + + def draw_line(event, x, y, flags, param): + nonlocal skeleton, kp_src, frame, count, prev_pt, prev_pt_idx, marked_frame, color_idx + if event == cv2.EVENT_LBUTTONDOWN: + closest_point = min(kp_src, key=lambda p: (p[0] - x) ** 2 + (p[1] - y) ** 2) + closest_point_index = kp_src.index(closest_point) + if color_idx < len(COLORS): + c = COLORS[color_idx] + else: + c = random.choices(range(256), k=3) + color = color_idx + cv2.circle(frame, closest_point, 2, c, 1) + if count == 0: + prev_pt = closest_point + prev_pt_idx = closest_point_index + count = count + 1 + cv2.imshow("Source", frame) + else: + cv2.line(frame, prev_pt, closest_point, c, 2) + cv2.imshow("Source", frame) + count = 0 + skeleton.append((prev_pt_idx, closest_point_index)) + color_idx = color_idx + 1 + elif event == cv2.EVENT_RBUTTONDOWN: + frame = copy.deepcopy(marked_frame) + cv2.imshow("Source", frame) + count = 0 + color_idx = 0 + skeleton = [] + prev_pt = None + + + cv2.namedWindow("Source", cv2.WINDOW_NORMAL) + cv2.resizeWindow('Source', 800, 600) + cv2.setMouseCallback("Source", selectKP) + cv2.imshow("Source", frame) + + # keep looping until points have been selected + while len(kp_src) < 1: + print('Press any key when finished marking the points!! ') + cv2.waitKey(0) + + marked_frame = copy.deepcopy(frame) + cv2.setMouseCallback("Source", draw_line) + print('Press any key when finished creating skeleton!! 
') + while True: + if cv2.waitKey(1) > 0: + break + + kp_src = torch.tensor(kp_src).float() + + preprocess = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + Resize_Pad(cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size)]) + + + support_img = preprocess(support_img).flip(0)[None] + query_img = preprocess(query_img).flip(0)[None] + # Create heatmap from keypoints + genHeatMap = TopDownGenerateTargetFewShot() + data_cfg = cfg.data_cfg + data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size]) + data_cfg['joint_weights'] = None + data_cfg['use_different_joint_weights'] = False + kp_src_3d = torch.concatenate((kp_src, torch.zeros(kp_src.shape[0], 1)), dim=-1) + kp_src_3d_weight = torch.concatenate((torch.ones_like(kp_src), torch.zeros(kp_src.shape[0], 1)), dim=-1) + target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg, kp_src_3d, kp_src_3d_weight, sigma=2) + target_s = torch.tensor(target_s).float()[None] + target_weight_s = torch.tensor(target_weight_s).float()[None] + + original_support_img = support_img.clone() + + data = { + 'img_s': [support_img.cuda()], + 'img_q': query_img.cuda(), + 'target_s': [target_s.cuda()], + 'target_weight_s': [target_weight_s.cuda()], + 'target_q': None, + 'target_weight_q': None, + 'return_loss': False, + 'img_metas': [{'sample_skeleton': [skeleton], + 'query_skeleton': skeleton, + 'sample_joints_3d': [kp_src_3d.cuda()], + 'query_joints_3d': kp_src_3d.cuda(), + 'sample_center': [kp_src.mean(dim=0)], + 'query_center': kp_src.mean(dim=0), + 'sample_scale': [kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0]], + 'query_scale': kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0], + 'sample_rotation': [0], + 'query_rotation': 0, + 'sample_bbox_score': [1], + 'query_bbox_score': 1, + 'query_image_file': '', + 'sample_image_file': [''], + }] + } + + # Load model + model = build_posenet(cfg.model).cuda() + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + load_checkpoint(model, args.checkpoint, map_location='cpu') + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + model.eval() + + with torch.no_grad(): + outputs = model(**data) + + # visualize results + vis_s_weight = target_weight_s[0] + vis_q_weight = target_weight_s[0] + vis_s_image = support_img[0].detach().cpu().numpy().transpose(1, 2, 0) + vis_q_image = query_img[0].detach().cpu().numpy().transpose(1, 2, 0) + support_kp = kp_src_3d + _, original_skeleton = model.keypoint_head.skeleton_head.adj_mx_from_edges(num_pts=outputs['points'].shape[2], + skeleton=[skeleton], + mask=target_weight_s.squeeze(-1).bool(), + device=target_weight_s.device) + skeleton = outputs['skeleton'] + plot_results(vis_s_image, + vis_q_image, + support_kp, + vis_s_weight, + None, + vis_s_weight, + skeleton, + None, + torch.tensor(outputs['points']).squeeze(), + out_dir='demo', + original_skeleton=original_skeleton[0].cpu().numpy(), + img_alpha=1.0, + radius=3, + ) + + +if __name__ == '__main__': + main() diff --git a/gradio_utils/__pycache__/utils.cpython-39.pyc b/gradio_utils/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32c19762d57d865f88668d45c6a38e45b75db866 Binary files /dev/null and b/gradio_utils/__pycache__/utils.cpython-39.pyc differ diff --git a/gradio_utils/utils.py b/gradio_utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2c906fb26ed3041907d49aa6d4e034d447760e67 
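demo.py's Resize_Pad transform first zero-pads the shorter side so the tensor becomes square (preserving aspect ratio) and only then resizes to the encoder's input resolution, so support and query images are never stretched. A short usage sketch, assuming the repo root is on PYTHONPATH so that demo.py and its dependencies import cleanly:

import numpy as np
from torchvision import transforms
from demo import Resize_Pad  # defined in demo.py above

preprocess = transforms.Compose([
    transforms.ToTensor(),   # HWC uint8 -> CHW float in [0, 1]
    Resize_Pad(256, 256),    # pad to square, then resize to 256x256
])

# A dummy 3-channel image with a 1:2 aspect ratio.
dummy = (np.random.rand(100, 200, 3) * 255).astype(np.uint8)
out = preprocess(dummy)
print(out.shape)  # torch.Size([3, 256, 256])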
--- /dev/null +++ b/gradio_utils/utils.py @@ -0,0 +1,407 @@ +import random +import collections +import gradio as gr +import numpy as np +import psutil +import torch +from PIL import ImageDraw, Image, ImageEnhance +from matplotlib import pyplot as plt +from mmcv import Config +from mmcv.runner import load_checkpoint +from mmpose.core import wrap_fp16_model +from mmpose.models import build_posenet +from torchvision import transforms +import matplotlib.patheffects as mpe +from demo import Resize_Pad +from EdgeCape.models import * + + +def process_img(support_image, global_state): + global_state['images']['image_orig'] = support_image + global_state['images']['image_kp'] = support_image + reset_kp(global_state) + return support_image, global_state + + +def adj_mx_from_edges(num_pts, skeleton, device='cuda', normalization_fix=True): + adj_mx = torch.empty(0, device=device) + batch_size = len(skeleton) + for b in range(batch_size): + edges = torch.tensor(skeleton[b]) + adj = torch.zeros(num_pts, num_pts, device=device) + adj[edges[:, 0], edges[:, 1]] = 1 + adj_mx = torch.concatenate((adj_mx, adj.unsqueeze(0)), dim=0) + trans_adj_mx = torch.transpose(adj_mx, 1, 2) + cond = (trans_adj_mx > adj_mx).float() + adj = adj_mx + trans_adj_mx * cond - adj_mx * cond + return adj + + +def plot_results(support_img, query_img, support_kp, support_w, query_kp, query_w, + skeleton=None, prediction=None, radius=6, in_color=None, + original_skeleton=None, img_alpha=0.6, target_keypoints=None): + h, w, c = support_img.shape + prediction = prediction[-1] * h + if isinstance(prediction, torch.Tensor): + prediction = prediction.cpu().numpy() + if isinstance(skeleton, list): + skeleton = adj_mx_from_edges(num_pts=100, skeleton=[skeleton]).cpu().numpy()[0] + original_skeleton = skeleton + support_img = (support_img - np.min(support_img)) / (np.max(support_img) - np.min(support_img)) + query_img = (query_img - np.min(query_img)) / (np.max(query_img) - np.min(query_img)) + error_mask = None + for id, (img, w, keypoint, adj) in enumerate(zip([support_img, support_img, query_img], + [support_w, support_w, query_w], + # [support_kp, query_kp])): + [support_kp, support_kp, prediction], + [original_skeleton, skeleton, skeleton])): + color = in_color + f, axes = plt.subplots() + plt.imshow(img, alpha=img_alpha) + + # On qeury image plot + if id == 2 and target_keypoints is not None: + error = np.linalg.norm(keypoint - target_keypoints, axis=-1) + error_mask = error > (256 * 0.05) + + for k in range(keypoint.shape[0]): + if w[k] > 0: + kp = keypoint[k, :2] + c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6) + if error_mask is not None and error_mask[k]: + c = (1, 1, 0, 0.75) + patch = plt.Circle(kp, + radius, + color=c, + path_effects=[mpe.withStroke(linewidth=8, foreground='black'), + mpe.withStroke(linewidth=4, foreground='white'), + mpe.withStroke(linewidth=2, foreground='black'), + ], + zorder=260) + axes.add_patch(patch) + axes.text(kp[0], kp[1], k, fontsize=10, color='black', ha="center", va="center", zorder=320, ) + else: + patch = plt.Circle(kp, + radius, + color=c, + path_effects=[mpe.withStroke(linewidth=2, foreground='black')], + zorder=200) + axes.add_patch(patch) + axes.text(kp[0], kp[1], k, fontsize=(radius + 4), color='white', ha="center", va="center", + zorder=300, + path_effects=[ + mpe.withStroke(linewidth=max(1, int((radius + 4) / 5)), foreground='black')]) + # axes.text(kp[0], kp[1], k) + plt.draw() + + if adj is not None: + # Make max value 6 + draw_skeleton = adj ** 1 + max_skel_val = 
np.max(draw_skeleton) + draw_skeleton = draw_skeleton / max_skel_val * 6 + for i in range(1, keypoint.shape[0]): + for j in range(0, i): + if w[i] > 0 and w[j] > 0 and original_skeleton[i][j] > 0: + if color is None: + num_colors = int((skeleton > 0.05).sum() / 2) + color = iter(plt.cm.rainbow(np.linspace(0, 1, num_colors + 1))) + c = next(color) + elif isinstance(color, str): + c = color + elif isinstance(color, collections.Iterable): + c = next(color) + else: + raise ValueError("Color must be a string or an iterable") + if w[i] > 0 and w[j] > 0 and skeleton[i][j] > 0: + width = draw_skeleton[i][j] + stroke_width = width + (width / 3) + patch = plt.Line2D([keypoint[i, 0], keypoint[j, 0]], + [keypoint[i, 1], keypoint[j, 1]], + linewidth=width, color=c, alpha=0.6, + path_effects=[mpe.withStroke(linewidth=stroke_width, foreground='black')], + zorder=1) + axes.add_artist(patch) + + plt.axis('off') # command for hiding the axis. + plt.subplots_adjust(0, 0, 1, 1, 0, 0) + return plt + + +def process(query_img, state, + cfg_path='configs/test/1shot_split1.py', + checkpoint_path='ckpt/1shot_split1.pth'): + cfg = Config.fromfile(cfg_path) + width, height, _ = state['original_support_image'].shape + kp_src_np = np.array(state['kp_src']).copy().astype(np.float32) + kp_src_np[:, 0] = kp_src_np[:, 0] / (width // 4) * cfg.model.encoder_config.img_size + kp_src_np[:, 1] = kp_src_np[:, 1] / (height // 4) * cfg.model.encoder_config.img_size + kp_src_np = np.flip(kp_src_np, 1).copy() + kp_src_tensor = torch.tensor(kp_src_np).float() + preprocess = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + Resize_Pad(cfg.model.encoder_config.img_size, + cfg.model.encoder_config.img_size)]) + + if len(state['skeleton']) == 0: + state['skeleton'] = [(0, 0)] + + support_img = preprocess(state['images']['image_orig']).flip(0)[None] + np_query = np.array(query_img)[:, :, ::-1].copy() + q_img = preprocess(np_query).flip(0)[None] + # Create heatmap from keypoints + genHeatMap = TopDownGenerateTargetFewShot() + data_cfg = cfg.data_cfg + data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size, + cfg.model.encoder_config.img_size]) + data_cfg['joint_weights'] = None + data_cfg['use_different_joint_weights'] = False + kp_src_3d = torch.cat( + (kp_src_tensor, torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1) + kp_src_3d_weight = torch.cat( + (torch.ones_like(kp_src_tensor), + torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1) + target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg, + kp_src_3d, + kp_src_3d_weight, + sigma=1) + target_s = torch.tensor(target_s).float()[None] + target_weight_s = torch.ones_like( + torch.tensor(target_weight_s).float()[None]) + + data = { + 'img_s': [support_img], + 'img_q': q_img, + 'target_s': [target_s], + 'target_weight_s': [target_weight_s], + 'target_q': None, + 'target_weight_q': None, + 'return_loss': False, + 'img_metas': [{'sample_skeleton': [state['skeleton']], + 'query_skeleton': state['skeleton'], + 'sample_joints_3d': [kp_src_3d], + 'query_joints_3d': kp_src_3d, + 'sample_center': [kp_src_tensor.mean(dim=0)], + 'query_center': kp_src_tensor.mean(dim=0), + 'sample_scale': [ + kp_src_tensor.max(dim=0)[0] - + kp_src_tensor.min(dim=0)[0]], + 'query_scale': kp_src_tensor.max(dim=0)[0] - + kp_src_tensor.min(dim=0)[0], + 'sample_rotation': [0], + 'query_rotation': 0, + 'sample_bbox_score': [1], + 'query_bbox_score': 1, + 'query_image_file': '', + 'sample_image_file': [''], + }] + } + # Load model + 
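# Model loading (descriptive notes, assuming standard mmpose/mmcv semantics):
# - build_posenet instantiates the EdgeCape architecture from cfg.model via
#   the mmpose model registry; weights are still random at this point.
# - wrap_fp16_model is only applied when the config defines an `fp16` section,
#   patching the model for mixed-precision inference.
# - load_checkpoint then loads the trained weights (map_location='cpu' keeps
#   the tensors on CPU until the model is moved), and eval() switches off
#   dropout and batch-norm updates before the forward pass.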
model = build_posenet(cfg.model) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + load_checkpoint(model, checkpoint_path, map_location='cpu') + model.eval() + with torch.no_grad(): + outputs = model(**data) + # visualize results + vis_s_weight = target_weight_s[0] + vis_s_image = support_img[0].detach().cpu().numpy().transpose(1, 2, 0) + vis_q_image = q_img[0].detach().cpu().numpy().transpose(1, 2, 0) + support_kp = kp_src_3d + out = plot_results(vis_s_image, + vis_q_image, + support_kp, + vis_s_weight, + None, + vis_s_weight, + outputs['skeleton'], + torch.tensor(outputs['points']).squeeze(), + original_skeleton=state['skeleton'], + img_alpha=1.0, + ) + return out, state + + +def update_examples(support_img, posed_support, query_img, state, r=0.015, width=0.02): + state['color_idx'] = 0 + state['original_support_image'] = np.array(support_img)[:, :, ::-1].copy() + support_img, posed_support, _ = set_query(support_img, state, example=True) + w, h = support_img.size + draw_pose = ImageDraw.Draw(support_img) + draw_limb = ImageDraw.Draw(posed_support) + r = int(r * w) + width = int(width * w) + for pixel in state['kp_src']: + leftUpPoint = (pixel[1] - r, pixel[0] - r) + rightDownPoint = (pixel[1] + r, pixel[0] + r) + twoPointList = [leftUpPoint, rightDownPoint] + draw_pose.ellipse(twoPointList, fill=(255, 0, 0, 255)) + draw_limb.ellipse(twoPointList, fill=(255, 0, 0, 255)) + for limb in state['skeleton']: + point_a = state['kp_src'][limb[0]][::-1] + point_b = state['kp_src'][limb[1]][::-1] + if state['color_idx'] < len(COLORS): + c = COLORS[state['color_idx']] + state['color_idx'] += 1 + else: + c = random.choices(range(256), k=3) + draw_limb.line([point_a, point_b], fill=tuple(c), width=width) + return support_img, posed_support, query_img, state + + +def get_select_coords(global_state, + evt: gr.SelectData + ): + """This function only support click for point selection + """ + xy = evt.index + global_state["points"].append(xy) + # point_idx = get_latest_points_pair(points) + # if point_idx is None: + # points[0] = {'start': xy, 'target': None} + # print(f'Click Image - Start - {xy}') + # elif points[point_idx].get('target', None) is None: + # points[point_idx]['target'] = xy + # print(f'Click Image - Target - {xy}') + # else: + # points[point_idx + 1] = {'start': xy, 'target': None} + # print(f'Click Image - Start - {xy}') + + image_raw = global_state['images']['image_kp'] + image_draw = update_image_draw( + image_raw, + xy, + global_state + ) + global_state['images']['image_kp'] = image_draw + return global_state, image_draw + +def get_closest_point_idx(pts_list, xy): + x, y = xy + closest_point = min(pts_list, key=lambda p: (p[0] - x) ** 2 + (p[1] - y) ** 2) + closest_point_index = pts_list.index(closest_point) + return closest_point_index + + +def reset_skeleton(global_state): + image = global_state["images"]["image_kp"] + global_state["images"]["image_skel"] = image + global_state["skeleton"] = [] + global_state["curr_type_point"] = "start" + global_state["prev_point"] = None + return image + + +def reset_kp(global_state): + image = global_state["images"]["image_orig"] + global_state["images"]["image_kp"] = image + global_state["images"]["image_skel"] = image + global_state["skeleton"] = [] + global_state["points"] = [] + global_state["curr_type_point"] = "start" + global_state["prev_point"] = None + return image, image + + +def select_skeleton(global_state, + evt: gr.SelectData, + ): + xy = evt.index + pts_list = global_state["points"] + 
closest_point_idx = get_closest_point_idx(pts_list, xy) + image_raw = global_state['images']['image_skel'] + if global_state["curr_type_point"] == "end": + prev_point_idx = global_state["prev_point_idx"] + prev_point = pts_list[prev_point_idx] + points = [prev_point, xy] + image_draw = draw_limbs_on_image(image_raw, + points + ) + global_state['images']['image_skel'] = image_draw + global_state['skeleton'].append([prev_point_idx, closest_point_idx]) + global_state["curr_type_point"] = "start" + global_state["prev_point_idx"] = None + else: + global_state["prev_point_idx"] = closest_point_idx + global_state["curr_type_point"] = "end" + return global_state, global_state['images']['image_skel'] + + +def reverse_point_pairs(points): + new_points = [] + for p in points: + new_points.append([p[1], p[0]]) + return new_points + + +def update_image_draw(image, points, global_state): + if len(global_state["points"]) < 2: + alpha = 0.5 + else: + alpha = 1.0 + image_draw = draw_points_on_image(image, points, alpha=alpha) + return image_draw + + +def print_memory_usage(): + # Print system memory usage + print(f"System memory usage: {psutil.virtual_memory().percent}%") + + # Print GPU memory usage + if torch.cuda.is_available(): + device = torch.device("cuda") + print(f"GPU memory usage: {torch.cuda.memory_allocated() / 1e9} GB") + print( + f"Max GPU memory usage: {torch.cuda.max_memory_allocated() / 1e9} GB") + device_properties = torch.cuda.get_device_properties(device) + available_memory = device_properties.total_memory - \ + torch.cuda.max_memory_allocated() + print(f"Available GPU memory: {available_memory / 1e9} GB") + else: + print("No GPU available") + +def draw_limbs_on_image(image, + points,): + color = tuple(random.choices(range(256), k=3)) + overlay_rgba = Image.new("RGBA", image.size, 0) + overlay_draw = ImageDraw.Draw(overlay_rgba) + p_start, p_target = points + if p_start is not None and p_target is not None: + p_draw = int(p_start[0]), int(p_start[1]) + t_draw = int(p_target[0]), int(p_target[1]) + overlay_draw.line( + (p_draw[0], p_draw[1], t_draw[0], t_draw[1]), + fill=color, + width=10, + ) + + return Image.alpha_composite(image.convert("RGBA"), + overlay_rgba).convert("RGB") + + +def draw_points_on_image(image, + points, + radius_scale=0.01, + alpha=1.): + if alpha < 1: + enhancer = ImageEnhance.Brightness(image) + image = enhancer.enhance(1.1) + overlay_rgba = Image.new("RGBA", image.size, 0) + overlay_draw = ImageDraw.Draw(overlay_rgba) + p_color = (255, 0, 0) + rad_draw = int(image.size[0] * radius_scale) + if points is not None: + p_draw = int(points[0]), int(points[1]) + overlay_draw.ellipse( + ( + p_draw[0] - rad_draw, + p_draw[1] - rad_draw, + p_draw[0] + rad_draw, + p_draw[1] + rad_draw, + ), + fill=p_color, + ) + + return Image.alpha_composite(image.convert("RGBA"), overlay_rgba).convert("RGB") diff --git a/rename_ckpt.py b/rename_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..68ee79b5fc5e55e9e4d02d8b4c3fba69da9725cf --- /dev/null +++ b/rename_ckpt.py @@ -0,0 +1,29 @@ +import os +import sys + +import torch + +def load_state_dicts(folder_path): + state_dicts = {} + for filename in os.listdir(folder_path): + if filename.endswith(".pth"): + print('Processing {}'.format(filename)) + file_path = os.path.join(folder_path, filename) + state_dict = torch.load(file_path) + new_state_dict = {"state_dict": {}, + "optimizer": state_dict['optimizer'], + "meta": state_dict['meta'], + } + for key in state_dict['state_dict'].keys(): + if 'spatial_pos_encoder' 
in key or 'skeleton_head.MLP' in key or 'skeleton_head.adj_output_mlp' in key: + continue + new_key = key.replace("keypoint_head.", "keypoint_head_module.").replace('bias_function_prior_weight', 'markov_structural_mlp') + new_state_dict['state_dict'][new_key] = state_dict['state_dict'][key] + new_file_path = os.path.join(folder_path, f'{filename}') + print(f'Saving to {new_file_path}') + torch.save(new_state_dict, new_file_path) + return state_dicts + +if __name__ == "__main__": + folder_path = sys.argv[1] + load_state_dicts(folder_path) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1879427fbda711317e9a10993320639d3104d72f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +json_tricks +numpy +opencv-python +pillow +xtcocotools +scipy diff --git a/run.py b/run.py new file mode 100644 index 0000000000000000000000000000000000000000..63011c2542113137879fcb39bab45b6d04be0ed3 --- /dev/null +++ b/run.py @@ -0,0 +1,108 @@ +import re +import subprocess +import os +import argparse +from mmcv import Config, DictAction + + +def init_parser(): + # Get config and work_dir from user + parser = argparse.ArgumentParser(description='Run the pipeline') + parser.add_argument('--config', help='config file', required=True) + parser.add_argument('--work_dir', help='work directory', required=True) + parser.add_argument('--best', action='store_true', help='work directory') + parser.add_argument('--supervision', type=str, default='decoder', help='adj supervision') + parser.add_argument('--ft_epochs', type=int, default=100, help='work directory') + parser.add_argument('--masking_ratio', type=float, default=0.5, help='work directory') + parser.add_argument('--lamda_masking', type=float, default=1.0, help='work directory') + args = parser.parse_args() + return args + + +def get_best_model(work_dir): + if os.path.exists(work_dir): + file_names = [filename for filename in os.listdir(work_dir) if filename.startswith("best_")] + if len(file_names) > 0: + file_name = file_names[0] + ckpt_path = f'{work_dir}/{file_name}' + else: + ckpt_path = f'{work_dir}/latest.pth' + return ckpt_path + + +def main(): + args = init_parser() + config = args.config + work_dir = args.work_dir + if args.best: + work_dir = f'{work_dir}_best_ckpt' + + if not os.path.exists(work_dir): + os.makedirs(work_dir) + subprocess.run(['cp', config, work_dir]) + + # -----------------------------BASE MODEL TRAINING-------------------------------- + base_workdir = f'{work_dir}/base' + cfg = Config.fromfile(args.config) + num_epochs = cfg.total_epochs + final_epoch_path = f'{base_workdir}/epoch_{num_epochs}.pth' + if not os.path.exists(final_epoch_path): + + print("Running Base Model Training") + subprocess.run(['python', 'train.py', '--config', config, '--work-dir', base_workdir]) + + # -----------------------------SKELETON MODEL TRAINING-------------------------------- + skeleton_work_dir = f'{work_dir}/base_skeleton' + skeleton_final_epoch_path = f'{skeleton_work_dir}/epoch_{args.ft_epochs}.pth' + + if args.best: + best_ckpt = get_best_model(base_workdir) + load_from = best_ckpt + else: + load_from = final_epoch_path + + new_cfg = Config.fromfile(args.config) + new_cfg.load_from = load_from + new_cfg.total_epochs = args.ft_epochs + new_cfg.model.freeze_backbone = True + new_cfg.model.keypoint_head.skeleton_head['learn_skeleton'] = True + new_cfg.model.keypoint_head.learn_skeleton = True + new_cfg.model.keypoint_head.masked_supervision = True + 
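# Skeleton-stage settings (descriptive notes): this second stage reloads the
# base checkpoint, freezes the backbone, and fine-tunes the skeleton predictor
# with masked supervision; the masking ratio and the weight of the skeleton
# (masking) loss are taken from --masking_ratio and --lamda_masking below.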
+    new_cfg.model.keypoint_head.masking_ratio = args.masking_ratio
+    new_cfg.model.keypoint_head.skeleton_loss_weight = args.lamda_masking
+    Config.dump(new_cfg, f'{work_dir}/skeleton_config.py')
+
+    if not os.path.exists(skeleton_final_epoch_path):
+        print("Running Skeleton Model Training")
+        subprocess.run(
+            ['python', 'train.py', '--config', f'{work_dir}/skeleton_config.py', '--work-dir', skeleton_work_dir])
+
+    # -----------------------------BIAS MODEL TRAINING--------------------------------
+    bias_work_dir = f'{work_dir}/base_skeleton_bias'
+    bias_final_epoch_path = f'{bias_work_dir}/epoch_{args.ft_epochs}.pth'
+    if args.best:
+        best_ckpt = get_best_model(skeleton_work_dir)
+        load_from = best_ckpt
+    else:
+        load_from = skeleton_final_epoch_path
+
+    new_cfg.load_from = load_from
+    new_cfg.model.keypoint_head.transformer.use_bias_attn_module = True
+    new_cfg.model.keypoint_head.transformer.attn_bias = True
+    new_cfg.model.keypoint_head.transformer.max_hops = 4
+    new_cfg.model.keypoint_head.model_freeze = 'skeleton'
+    Config.dump(new_cfg, f'{work_dir}/bias_config.py')
+
+    if not os.path.exists(bias_final_epoch_path):
+        print("Running Bias Model Training")
+        subprocess.run(
+            ['python', 'train.py', '--config', f'{work_dir}/bias_config.py', '--work-dir', bias_work_dir])
+
+    # -----------------------------EVALUATION--------------------------------
+    # Evaluate both the last and the best checkpoint of the final (bias) stage.
+    best_ckpt = get_best_model(bias_work_dir)
+    subprocess.run(['python', 'test.py', f'{work_dir}/bias_config.py', f'{bias_work_dir}/latest.pth'])
+    subprocess.run(['python', 'test.py', f'{work_dir}/bias_config.py', best_ckpt])
+
+
+if __name__ == '__main__':
+    main()
diff --git a/runai.py b/runai.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aa0f0594e34f926e12a11ba20312ec4aee710a7
--- /dev/null
+++ b/runai.py
@@ -0,0 +1,371 @@
+# Description: Script to run multiple experiments on runai
+import re
+import subprocess
+import os
+import argparse
+import time
+from prettytable import PrettyTable
+
+class Bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+
+def pretty_table(dct):
+    table = PrettyTable(['Job', 'Status'])
+    for c in sorted(dct.keys()):
+        table.add_row([c, dct[c]])
+    print(table)
+
+
+def init_parser():
+    parser = argparse.ArgumentParser(prog="RUNAI SCRIPT")
+    parser.add_argument('action', type=str, default=None, help='train, test, or run the full pipeline', choices=['train', 'test', 'run'])
+    parser.add_argument('--config_folder', type=str, default=None, help='Run all configs in folder')
+    parser.add_argument('--config', type=str, default=None, help='Run a single config file')
+    parser.add_argument('--name', type=str, default=None, help='Job name prefix')
+    parser.add_argument('--delete', action='store_true', help='Delete job')
+    parser.add_argument('--delete_fail', action='store_true', help='Delete failed jobs')
+    parser.add_argument('--delete_pending', action='store_true', help='Delete pending jobs')
+    parser.add_argument('--log', action='store_true', help='Show logs')
+    parser.add_argument('--delete_folder', action='store_true', help='Delete local workdir folder')
+    parser.add_argument('--permute_keypoints', action='store_true', help='Randomly permute keypoints at test time')
+    parser.add_argument('--dist', action='store_true', help='Distributed Training')
+    parser.add_argument('--find_best', action='store_true', help='Find best according to val')
+    parser.add_argument('--results', action='store_true', help='Show Results')
+    parser.add_argument('--no_base', action='store_true', help='Skip base models')
+    parser.add_argument('--show_cmd', action='store_true', help='Show command')
+    parser.add_argument('--large', action='store_true', help='Use large node')
+    parser.add_argument('--eval_three', action='store_true', help='Evaluate on 3 ckpts')
+    parser.add_argument('--pck', type=float, default=0.2, help='PCK threshold')
+    parser.add_argument('--auc', action='store_true', help='Evaluate AUC')
+    parser.add_argument('--mpck', action='store_true', help='Evaluate mPCK')
+    parser.add_argument('--check_logs', action='store_true', help='check runai logs instead of workdir')
+    parser.add_argument('--stat', action='store_true', help='check runai status')
+    parser.add_argument('--CVPR24', action='store_true', help='run on CVPR24 legacy folder')
+    parser.add_argument('--run_best_ckpt', action='store_true', help='pass --best to run.py (use best checkpoints between stages)')
+    parser.add_argument('--num_samples', type=int, default=32, help='number of test samples per GPU')
+    parser.add_argument('--ft_epochs', type=int, default=None, help='Num of FT epochs')
+    parser.add_argument('--masking', type=float, default=None, help='keypoint masking ratio passed to run.py')
+    parser.add_argument('--masking_lamda', type=float, default=None, help='masking loss weight passed to run.py')
+
+    return parser.parse_args()
+
+
+def check_status(job_name):
+    status = None
+    status_command = f'runai describe job {job_name}'
+    log = subprocess.run(status_command, shell=True, capture_output=True)
+    log = log.stdout.decode('utf-8')
+    pattern = r"Status:\s+(\w+)"
+    match = re.search(pattern, log)
+    if match:
+        status = match.group(1)
+    return status
+
+
+def train_is_running(job_name, status=('Running', 'Pending', 'Failed')):
+    run_status = check_status(job_name)
+    for stat in status:
+        if run_status == stat:
+            print(f'{Bcolors.FAIL}{job_name} is {stat}{Bcolors.ENDC}')
+            return True
+    return False
+
+
+def get_best_run(workdir_path, config, find_best):
+    file_name = None
+    ckpt_path = f'{workdir_path}/latest.pth'
+    if find_best == 'best':
+        local_path = f'work_dir_runai/{config.split(".")[0]}'
+        if os.path.exists(local_path):
+            file_names = [filename for filename in os.listdir(local_path) if filename.startswith("best_")]
+            if len(file_names) > 0:
+                file_name = file_names[0]
+                ckpt_path = f'{workdir_path}/{file_name}'
+    elif find_best == 'epoch_100':
+        local_path = f'work_dir_runai/{config.split(".")[0]}'
+        if os.path.exists(local_path):
+            file_name = 'epoch_100.pth'
+            ckpt_path = f'{workdir_path}/{file_name}'
+    return ckpt_path, file_name
+
+
+def check_runai_logs(job_name):
+    os_command = f'runai logs {job_name}'
+    status = subprocess.run(os_command, shell=True, capture_output=True, text=True)
+    status = status.stdout
+    return status
+
+
+def get_run_name(config, args, run):
+    run = run.replace('_', '-')
+    lwr_config = config.lower()
+    train_job_name = f'or-{lwr_config.split(".")[0].replace("_", "-")}'
+    if len(train_job_name) > 60:
+        renamed_config = name_abriviator(lwr_config)
+        train_job_name = f'or-{renamed_config.split(".")[0].replace("_", "-")}'[:60]
+    test_job_name = f'ev-{run}-{lwr_config.split(".")[0].replace("_", "-")}'
+    if len(test_job_name) > 40:
+        renamed_config = name_abriviator(lwr_config)
+        test_job_name = f'ev-{run}-{renamed_config.split(".")[0].replace("_", "-")}'[:58]
+    job_names = [train_job_name, test_job_name]
+    for i in range(len(job_names)):
+        if job_names[i].endswith('-'):
job_names[i] = job_names[i][:-1] + if args.name is not None: + job_names[i] = f'{args.name}-{job_names[i]}' + return job_names + + +def name_abriviator(name): + replace_dict = { + 'encoder': 'enc', + 'decoder': 'dec', + 'look_twice': 'lt', + 'cross_category': 'cc', + 'max_hops': 'hops', + 'lamda': 'l', + 'symmetric': 'sym', + 'auxiliary': 'aux', + 'batch_size': 'bs', + } + for key, value in replace_dict.items(): + name = name.replace(key, value) + return name + + +def check_skip(lwr_config, args): + if args.no_base and 'base' in lwr_config: + print(f'Skipping {Bcolors.OKCYAN}{lwr_config}{Bcolors.ENDC} - base model') + return True + # if not args.action == "train" and ('cross_category' in lwr_config or 'cross_cat' in lwr_config): + # print( + # f'Skipping {Bcolors.OKCYAN}{lwr_config}{Bcolors.ENDC} - test on cross_caregory, validation is the same as test') + # return True + return False + + +def print_results(results): + print(f'\n\n\n{Bcolors.OKGREEN}Scores{Bcolors.ENDC}') + config_length = max(15, max(len(key) for key in results.keys())) + config_column_width = config_length + 2 + print(f'| {"Config":<{config_column_width}} | {"Max Value":<11} | {"Latest Value":<13} | {"Best Value":<10} | {"Best Epoch":<10} |') + print(f'|{"-" * (config_column_width + 2)}|{"-" * 13}|{"-" * 15}|{"-" * 13}|{"-" * 11}|') + for config, val_dict in sorted(results.items()): + config_print = config.split('/')[-1].replace('.py', '') + other_results = val_dict.copy() + del other_results['latest'] + best_key = max(other_results, key=other_results.get) + latest_val = parse_result(val_dict['latest'], Bcolors.OKBLUE) + best_val = parse_result(val_dict[best_key], Bcolors.HEADER) + if val_dict['latest'] is None and val_dict[best_key] is None: + max_val = f'{Bcolors.WARNING}No results{Bcolors.ENDC}' + elif val_dict['latest'] is None: + max_val = best_val + elif val_dict[best_key] is None: + max_val = latest_val + else: + max_val = latest_val if val_dict['latest'] > val_dict[best_key] else best_val + # print as a table: config, max_val, latest_val, best_val + print(f'| {config_print:<{config_column_width}} | {max_val:<20} | {latest_val:<22} | {best_val:<20} |{best_key:<10} |') + + # print(f'{config_print}: {round(max_val * 100, 2)} ' + # f'Latest: {latest_val} {best_key}: {best_val}') + + +def parse_result(value, color): + if value is None: + return f'{Bcolors.WARNING}No results{Bcolors.ENDC}' + else: + return f'{color}{round(value * 100, 2)}{Bcolors.ENDC}' + + +def main(): + delay = 1 + args = init_parser() + scores = {} + stat = {} + best_run = None + if args.config_folder: + configs = [] + # list all py files in folder and subfolders + if '*' in args.config_folder: + config_folder = args.config_folder.strip("'") + parent_folder = os.path.relpath(os.path.join(config_folder, os.pardir)) + configs = [os.path.join(parent_folder, f) for f in os.listdir(parent_folder) if config_folder.split('*')[0] in os.path.join(parent_folder, f)] + else: + matched_folders = [args.config_folder] + for matched_folder in matched_folders: + for root, dirs, files in os.walk(matched_folder): + for file in files: + if file.endswith(".py"): + configs.append(os.path.join(root, file)) + else: + configs = [args.config] + print(f"{Bcolors.OKGREEN}Running {args.action} on {len(configs)} configs{Bcolors.ENDC}") + if args.action == "test" and not args.eval_three and not args.find_best: + runs = ['latest', 'best'] + elif args.eval_three: + runs = ['latest', 'best', 'epoch_100'] + elif args.find_best: + runs = ['best'] + else: + runs = ['latest'] + for 
config_path in sorted(configs): + for run in runs: + config = config_path.split("/")[-2] + "_" + config_path.split("/")[-1].replace('_config', '') + if args.CVPR24: + workdir_path = f'/storage/orhir/capeformer_legacy/{config.split(".")[0]}' + else: + workdir_path = f'/storage/orhir/capeformer/{config.split(".")[0]}' + local_workdir_path = f'work_dir_runai/{config.split(".")[0]}' + lwr_config = config.lower() + if check_skip(lwr_config, args): + continue + if args.action == "train" or args.action == "run": + gpu = 4 if args.dist else 1 + resource = f' -g {gpu}' + else: + # resource = f' --gpu-memory 4G --cpu 2 --memory 4G' + resource = f' -g 0.3' + if args.large: + resource += f' --node-pools blaufer' + if args.stat: + train_job_name, job_name = get_run_name(config, args, run) + if args.action == "train" or args.action == "run": + job_name = train_job_name + print(f'{"-" * 30 + Bcolors.OKCYAN + job_name + Bcolors.ENDC + "-" * 30}') + status = check_status(job_name) + stat[job_name] = status + continue + # else: + # resource += f' --node-pools faculty' + if args.action == "train": + job_name, _ = get_run_name(config, args, run) + if args.dist: + py_command = (f'python -m torch.distributed.launch ' + f'--nproc_per_node={gpu} --master_port=29500 ' + f'train.py --gpus {gpu} --config {config_path} ' + f'--work-dir {workdir_path} --autoscale-lr ' + f'--launcher pytorch') + else: + py_command = (f'python train.py ' + f' --config {config_path}' + f' --work-dir {workdir_path}') + elif args.action == "run": + job_name, _ = get_run_name(config, args, run) + if args.masking is not None: + masking_precent = int(args.masking * 100) + workdir_path = f'/storage/orhir/capeformer/CVPR25_ablation_mask_{masking_precent}' + job_name += f'-{masking_precent}' + if args.masking_lamda: + workdir_path = f'/storage/orhir/capeformer/CVPR25_ablation_mask_lamda_{int(args.masking_lamda)}' + job_name += f'-lamda-{int(args.masking_lamda)}' + py_command = (f'python run.py ' + f' --config {config_path}' + f' --work_dir {workdir_path}') + if args.run_best_ckpt: + py_command += ' --best' + job_name += '-best' + if args.ft_epochs: + py_command += f' --ft_epochs {args.ft_epochs}' + if args.masking: + py_command += f' --masking_ratio {args.masking}' + if args.masking_lamda: + py_command += f' --lamda_masking {args.masking_lamda}' + else: + train_job_name, job_name = get_run_name(config, args, run) + ckpt_path, best_run = get_best_run(workdir_path, config, run) + py_command = f'python test.py {config_path} {ckpt_path} --num_samples {args.num_samples}' + if args.permute_keypoints: + py_command += ' --permute_keypoints' + job_name = (job_name + '-permute-keypoints')[:60] + print(f'{"-" * 30 + Bcolors.OKCYAN + job_name + Bcolors.ENDC + "-" * 30}') + if args.log: + os_command = f'runai logs {job_name}' + elif args.delete_fail: + if not train_is_running(job_name, ['Failed', 'Error']): + print("Job not failed, skipping...") + continue + os_command = f'runai delete job {job_name}' + elif args.delete_pending: + if not train_is_running(job_name, ['Pending']): + continue + os_command = f'runai delete job {job_name}' + elif args.delete: + os_command = f'runai delete job {job_name}' + elif args.results: + if args.check_logs: + # First check if the job is completed + status = check_runai_logs(job_name) + else: + if args.action == 'run': + log_file = os.path.join(f'work_dir_runai/{config.split(".")[0]}', + 'base_skeleton_bias', + 'testing_log.txt') + else: + log_file = os.path.join(f'work_dir_runai/{config.split(".")[0]}', + 'testing_log.txt') 
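+                    # testing_log.txt is appended by test.py after every evaluation; extract the metric block for this run's checkpoint.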
+ if os.path.exists(log_file): + with open(log_file, 'r') as f: + status = f.read() + # Parse config: + match = re.search(f'\*\*[\s\S]*?checkpoint:\s*.*?{run}[\s\S]*?(AUC:[\s\S]*?mPCK:\s*[\d.]+)', status) + if match: + status = match.group(1) + else: + status = '' + delay = 0 + else: + status = check_runai_logs(job_name) + if args.auc and 'AUC' in status: + score = float(status.split('AUC: ')[1].split('\n')[0]) + elif args.mpck and 'mPCK' in status: + score = float(status.split('mPCK: ')[1].split('\n')[0]) + elif f'PCK@{args.pck}:' in status: + score = float(status.split(f'PCK@{args.pck}: ')[1].split('\n')[0]) + else: + score = None + best_run = best_run.replace('best_PCK_', '').strip('.pth') if best_run else "No Best" + key = 'latest' if run == 'latest' else best_run + if config in scores: + scores[config][key] = score + else: + scores[config] = {key: score} + continue + else: + if args.action == 'test': + if not train_is_running(train_job_name, ['Completed', 'Succeeded']): + print('Train not completed') + continue + os_command = (f'runai submit --pvc=storage:/storage -i orhir/capeformer ' + f' --name {job_name} {resource} --large-shm ' + f' --command -- {py_command}') + # print(os_command) + if args.show_cmd: + print(f'{Bcolors.OKGREEN}{os_command}{Bcolors.ENDC}') + subprocess.run(os_command, shell=True) + if args.delete_folder: + if os.path.exists(local_workdir_path): + subprocess.run(f'rm -rf {local_workdir_path}', shell=True) + else: + subprocess.run(f'echo {Bcolors.WARNING}No workdir folder to delete{Bcolors.ENDC}', shell=True) + # print(f'\n{"-" * 150}') + time.sleep(delay) + if args.results: + print_results(scores) + if args.stat: + pretty_table(stat) + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..e641c8d2f5f61b7b69b1554fe494fc6ea26b481f --- /dev/null +++ b/setup.cfg @@ -0,0 +1,22 @@ +[bdist_wheel] +universal=1 + +[aliases] +test=pytest + +[tool:pytest] +addopts=tests/ + +[yapf] +based_on_style = pep8 +blank_line_before_nested_class_or_def = true +split_before_expression_after_opening_paren = true + +[isort] +line_length = 79 +multi_line_output = 0 +known_standard_library = pkg_resources,setuptools +known_first_party = mmpose +known_third_party = cv2,json_tricks,mmcv,mmdet,munkres,numpy,xtcocotools,torch +no_lines_before = STDLIB,LOCALFOLDER +default_section = THIRDPARTY diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f7a432c91de128e7f6fcfd0f7ef109bbffbc47 --- /dev/null +++ b/setup.py @@ -0,0 +1,111 @@ +import os +import subprocess +import time +from setuptools import find_packages, setup + + +def readme(): + with open('README.md', encoding='utf-8') as f: + content = f.read() + return content + + +version_file = 'EdgeCape/version.py' + + +def get_git_hash(): + + def _minimal_ext_cmd(cmd): + # construct minimal environment + env = {} + for k in ['SYSTEMROOT', 'PATH', 'HOME']: + v = os.environ.get(k) + if v is not None: + env[k] = v + # LANGUAGE is used on win32 + env['LANGUAGE'] = 'C' + env['LANG'] = 'C' + env['LC_ALL'] = 'C' + out = subprocess.Popen( + cmd, stdout=subprocess.PIPE, env=env).communicate()[0] + return out + + try: + out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) + sha = out.strip().decode('ascii') + except OSError: + sha = 'unknown' + + return sha + + +def get_hash(): + if os.path.exists('.git'): + sha = get_git_hash()[:7] + elif os.path.exists(version_file): + try: + from EdgeCape.version import __version__ + 
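+            # version.py stores "<short_version>+<git sha>" (see write_version_py below), so take the part after '+'.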
sha = __version__.split('+')[-1] + except ImportError: + raise ImportError('Unable to get git version') + else: + sha = 'unknown' + + return sha + + +def write_version_py(): + content = """# GENERATED VERSION FILE +# TIME: {} +__version__ = '{}' +short_version = '{}' +version_info = ({}) +""" + sha = get_hash() + with open('EdgeCape/VERSION', 'r') as f: + SHORT_VERSION = f.read().strip() + VERSION_INFO = ', '.join(SHORT_VERSION.split('.')) + VERSION = SHORT_VERSION + '+' + sha + + version_file_str = content.format(time.asctime(), VERSION, SHORT_VERSION, + VERSION_INFO) + with open(version_file, 'w') as f: + f.write(version_file_str) + + +def get_version(): + with open(version_file, 'r') as f: + exec(compile(f.read(), version_file, 'exec')) + return locals()['__version__'] + + +def get_requirements(filename='requirements.txt'): + here = os.path.dirname(os.path.realpath(__file__)) + with open(os.path.join(here, filename), 'r') as f: + requires = [line.replace('\n', '') for line in f.readlines()] + return requires + + +if __name__ == '__main__': + write_version_py() + setup( + name='edgecape', + version=get_version(), + description='A template for pytorch projects.', + long_description=readme(), + packages=find_packages(exclude=('configs', 'tools', 'demo')), + package_data={'edgecape.ops': ['*/*.so']}, + classifiers=[ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + ], + license='Apache License 2.0', + setup_requires=['pytest-runner', 'cython', 'numpy'], + tests_require=['pytest', 'xdoctest'], + install_requires=get_requirements(), + zip_safe=False) diff --git a/test.py b/test.py new file mode 100644 index 0000000000000000000000000000000000000000..9c1afcdb1f1321d0b9bbd6877af424acfc3977b0 --- /dev/null +++ b/test.py @@ -0,0 +1,164 @@ +import argparse +import os +import os.path as osp +import random +import uuid + +import mmcv +import numpy as np +import torch +from mmcv import Config, DictAction +from mmcv.cnn import fuse_conv_bn +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import get_dist_info, init_dist, load_checkpoint +from EdgeCape import * # noqa +from EdgeCape.datasets import build_dataset +from EdgeCape.apis.test import multi_gpu_test, single_gpu_test +from mmpose.core import wrap_fp16_model +from mmpose.datasets import build_dataloader +from mmpose.models import build_posenet + + +def parse_args(): + parser = argparse.ArgumentParser(description='mmpose test model') + parser.add_argument('config', default=None, help='test config file path') + parser.add_argument('checkpoint', default=None, help='checkpoint file') + parser.add_argument('--out', help='output result file') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase the inference speed') + parser.add_argument( + '--eval', + default=None, + nargs='+', + help='evaluation metric, which depends on the dataset,' + ' e.g., "mAP" for MSCOCO') + parser.add_argument( + '--permute_keypoints', + action='store_true', + help='whether to randomly permute keypoints') + parser.add_argument( + '--gpu_collect', + action='store_true', + help='whether to use gpu to collect results') + parser.add_argument('--tmpdir', help='tmp dir for writing some results') + parser.add_argument( + 
'--cfg-options', + nargs='+', + action=DictAction, + default={}, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. For example, ' + "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'") + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--num_samples', type=int, default=1) + + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def merge_configs(cfg1, cfg2): + # Merge cfg2 into cfg1 + # Overwrite cfg1 if repeated, ignore if value is None. + cfg1 = {} if cfg1 is None else cfg1.copy() + cfg2 = {} if cfg2 is None else cfg2 + for k, v in cfg2.items(): + if v: + cfg1[k] = v + return cfg1 + + +def main(): + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + uuid.UUID(int=0) + + args = parse_args() + + cfg = Config.fromfile(args.config) + + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + # cfg.model.pretrained = None + cfg.data.test.test_mode = True + + args.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + mmcv.mkdir_or_exist(osp.abspath(args.work_dir)) + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # build the dataloader + dataset = build_dataset(cfg.data.test, dict(test_mode=True)) + dataloader_setting = dict( + samples_per_gpu=args.num_samples, + workers_per_gpu=cfg.data.get('workers_per_gpu', 12), + dist=distributed, + shuffle=False, + drop_last=False) + dataloader_setting = dict(dataloader_setting, + **cfg.data.get('test_dataloader', {})) + data_loader = build_dataloader(dataset, **dataloader_setting) + # build the model and load checkpoint + model = build_posenet(cfg.model) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + load_checkpoint(model, args.checkpoint, map_location='cpu') + + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + + if not distributed: + model = MMDataParallel(model, device_ids=[0]) + outputs = single_gpu_test(model, data_loader) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) + + rank, _ = get_dist_info() + eval_config = cfg.get('evaluation', {}) + eval_config = merge_configs(eval_config, dict(metric=args.eval)) + + if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + mmcv.dump(outputs, args.out) + + results = dataset.evaluate(outputs, **eval_config) + print('\n') + for k, v in sorted(results.items()): + print(f'{k}: {v}') + + # save testing log + test_log_path = osp.dirname(args.checkpoint) + test_log_file = "testing_log.txt" + test_log = osp.join(test_log_path, test_log_file) + with open(test_log, 'a') as f: + f.write("** config_file: " + args.config + "\t checkpoint: " + args.checkpoint + "\t \n") + for k, v in sorted(results.items()): + f.write(f'\t {k}: {v}'+'\n') + f.write("********************************************************************\n") + +if __name__ == '__main__': + main() diff 
--git a/train.py b/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f64fff608890a9b8e0915ecb8f22c90e17eb7c9
--- /dev/null
+++ b/train.py
@@ -0,0 +1,201 @@
+import argparse
+import copy
+import os
+import os.path as osp
+import time
+import random
+import uuid
+import numpy as np
+import mmcv
+import torch
+from mmcv import Config, DictAction
+from mmcv.runner import get_dist_info, init_dist, set_random_seed
+from mmcv.utils import get_git_hash
+
+from EdgeCape import * # noqa
+from EdgeCape.apis import train_model
+from EdgeCape.datasets import build_dataset
+
+from mmpose import __version__
+from mmpose.models import build_posenet
+from mmpose.utils import collect_env, get_root_logger
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train a pose model')
+    parser.add_argument('--config', default=None, help='train config file path')
+    parser.add_argument('--work-dir', default=None, help='the dir to save logs and models')
+    parser.add_argument(
+        '--resume-from', help='the checkpoint file to resume from')
+    parser.add_argument('--load-from', help='the checkpoint file to load from')
+    parser.add_argument(
+        '--auto-resume', type=bool, default=True, help='automatically detect the latest checkpoint in the work dir and resume from it.')
+    parser.add_argument(
+        '--no-validate',
+        action='store_true',
+        help='whether not to evaluate the checkpoint during training')
+    group_gpus = parser.add_mutually_exclusive_group()
+    group_gpus.add_argument(
+        '--gpus',
+        type=int,
+        help='number of gpus to use '
+        '(only applicable to non-distributed training)')
+    group_gpus.add_argument(
+        '--gpu-ids',
+        type=int,
+        nargs='+',
+        help='ids of gpus to use '
+        '(only applicable to non-distributed training)')
+    parser.add_argument('--seed', type=int, default=None, help='random seed')
+    parser.add_argument(
+        '--deterministic',
+        action='store_true',
+        help='whether to set deterministic options for CUDNN backend.')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        default={},
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. For example, '
+        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
+    parser.add_argument(
+        '--launcher',
+        choices=['none', 'pytorch', 'slurm', 'mpi'],
+        default='none',
+        help='job launcher')
+    parser.add_argument('--local-rank', type=int, default=0)
+    parser.add_argument(
+        '--autoscale-lr',
+        action='store_true',
+        help='automatically scale lr with the number of gpus')
+    parser.add_argument(
+        '--show',
+        action='store_true',
+        help='whether to display the prediction results in a window.')
+    args = parser.parse_args()
+    if 'LOCAL_RANK' not in os.environ:
+        os.environ['LOCAL_RANK'] = str(args.local_rank)
+
+    return args
+
+
+def main():
+    args = parse_args()
+    # torch.autograd.set_detect_anomaly(True)
+    cfg = Config.fromfile(args.config)
+
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    torch.backends.cudnn.benchmark = True
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+
+    # work_dir is determined in this priority: CLI
+    # > segment in file > filename
+    if args.work_dir is not None:
+        # update configs according to CLI args if args.work_dir is not None
+        cfg.work_dir = args.work_dir
+    elif cfg.get('work_dir', None) is None:
+        # use config filename as default work_dir if cfg.work_dir is None
+        cfg.work_dir = osp.join('./work_dirs',
+                                osp.splitext(osp.basename(args.config))[0])
+    # auto resume
+    if args.auto_resume:
+        # use cfg.work_dir so auto-resume also works when --work-dir is not given
+        checkpoint = os.path.join(cfg.work_dir, 'latest.pth')
+        if os.path.exists(checkpoint):
+            cfg.resume_from = checkpoint
+    if args.load_from is not None:
+        cfg.load_from = args.load_from
+    if args.resume_from is not None:
+        cfg.resume_from = args.resume_from
+    if args.gpu_ids is not None:
+        cfg.gpu_ids = args.gpu_ids
+    else:
+        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
+
+    if args.autoscale_lr:
+        # apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
+        cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
+
+    # init distributed env first, since logger depends on the dist info.
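+    # (The NCCL env vars set in the distributed branch relax the blocking-wait timeout and disable P2P transfers,
+    # presumably to avoid hangs on some multi-GPU nodes.)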
+ if args.launcher == 'none': + distributed = False + else: + distributed = True + os.environ['NCCL_BLOCKING_WAIT'] = '0' # not to enforce timeout + os.environ['NCCL_P2P_DISABLE'] = '1' + init_dist(args.launcher, **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + args.seed = 1 + args.deterministic = True + if args.seed is not None: + logger.info(f'Set random seed to {args.seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + uuid.UUID(int=args.seed) + + model = build_posenet(cfg.model) + train_datasets = [build_dataset(cfg.data.train)] + + # if len(cfg.workflow) == 2: + # val_dataset = copy.deepcopy(cfg.data.val) + # val_dataset.pipeline = cfg.data.train.pipeline + # datasets.append(build_dataset(val_dataset)) + + val_dataset = copy.deepcopy(cfg.data.val) + val_dataset = build_dataset(val_dataset, dict(test_mode=True)) + + if cfg.checkpoint_config is not None: + # save mmpose version, config file content + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmpose_version=__version__ + get_git_hash(digits=7), + config=cfg.pretty_text, + ) + train_model( + model, + train_datasets, + val_dataset, + cfg, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main()
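+
+# Typical single-GPU usage (this is how run.py invokes the script):
+#   python train.py --config <config.py> --work-dir <work_dir>
+# runai.py launches the distributed variant via torch.distributed.launch with --launcher pytorch.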