Commit 20239f9
Parent(s): b507f8e
add initial files
- .gitignore +40 -0
- files/images/Laysan_Albatross_0050_870.jpg +0 -0
- layers/__init__.py +2 -0
- layers/independent_mlp.py +69 -0
- layers/transformer_layers.py +54 -0
- load_model.py +226 -0
- models/__init__.py +4 -0
- models/individual_landmark_convnext.py +110 -0
- models/individual_landmark_resnet.py +141 -0
- models/individual_landmark_vit.py +366 -0
- models/vit_baseline.py +239 -0
- requirements.txt +5 -1
- utils/__init__.py +6 -0
- utils/data_utils/__init__.py +5 -0
- utils/data_utils/class_balanced_distributed_sampler.py +100 -0
- utils/data_utils/class_balanced_sampler.py +31 -0
- utils/data_utils/dataset_utils.py +161 -0
- utils/data_utils/reversible_affine_transform.py +82 -0
- utils/data_utils/transform_utils.py +118 -0
- utils/get_landmark_coordinates.py +41 -0
- utils/misc_utils.py +135 -0
- utils/visualize_att_maps.py +135 -0
.gitignore
ADDED
@@ -0,0 +1,40 @@
# editor settings
.idea
.vscode
_darcs

# compilation and distribution
__pycache__
_ext
*.pyc
*.pyd
*.so
*.dll
*.egg-info/
build/
dist/
wheels/

# pytorch/python/numpy formats
*.pth
*.pkl
*.npy
*.ts
*.pt

# ipython/jupyter notebooks
*.ipynb
**/.ipynb_checkpoints/

# Editor temporaries
*.swn
*.swo
*.swp
*~

# Results temporary
*.png
*.txt
*.tsv
wandb/
exps/
files/images/Laysan_Albatross_0050_870.jpg
ADDED
layers/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .transformer_layers import *
from .independent_mlp import *
layers/independent_mlp.py
ADDED
@@ -0,0 +1,69 @@
# This file contains the implementation of the IndependentMLPs class
import torch


class IndependentMLPs(torch.nn.Module):
    """
    This class implements the MLP used for classification with the option to use an additional independent MLP layer
    """

    def __init__(self, part_dim, latent_dim, bias=False, num_lin_layers=1, act_layer=True, out_dim=None, stack_dim=-1):
        """

        :param part_dim: Number of parts
        :param latent_dim: Latent dimension
        :param bias: Whether to use bias
        :param num_lin_layers: Number of linear layers
        :param act_layer: Whether to use activation layer
        :param out_dim: Output dimension (default: None)
        :param stack_dim: Dimension to stack the outputs (default: -1)
        """

        super().__init__()

        self.bias = bias
        self.latent_dim = latent_dim
        if out_dim is None:
            out_dim = latent_dim
        self.out_dim = out_dim
        self.part_dim = part_dim
        self.stack_dim = stack_dim

        layer_stack = torch.nn.ModuleList()
        for i in range(part_dim):
            layer_stack.append(torch.nn.Sequential())
            for j in range(num_lin_layers):
                layer_stack[i].add_module(f"fc_{j}", torch.nn.Linear(latent_dim, self.out_dim, bias=bias))
                if act_layer:
                    layer_stack[i].add_module(f"act_{j}", torch.nn.GELU())
        self.feature_layers = layer_stack
        self.reset_weights()

    def __repr__(self):
        return f"IndependentMLPs(part_dim={self.part_dim}, latent_dim={self.latent_dim}), bias={self.bias}"

    def reset_weights(self):
        """ Initialize weights with a truncated normal distribution """
        for layer in self.feature_layers:
            for m in layer.modules():
                if isinstance(m, torch.nn.Linear):
                    # Initialize weights with a truncated normal distribution
                    torch.nn.init.trunc_normal_(m.weight, std=0.02)
                    if m.bias is not None:
                        torch.nn.init.zeros_(m.bias)

    def forward(self, x):
        """ Input X has the dimensions batch x latent_dim x part_dim """

        outputs = []
        for i, layer in enumerate(self.feature_layers):
            if self.stack_dim == -1:
                in_ = x[..., i]
            else:
                in_ = x[:, i, ...]  # Select feature i
            out = layer(in_)  # Apply MLP to feature i
            outputs.append(out)

        x = torch.stack(outputs, dim=self.stack_dim)  # Stack the outputs

        return x
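A minimal usage sketch of IndependentMLPs (not part of the commit; the sizes below are illustrative and assume the default stack_dim=-1, i.e. input laid out as batch x latent_dim x part_dim):

import torch
from layers.independent_mlp import IndependentMLPs

# Hypothetical sizes: 4 images, 768-dim features, 9 part slots (8 landmarks + 1 extra slot)
mlp = IndependentMLPs(part_dim=9, latent_dim=768, bias=True, num_lin_layers=1, act_layer=False)
feats = torch.randn(4, 768, 9)   # batch x latent_dim x part_dim
out = mlp(feats)                 # each part slot goes through its own Linear(768, 768)
print(out.shape)                 # torch.Size([4, 768, 9]); out_dim defaults to latent_dim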
layers/transformer_layers.py
ADDED
@@ -0,0 +1,54 @@
# Attention Block with option to return the mean of k over heads from attention

import torch
from timm.models.vision_transformer import Attention, Block
import torch.nn.functional as F
from typing import Tuple


class AttentionWQKVReturn(Attention):
    """
    Modifications:
    - Return the qkv tensors from the attention
    """

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.fused_attn:
            x = F.scaled_dot_product_attention(
                q, k, v,
                dropout_p=self.attn_drop.p if self.training else 0.,
            )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x, torch.stack((q, k, v), dim=0)


class BlockWQKVReturn(Block):
    """
    Modifications:
    - Use AttentionWQKVReturn instead of Attention
    - Return the qkv tensors from the attention
    """

    def forward(self, x: torch.Tensor, return_qkv: bool = False) -> torch.Tensor | Tuple[torch.Tensor, torch.Tensor]:
        # Note: this is copied from timm.models.vision_transformer.Block with modifications.
        x_attn, qkv = self.attn(self.norm1(x))
        x = x + self.drop_path1(self.ls1(x_attn))
        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
        if return_qkv:
            return x, qkv
        else:
            return x
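The two classes above are drop-in replacements for timm's Block and Attention; the ViT models later in this commit swap them in by reassigning __class__ (see convert_blocks_and_attention). A minimal sketch of that pattern, assuming a recent timm version; the backbone name is chosen here only for illustration:

import torch
from timm.models import create_model
from timm.models.vision_transformer import Attention, Block
from layers.transformer_layers import AttentionWQKVReturn, BlockWQKVReturn

vit = create_model("vit_base_patch16_224", pretrained=False)  # illustrative backbone choice
for module in vit.modules():
    if isinstance(module, Block):
        module.__class__ = BlockWQKVReturn
    elif isinstance(module, Attention):
        module.__class__ = AttentionWQKVReturn

tokens = vit.patch_embed(torch.randn(1, 3, 224, 224))
tokens = vit._pos_embed(tokens)
out, qkv = vit.blocks[0](tokens, return_qkv=True)
print(qkv.shape)  # [3, B, num_heads, N, head_dim], stacked as (q, k, v)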
load_model.py
ADDED
@@ -0,0 +1,226 @@
import copy
import os
from pathlib import Path

import torch
from timm.models import create_model
from torchvision.models import get_model

from models import pdiscoformer_vit_bb, pdisconet_vit_bb, pdisconet_resnet_torchvision_bb
from models.individual_landmark_resnet import IndividualLandmarkResNet
from models.individual_landmark_convnext import IndividualLandmarkConvNext
from models.individual_landmark_vit import IndividualLandmarkViT
from utils import load_state_dict_pdisco


def load_model_arch(args, num_cls):
    """
    Function to load the model
    :param args: Arguments from the command line
    :param num_cls: Number of classes in the dataset
    :return:
    """
    if 'resnet' in args.model_arch:
        num_layers_split = [int(s) for s in args.model_arch if s.isdigit()]
        num_layers = int(''.join(map(str, num_layers_split)))
        if num_layers >= 100:
            timm_model_arch = args.model_arch + ".a1h_in1k"
        else:
            timm_model_arch = args.model_arch + ".a1_in1k"

    if "resnet" in args.model_arch and args.use_torchvision_resnet_model:
        weights = "DEFAULT" if args.pretrained_start_weights else None
        base_model = get_model(args.model_arch, weights=weights)
    elif "resnet" in args.model_arch and not args.use_torchvision_resnet_model:
        if args.eval_only:
            base_model = create_model(
                timm_model_arch,
                pretrained=args.pretrained_start_weights,
                num_classes=num_cls,
                output_stride=args.output_stride,
            )
        else:
            base_model = create_model(
                timm_model_arch,
                pretrained=args.pretrained_start_weights,
                drop_path_rate=args.drop_path,
                num_classes=num_cls,
                output_stride=args.output_stride,
            )

    elif "convnext" in args.model_arch:
        if args.eval_only:
            base_model = create_model(
                args.model_arch,
                pretrained=args.pretrained_start_weights,
                num_classes=num_cls,
                output_stride=args.output_stride,
            )
        else:
            base_model = create_model(
                args.model_arch,
                pretrained=args.pretrained_start_weights,
                drop_path_rate=args.drop_path,
                num_classes=num_cls,
                output_stride=args.output_stride,
            )
    elif "vit" in args.model_arch:
        if args.eval_only:
            base_model = create_model(
                args.model_arch,
                pretrained=args.pretrained_start_weights,
                img_size=args.image_size,
            )
        else:
            base_model = create_model(
                args.model_arch,
                pretrained=args.pretrained_start_weights,
                drop_path_rate=args.drop_path,
                img_size=args.image_size,
            )
        vit_patch_size = base_model.patch_embed.proj.kernel_size[0]
        if args.image_size % vit_patch_size != 0:
            raise ValueError(f"Image size {args.image_size} must be divisible by patch size {vit_patch_size}")
    else:
        raise ValueError('Model not supported.')

    return base_model


def init_pdisco_model(base_model, args, num_cls):
    """
    Function to initialize the model
    :param base_model: Base model
    :param args: Arguments from the command line
    :param num_cls: Number of classes in the dataset
    :return:
    """
    # Initialize the network
    if 'convnext' in args.model_arch:
        sl_channels = base_model.stages[-1].downsample[-1].in_channels
        fl_channels = base_model.head.in_features
        model = IndividualLandmarkConvNext(base_model, args.num_parts, num_classes=num_cls,
                                           sl_channels=sl_channels, fl_channels=fl_channels,
                                           part_dropout=args.part_dropout, modulation_type=args.modulation_type,
                                           gumbel_softmax=args.gumbel_softmax,
                                           gumbel_softmax_temperature=args.gumbel_softmax_temperature,
                                           gumbel_softmax_hard=args.gumbel_softmax_hard,
                                           modulation_orth=args.modulation_orth, classifier_type=args.classifier_type,
                                           noise_variance=args.noise_variance)
    elif 'resnet' in args.model_arch:
        sl_channels = base_model.layer4[0].conv1.in_channels
        fl_channels = base_model.fc.in_features
        model = IndividualLandmarkResNet(base_model, args.num_parts, num_classes=num_cls,
                                         sl_channels=sl_channels, fl_channels=fl_channels,
                                         use_torchvision_model=args.use_torchvision_resnet_model,
                                         part_dropout=args.part_dropout, modulation_type=args.modulation_type,
                                         gumbel_softmax=args.gumbel_softmax,
                                         gumbel_softmax_temperature=args.gumbel_softmax_temperature,
                                         gumbel_softmax_hard=args.gumbel_softmax_hard,
                                         modulation_orth=args.modulation_orth, classifier_type=args.classifier_type,
                                         noise_variance=args.noise_variance)
    elif 'vit' in args.model_arch:
        model = IndividualLandmarkViT(base_model, num_landmarks=args.num_parts, num_classes=num_cls,
                                      part_dropout=args.part_dropout,
                                      modulation_type=args.modulation_type, gumbel_softmax=args.gumbel_softmax,
                                      gumbel_softmax_temperature=args.gumbel_softmax_temperature,
                                      gumbel_softmax_hard=args.gumbel_softmax_hard,
                                      modulation_orth=args.modulation_orth, classifier_type=args.classifier_type,
                                      noise_variance=args.noise_variance)
    else:
        raise ValueError('Model not supported.')

    return model


def load_model_pdisco(args, num_cls):
    """
    Function to load the model
    :param args: Arguments from the command line
    :param num_cls: Number of classes in the dataset
    :return:
    """
    base_model = load_model_arch(args, num_cls)
    model = init_pdisco_model(base_model, args, num_cls)

    return model


def pdiscoformer_vit(pretrained=True, model_dataset="cub", k=8, model_url="", img_size=224, num_cls=200):
    """
    Function to load the PDiscoFormer model with ViT backbone
    :param pretrained: Boolean flag to load the pretrained weights
    :param model_dataset: Dataset for which the model is trained
    :param k: Number of unsupervised landmarks the model is trained on
    :param model_url: URL to load the model weights from
    :param img_size: Image size
    :param num_cls: Number of classes in the dataset
    :return: PDiscoFormer model with ViT backbone
    """
    model = pdiscoformer_vit_bb("vit_base_patch14_reg4_dinov2.lvd142m", num_cls=num_cls, k=k, img_size=img_size)
    if pretrained:
        hub_dir = torch.hub.get_dir()
        model_dir = os.path.join(hub_dir, "pdiscoformer_checkpoints", f"pdiscoformer_{model_dataset}")

        Path(model_dir).mkdir(parents=True, exist_ok=True)
        url_path = model_url + str(k) + "_parts_snapshot_best.pt"
        snapshot_data = torch.hub.load_state_dict_from_url(url_path, model_dir=model_dir, map_location='cpu')
        if 'model_state' in snapshot_data:
            _, state_dict = load_state_dict_pdisco(snapshot_data)
        else:
            state_dict = copy.deepcopy(snapshot_data)
        model.load_state_dict(state_dict, strict=True)
    return model


def pdisconet_vit(pretrained=True, model_dataset="nabirds", k=8, model_url="", img_size=224, num_cls=555):
    """
    Function to load the PDiscoNet model with ViT backbone
    :param pretrained: Boolean flag to load the pretrained weights
    :param model_dataset: Dataset for which the model is trained
    :param k: Number of unsupervised landmarks the model is trained on
    :param model_url: URL to load the model weights from
    :param img_size: Image size
    :param num_cls: Number of classes in the dataset
    :return: PDiscoNet model with ViT backbone
    """
    model = pdisconet_vit_bb("vit_base_patch14_reg4_dinov2.lvd142m", num_cls=num_cls, k=k, img_size=img_size)
    if pretrained:
        hub_dir = torch.hub.get_dir()
        model_dir = os.path.join(hub_dir, "pdiscoformer_checkpoints", f"pdisconet_{model_dataset}")

        Path(model_dir).mkdir(parents=True, exist_ok=True)
        url_path = model_url + str(k) + "_parts_snapshot_best.pt"
        snapshot_data = torch.hub.load_state_dict_from_url(url_path, model_dir=model_dir, map_location='cpu')
        if 'model_state' in snapshot_data:
            _, state_dict = load_state_dict_pdisco(snapshot_data)
        else:
            state_dict = copy.deepcopy(snapshot_data)
        model.load_state_dict(state_dict, strict=True)
    return model


def pdisconet_resnet101(pretrained=True, model_dataset="nabirds", k=8, model_url="", num_cls=555):
    """
    Function to load the PDiscoNet model with ResNet-101 backbone
    :param pretrained: Boolean flag to load the pretrained weights
    :param model_dataset: Dataset for which the model is trained
    :param k: Number of unsupervised landmarks the model is trained on
    :param model_url: URL to load the model weights from
    :param num_cls: Number of classes in the dataset
    :return: PDiscoNet model with ResNet-101 backbone
    """
    model = pdisconet_resnet_torchvision_bb("resnet101", num_cls=num_cls, k=k)
    if pretrained:
        hub_dir = torch.hub.get_dir()
        model_dir = os.path.join(hub_dir, "pdiscoformer_checkpoints", f"pdisconet_{model_dataset}")

        Path(model_dir).mkdir(parents=True, exist_ok=True)
        url_path = model_url + str(k) + "_parts_snapshot_best.pt"
        snapshot_data = torch.hub.load_state_dict_from_url(url_path, model_dir=model_dir, map_location='cpu')
        if 'model_state' in snapshot_data:
            _, state_dict = load_state_dict_pdisco(snapshot_data)
        else:
            state_dict = copy.deepcopy(snapshot_data)
        model.load_state_dict(state_dict, strict=True)
    return model
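A usage sketch for the hub-style entry points above (not part of the commit). With pretrained=True these functions expect the caller, e.g. a hubconf, to pass the checkpoint URL prefix via model_url, so only the architecture is built here:

import torch
from load_model import pdiscoformer_vit

model = pdiscoformer_vit(pretrained=False, model_dataset="cub", k=8, img_size=518, num_cls=200)
model.eval()
with torch.no_grad():
    feats, maps, scores, dist = model(torch.randn(1, 3, 518, 518))
print(maps.shape)    # [1, k + 1, 37, 37] part assignment maps (518 / 14 = 37 patches per side)
print(scores.shape)  # [1, num_cls, k] per-part class scores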
models/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .individual_landmark_resnet import *
from .individual_landmark_convnext import *
from .vit_baseline import *
from .individual_landmark_vit import *
models/individual_landmark_convnext.py
ADDED
@@ -0,0 +1,110 @@
import torch
from torch import Tensor
from torch.nn import Parameter
from typing import Any
from layers.independent_mlp import IndependentMLPs


# Baseline model, a modified convnext with reduced downsampling for a spatially larger feature tensor in the last layer
class IndividualLandmarkConvNext(torch.nn.Module):
    def __init__(self, init_model: torch.nn.Module, num_landmarks: int = 8,
                 num_classes: int = 200, sl_channels: int = 1024, fl_channels: int = 2048, part_dropout: float = 0.3,
                 modulation_type: str = "original", modulation_orth: bool = False, gumbel_softmax: bool = False,
                 gumbel_softmax_temperature: float = 1.0, gumbel_softmax_hard: bool = False,
                 classifier_type: str = "linear", noise_variance: float = 0.0) -> None:
        super().__init__()

        self.num_landmarks = num_landmarks
        self.num_classes = num_classes
        self.noise_variance = noise_variance
        self.stem = init_model.stem
        self.stages = init_model.stages
        self.feature_dim = sl_channels + fl_channels
        self.fc_landmarks = torch.nn.Conv2d(self.feature_dim, num_landmarks + 1, 1, bias=False)
        self.gumbel_softmax = gumbel_softmax
        self.gumbel_softmax_temperature = gumbel_softmax_temperature
        self.gumbel_softmax_hard = gumbel_softmax_hard
        self.modulation_type = modulation_type
        if modulation_type == "layer_norm":
            self.modulation = torch.nn.LayerNorm([self.feature_dim, self.num_landmarks + 1])
        elif modulation_type == "original":
            self.modulation = torch.nn.Parameter(torch.ones(1, self.feature_dim, self.num_landmarks + 1))
        elif modulation_type == "parallel_mlp":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=True, bias=True)
        elif modulation_type == "parallel_mlp_no_bias":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=True, bias=False)
        elif modulation_type == "parallel_mlp_no_act":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=False, bias=True)
        elif modulation_type == "parallel_mlp_no_act_no_bias":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=False, bias=False)
        elif modulation_type == "none":
            self.modulation = torch.nn.Identity()
        else:
            raise ValueError("modulation_type not implemented")
        self.modulation_orth = modulation_orth
        self.dropout_full_landmarks = torch.nn.Dropout1d(part_dropout)
        self.classifier_type = classifier_type
        if classifier_type == "independent_mlp":
            self.fc_class_landmarks = IndependentMLPs(part_dim=self.num_landmarks, latent_dim=self.feature_dim,
                                                      num_lin_layers=1, act_layer=False, out_dim=num_classes,
                                                      bias=False, stack_dim=1)
        elif classifier_type == "linear":
            self.fc_class_landmarks = torch.nn.Linear(in_features=self.feature_dim, out_features=num_classes,
                                                      bias=False)
        else:
            raise ValueError("classifier_type not implemented")

    def forward(self, x: Tensor) -> tuple[Any, Any, Any, Any, Parameter, int | Any]:
        # Pretrained ConvNeXt part of the model
        x = self.stem(x)
        x = self.stages[0](x)
        x = self.stages[1](x)
        l3 = self.stages[2](x)
        x = self.stages[3](l3)
        x = torch.nn.functional.interpolate(x, size=(l3.shape[-2], l3.shape[-1]), mode='bilinear', align_corners=False)
        x = torch.cat((x, l3), dim=1)

        # Compute per landmark attention maps
        # (b - a)^2 = b^2 - 2ab + a^2, b = ConvNeXt feature maps, a = convolution kernel
        batch_size = x.shape[0]
        ab = self.fc_landmarks(x)
        b_sq = x.pow(2).sum(1, keepdim=True)
        b_sq = b_sq.expand(-1, self.num_landmarks + 1, -1, -1).contiguous()
        a_sq = self.fc_landmarks.weight.pow(2).sum(1).unsqueeze(1).expand(-1, batch_size, x.shape[-2],
                                                                          x.shape[-1]).contiguous()
        a_sq = a_sq.permute(1, 0, 2, 3).contiguous()

        dist = b_sq - 2 * ab + a_sq
        maps = -dist

        # Softmax so that the attention maps for each pixel add up to 1
        if self.gumbel_softmax:
            maps = torch.nn.functional.gumbel_softmax(maps, dim=1, tau=self.gumbel_softmax_temperature,
                                                      hard=self.gumbel_softmax_hard)  # [B, num_landmarks + 1, H, W]
        else:
            maps = torch.nn.functional.softmax(maps, dim=1)  # [B, num_landmarks + 1, H, W]

        # Use maps to get weighted average features per landmark
        all_features = (maps.unsqueeze(1) * x.unsqueeze(2)).mean(-1).mean(-1).contiguous()
        if self.noise_variance > 0.0:
            all_features += torch.randn_like(all_features,
                                             device=all_features.device) * x.std().detach() * self.noise_variance

        # Modulate the features
        if self.modulation_type == "original":
            all_features_mod = all_features * self.modulation
        else:
            all_features_mod = self.modulation(all_features)

        # Classification based on the landmark features
        scores = self.fc_class_landmarks(
            self.dropout_full_landmarks(all_features_mod[..., :-1].permute(0, 2, 1).contiguous())
        ).permute(0, 2, 1).contiguous()
        if self.modulation_orth:
            return all_features_mod, maps, scores, dist
        else:
            return all_features, maps, scores, dist
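The landmark assignment above relies on the expansion (b - a)^2 = b^2 - 2ab + a^2, which lets a 1x1 convolution produce the cross term between each pixel feature and each landmark prototype. A small self-contained check of that identity with toy shapes (not from the commit):

import torch

feat_dim, n_proto, h, w = 16, 5, 5, 5
x = torch.randn(2, feat_dim, h, w)
fc = torch.nn.Conv2d(feat_dim, n_proto, 1, bias=False)

# Distance computed the way forward() does it: b^2 - 2ab + a^2
ab = fc(x)
b_sq = x.pow(2).sum(1, keepdim=True).expand(-1, n_proto, -1, -1)
a_sq = fc.weight.pow(2).sum(1).reshape(1, n_proto, 1, 1)
dist_fast = b_sq - 2 * ab + a_sq

# Explicit squared distances between every pixel feature and every prototype
protos = fc.weight.reshape(n_proto, feat_dim)
pixels = x.permute(0, 2, 3, 1).reshape(2, h * w, feat_dim)
dist_ref = torch.cdist(pixels, protos.unsqueeze(0).expand(2, -1, -1)).pow(2)
dist_ref = dist_ref.permute(0, 2, 1).reshape(2, n_proto, h, w)

print(torch.allclose(dist_fast, dist_ref, atol=1e-4))  # True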
models/individual_landmark_resnet.py
ADDED
@@ -0,0 +1,141 @@
# Modified from https://github.com/robertdvdk/part_detection/blob/main/nets.py
import torch
from torch import Tensor
from timm.models import create_model
from torchvision.models import get_model
from torch.nn import Parameter
from typing import Any
from layers.independent_mlp import IndependentMLPs


# Baseline model, a modified ResNet with reduced downsampling for a spatially larger feature tensor in the last layer
class IndividualLandmarkResNet(torch.nn.Module):
    def __init__(self, init_model: torch.nn.Module, num_landmarks: int = 8,
                 num_classes: int = 200, sl_channels: int = 1024, fl_channels: int = 2048,
                 use_torchvision_model: bool = False, part_dropout: float = 0.3,
                 modulation_type: str = "original", modulation_orth: bool = False, gumbel_softmax: bool = False,
                 gumbel_softmax_temperature: float = 1.0, gumbel_softmax_hard: bool = False,
                 classifier_type: str = "linear", noise_variance: float = 0.0) -> None:
        super().__init__()

        self.num_landmarks = num_landmarks
        self.num_classes = num_classes
        self.noise_variance = noise_variance
        self.conv1 = init_model.conv1
        self.bn1 = init_model.bn1
        if use_torchvision_model:
            self.act1 = init_model.relu
        else:
            self.act1 = init_model.act1
        self.maxpool = init_model.maxpool
        self.layer1 = init_model.layer1
        self.layer2 = init_model.layer2
        self.layer3 = init_model.layer3
        self.layer4 = init_model.layer4
        self.feature_dim = sl_channels + fl_channels
        self.fc_landmarks = torch.nn.Conv2d(self.feature_dim, num_landmarks + 1, 1, bias=False)
        self.gumbel_softmax = gumbel_softmax
        self.gumbel_softmax_temperature = gumbel_softmax_temperature
        self.gumbel_softmax_hard = gumbel_softmax_hard
        self.modulation_type = modulation_type
        if modulation_type == "layer_norm":
            self.modulation = torch.nn.LayerNorm([self.feature_dim, self.num_landmarks + 1])
        elif modulation_type == "original":
            self.modulation = torch.nn.Parameter(torch.ones(1, self.feature_dim, self.num_landmarks + 1))
        elif modulation_type == "parallel_mlp":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=True, bias=True)
        elif modulation_type == "parallel_mlp_no_bias":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=True, bias=False)
        elif modulation_type == "parallel_mlp_no_act":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=False, bias=True)
        elif modulation_type == "parallel_mlp_no_act_no_bias":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=False, bias=False)
        elif modulation_type == "none":
            self.modulation = torch.nn.Identity()
        else:
            raise ValueError("modulation_type not implemented")

        self.modulation_orth = modulation_orth

        self.dropout_full_landmarks = torch.nn.Dropout1d(part_dropout)
        self.classifier_type = classifier_type
        if classifier_type == "independent_mlp":
            self.fc_class_landmarks = IndependentMLPs(part_dim=self.num_landmarks, latent_dim=self.feature_dim,
                                                      num_lin_layers=1, act_layer=False, out_dim=num_classes,
                                                      bias=False, stack_dim=1)
        elif classifier_type == "linear":
            self.fc_class_landmarks = torch.nn.Linear(in_features=self.feature_dim, out_features=num_classes,
                                                      bias=False)
        else:
            raise ValueError("classifier_type not implemented")

    def forward(self, x: Tensor) -> tuple[Any, Any, Any, Any, Parameter, int | Any]:
        # Pretrained ResNet part of the model
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        l3 = self.layer3(x)
        x = self.layer4(l3)
        x = torch.nn.functional.interpolate(x, size=(l3.shape[-2], l3.shape[-1]), mode='bilinear', align_corners=False)
        x = torch.cat((x, l3), dim=1)

        # Compute per landmark attention maps
        # (b - a)^2 = b^2 - 2ab + a^2, b = feature maps resnet, a = convolution kernel
        batch_size = x.shape[0]

        ab = self.fc_landmarks(x)
        b_sq = x.pow(2).sum(1, keepdim=True)
        b_sq = b_sq.expand(-1, self.num_landmarks + 1, -1, -1).contiguous()
        a_sq = self.fc_landmarks.weight.pow(2).sum(1).unsqueeze(1).expand(-1, batch_size, x.shape[-2],
                                                                          x.shape[-1]).contiguous()
        a_sq = a_sq.permute(1, 0, 2, 3).contiguous()

        dist = b_sq - 2 * ab + a_sq
        maps = -dist

        # Softmax so that the attention maps for each pixel add up to 1
        if self.gumbel_softmax:
            maps = torch.nn.functional.gumbel_softmax(maps, dim=1, tau=self.gumbel_softmax_temperature,
                                                      hard=self.gumbel_softmax_hard)  # [B, num_landmarks + 1, H, W]
        else:
            maps = torch.nn.functional.softmax(maps, dim=1)  # [B, num_landmarks + 1, H, W]

        # Use maps to get weighted average features per landmark
        all_features = (maps.unsqueeze(1) * x.unsqueeze(2)).mean(-1).mean(-1).contiguous()
        if self.noise_variance > 0.0:
            all_features += torch.randn_like(all_features,
                                             device=all_features.device) * x.std().detach() * self.noise_variance

        # Modulate the features
        if self.modulation_type == "original":
            all_features_mod = all_features * self.modulation
        else:
            all_features_mod = self.modulation(all_features)

        # Classification based on the landmark features
        scores = self.fc_class_landmarks(
            self.dropout_full_landmarks(all_features_mod[..., :-1].permute(0, 2, 1).contiguous())
        ).permute(0, 2, 1).contiguous()
        if self.modulation_orth:
            return all_features_mod, maps, scores, dist
        else:
            return all_features, maps, scores, dist


def pdisconet_resnet_torchvision_bb(backbone, num_cls=200, k=8, **kwargs):
    base_model = get_model(backbone)
    return IndividualLandmarkResNet(base_model, num_landmarks=k, num_classes=num_cls,
                                    use_torchvision_model=True,  # torchvision ResNets expose .relu, not .act1
                                    modulation_type="original")


def pdisconet_resnet_timm_bb(backbone, num_cls=200, k=8, output_stride=32, **kwargs):
    base_model = create_model(backbone, pretrained=True, output_stride=output_stride)
    return IndividualLandmarkResNet(base_model, num_landmarks=k, num_classes=num_cls,
                                    modulation_type="original")
models/individual_landmark_vit.py
ADDED
@@ -0,0 +1,366 @@
# Composition of the VisionTransformer class from timm with extra features: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
from pathlib import Path
import os
import torch
import torch.nn as nn
from torch import Tensor
from typing import Any, Union, Sequence, Optional, Dict

from huggingface_hub import PyTorchModelHubMixin, hf_hub_download

from timm.models import create_model
from timm.models.vision_transformer import Block, Attention
from utils.misc_utils import compute_attention

from layers.transformer_layers import BlockWQKVReturn, AttentionWQKVReturn
from layers.independent_mlp import IndependentMLPs

SAFETENSORS_SINGLE_FILE = "model.safetensors"


class IndividualLandmarkViT(torch.nn.Module, PyTorchModelHubMixin,
                            pipeline_tag='image-classification',
                            repo_url='https://github.com/ananthu-aniraj/pdiscoformer'):

    def __init__(self, init_model: torch.nn.Module, num_landmarks: int = 8, num_classes: int = 200,
                 part_dropout: float = 0.3, return_transformer_qkv: bool = False,
                 modulation_type: str = "original", gumbel_softmax: bool = False,
                 gumbel_softmax_temperature: float = 1.0, gumbel_softmax_hard: bool = False,
                 modulation_orth: bool = False, classifier_type: str = "linear", noise_variance: float = 0.0) -> None:
        super().__init__()
        self.num_landmarks = num_landmarks
        self.num_classes = num_classes
        self.noise_variance = noise_variance
        self.num_prefix_tokens = init_model.num_prefix_tokens
        self.num_reg_tokens = init_model.num_reg_tokens
        self.has_class_token = init_model.has_class_token
        self.no_embed_class = init_model.no_embed_class
        self.cls_token = init_model.cls_token
        self.reg_token = init_model.reg_token

        self.feature_dim = init_model.embed_dim
        self.patch_embed = init_model.patch_embed
        self.pos_embed = init_model.pos_embed
        self.pos_drop = init_model.pos_drop
        self.norm_pre = init_model.norm_pre
        self.blocks = init_model.blocks
        self.norm = init_model.norm
        self.return_transformer_qkv = return_transformer_qkv
        self.h_fmap = int(self.patch_embed.img_size[0] // self.patch_embed.patch_size[0])
        self.w_fmap = int(self.patch_embed.img_size[1] // self.patch_embed.patch_size[1])

        self.unflatten = nn.Unflatten(1, (self.h_fmap, self.w_fmap))
        self.fc_landmarks = torch.nn.Conv2d(self.feature_dim, num_landmarks + 1, 1, bias=False)
        self.gumbel_softmax = gumbel_softmax
        self.gumbel_softmax_temperature = gumbel_softmax_temperature
        self.gumbel_softmax_hard = gumbel_softmax_hard
        self.modulation_type = modulation_type
        if modulation_type == "layer_norm":
            self.modulation = torch.nn.LayerNorm([self.feature_dim, self.num_landmarks + 1])
        elif modulation_type == "original":
            self.modulation = torch.nn.Parameter(torch.ones(1, self.feature_dim, self.num_landmarks + 1))
        elif modulation_type == "parallel_mlp":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=True, bias=True)
        elif modulation_type == "parallel_mlp_no_bias":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=True, bias=False)
        elif modulation_type == "parallel_mlp_no_act":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=False, bias=True)
        elif modulation_type == "parallel_mlp_no_act_no_bias":
            self.modulation = IndependentMLPs(part_dim=self.num_landmarks + 1, latent_dim=self.feature_dim,
                                              num_lin_layers=1, act_layer=False, bias=False)
        elif modulation_type == "none":
            self.modulation = torch.nn.Identity()
        else:
            raise ValueError("modulation_type not implemented")
        self.modulation_orth = modulation_orth
        self.dropout_full_landmarks = torch.nn.Dropout1d(part_dropout)
        self.classifier_type = classifier_type
        if classifier_type == "independent_mlp":
            self.fc_class_landmarks = IndependentMLPs(part_dim=self.num_landmarks, latent_dim=self.feature_dim,
                                                      num_lin_layers=1, act_layer=False, out_dim=num_classes,
                                                      bias=False, stack_dim=1)
        elif classifier_type == "linear":
            self.fc_class_landmarks = torch.nn.Linear(in_features=self.feature_dim, out_features=num_classes,
                                                      bias=False)
        else:
            raise ValueError("classifier_type not implemented")
        self.convert_blocks_and_attention()
        self._init_weights()

    def _init_weights_head(self):
        # Initialize weights with a truncated normal distribution
        if self.classifier_type == "independent_mlp":
            self.fc_class_landmarks.reset_weights()
        else:
            torch.nn.init.trunc_normal_(self.fc_class_landmarks.weight, std=0.02)
            if self.fc_class_landmarks.bias is not None:
                torch.nn.init.zeros_(self.fc_class_landmarks.bias)

    def _init_weights(self):
        self._init_weights_head()

    def convert_blocks_and_attention(self):
        for module in self.modules():
            if isinstance(module, Block):
                module.__class__ = BlockWQKVReturn
            elif isinstance(module, Attention):
                module.__class__ = AttentionWQKVReturn

    def _pos_embed(self, x: torch.Tensor) -> torch.Tensor:
        pos_embed = self.pos_embed
        to_cat = []
        if self.cls_token is not None:
            to_cat.append(self.cls_token.expand(x.shape[0], -1, -1))
        if self.reg_token is not None:
            to_cat.append(self.reg_token.expand(x.shape[0], -1, -1))
        if self.no_embed_class:
            # deit-3, updated JAX (big vision)
            # position embedding does not overlap with class token, add then concat
            x = x + pos_embed
            if to_cat:
                x = torch.cat(to_cat + [x], dim=1)
        else:
            # original timm, JAX, and deit vit impl
            # pos_embed has entry for class token, concat then add
            if to_cat:
                x = torch.cat(to_cat + [x], dim=1)
            x = x + pos_embed
        return self.pos_drop(x)

    def forward(self, x: Tensor) -> tuple[Any, Any, Any, Any, int | Any] | tuple[Any, Any, Any, Any, int | Any]:

        x = self.patch_embed(x)

        # Position Embedding
        x = self._pos_embed(x)

        # Forward pass through transformer
        x = self.norm_pre(x)

        x = self.blocks(x)
        x = self.norm(x)

        # Compute per landmark attention maps
        # (b - a)^2 = b^2 - 2ab + a^2, b = feature maps vit, a = convolution kernel
        batch_size = x.shape[0]
        x = x[:, self.num_prefix_tokens:, :]  # [B, num_patch_tokens, embed_dim]
        x = self.unflatten(x)  # [B, H, W, embed_dim]
        x = x.permute(0, 3, 1, 2).contiguous()  # [B, embed_dim, H, W]
        ab = self.fc_landmarks(x)  # [B, num_landmarks + 1, H, W]
        b_sq = x.pow(2).sum(1, keepdim=True)
        b_sq = b_sq.expand(-1, self.num_landmarks + 1, -1, -1).contiguous()
        a_sq = self.fc_landmarks.weight.pow(2).sum(1, keepdim=True).expand(-1, batch_size, x.shape[-2],
                                                                           x.shape[-1]).contiguous()
        a_sq = a_sq.permute(1, 0, 2, 3).contiguous()

        dist = b_sq - 2 * ab + a_sq
        maps = -dist

        # Softmax so that the attention maps for each pixel add up to 1
        if self.gumbel_softmax:
            maps = torch.nn.functional.gumbel_softmax(maps, dim=1, tau=self.gumbel_softmax_temperature,
                                                      hard=self.gumbel_softmax_hard)  # [B, num_landmarks + 1, H, W]
        else:
            maps = torch.nn.functional.softmax(maps, dim=1)  # [B, num_landmarks + 1, H, W]

        # Use maps to get weighted average features per landmark
        all_features = (maps.unsqueeze(1) * x.unsqueeze(2)).contiguous()
        if self.noise_variance > 0.0:
            all_features += torch.randn_like(all_features,
                                             device=all_features.device) * x.std().detach() * self.noise_variance

        all_features = all_features.mean(-1).mean(-1).contiguous()  # [B, embed_dim, num_landmarks + 1]

        # Modulate the features
        if self.modulation_type == "original":
            all_features_mod = all_features * self.modulation  # [B, embed_dim, num_landmarks + 1]
        else:
            all_features_mod = self.modulation(all_features)  # [B, embed_dim, num_landmarks + 1]

        # Classification based on the landmark features
        scores = self.fc_class_landmarks(
            self.dropout_full_landmarks(all_features_mod[..., :-1].permute(0, 2, 1).contiguous())
        ).permute(0, 2, 1).contiguous()
        if self.modulation_orth:
            return all_features_mod, maps, scores, dist
        else:
            return all_features, maps, scores, dist

    def get_specific_intermediate_layer(
            self,
            x: torch.Tensor,
            n: int = 1,
            return_qkv: bool = False,
            return_att_weights: bool = False,
    ):
        num_blocks = len(self.blocks)
        attn_weights = []
        if n >= num_blocks:
            raise ValueError(f"n must be less than {num_blocks}")

        # forward pass
        x = self.patch_embed(x)
        x = self._pos_embed(x)
        x = self.norm_pre(x)

        if n == -1:
            if return_qkv:
                raise ValueError("take_indice cannot be -1 if return_transformer_qkv is True")
            else:
                return x

        for i, blk in enumerate(self.blocks):
            if self.return_transformer_qkv:
                x, qkv = blk(x, return_qkv=True)

                if return_att_weights:
                    attn_weight, _ = compute_attention(qkv)
                    attn_weights.append(attn_weight.detach())
            else:
                x = blk(x)
            if i == n:
                output = x.clone()
                if self.return_transformer_qkv and return_qkv:
                    qkv_output = qkv.clone()
                break
        if self.return_transformer_qkv and return_qkv and return_att_weights:
            return output, qkv_output, attn_weights
        elif self.return_transformer_qkv and return_qkv:
            return output, qkv_output
        elif self.return_transformer_qkv and return_att_weights:
            return output, attn_weights
        else:
            return output

    def _intermediate_layers(
            self,
            x: torch.Tensor,
            n: Union[int, Sequence] = 1,
    ):
        outputs, num_blocks = [], len(self.blocks)
        if self.return_transformer_qkv:
            qkv_outputs = []
        take_indices = set(range(num_blocks - n, num_blocks) if isinstance(n, int) else n)

        # forward pass
        x = self.patch_embed(x)
        x = self._pos_embed(x)
        x = self.norm_pre(x)

        for i, blk in enumerate(self.blocks):
            if self.return_transformer_qkv:
                x, qkv = blk(x, return_qkv=True)
            else:
                x = blk(x)
            if i in take_indices:
                outputs.append(x)
                if self.return_transformer_qkv:
                    qkv_outputs.append(qkv)
        if self.return_transformer_qkv:
            return outputs, qkv_outputs
        else:
            return outputs

    def get_intermediate_layers(
            self,
            x: torch.Tensor,
            n: Union[int, Sequence] = 1,
            reshape: bool = False,
            return_prefix_tokens: bool = False,
            norm: bool = False,
    ) -> tuple[tuple, Any]:
        """ Intermediate layer accessor (NOTE: This is a WIP experiment).
        Inspired by DINO / DINOv2 interface
        """
        # take last n blocks if n is an int, if n is a sequence, select by matching indices
        if self.return_transformer_qkv:
            outputs, qkv = self._intermediate_layers(x, n)
        else:
            outputs = self._intermediate_layers(x, n)

        if norm:
            outputs = [self.norm(out) for out in outputs]
        prefix_tokens = [out[:, 0:self.num_prefix_tokens] for out in outputs]
        outputs = [out[:, self.num_prefix_tokens:] for out in outputs]

        if reshape:
            grid_size = self.patch_embed.grid_size
            outputs = [
                out.reshape(x.shape[0], grid_size[0], grid_size[1], -1).permute(0, 3, 1, 2).contiguous()
                for out in outputs
            ]

        if return_prefix_tokens:
            return_out = tuple(zip(outputs, prefix_tokens))
        else:
            return_out = tuple(outputs)

        if self.return_transformer_qkv:
            return return_out, qkv
        else:
            return return_out

    @classmethod
    def _from_pretrained(
            cls,
            *,
            model_id: str,
            revision: Optional[str],
            cache_dir: Optional[Union[str, Path]],
            force_download: bool,
            proxies: Optional[Dict],
            resume_download: Optional[bool],
            local_files_only: bool,
            token: Union[str, bool, None],
            map_location: str = "cpu",
            strict: bool = False,
            timm_backbone: str = "hf_hub:timm/vit_base_patch14_reg4_dinov2.lvd142m",
            input_size: int = 518,
            **model_kwargs):
        base_model = create_model(timm_backbone, pretrained=False, img_size=input_size)
        model = cls(base_model, **model_kwargs)
        if os.path.isdir(model_id):
            print("Loading weights from local directory")
            model_file = os.path.join(model_id, SAFETENSORS_SINGLE_FILE)
            return cls._load_as_safetensor(model, model_file, map_location, strict)
        else:
            model_file = hf_hub_download(
                repo_id=model_id,
                filename=SAFETENSORS_SINGLE_FILE,
                revision=revision,
                cache_dir=cache_dir,
                force_download=force_download,
                proxies=proxies,
                resume_download=resume_download,
                token=token,
                local_files_only=local_files_only,
            )
            return cls._load_as_safetensor(model, model_file, map_location, strict)


def pdiscoformer_vit_bb(backbone, img_size=224, num_cls=200, k=8, **kwargs):
    base_model = create_model(
        backbone,
        pretrained=False,
        img_size=img_size,
    )

    model = IndividualLandmarkViT(base_model, num_landmarks=k, num_classes=num_cls,
                                  modulation_type="layer_norm", gumbel_softmax=True,
                                  modulation_orth=True)
    return model


def pdisconet_vit_bb(backbone, img_size=224, num_cls=200, k=8, **kwargs):
    base_model = create_model(
        backbone,
        pretrained=False,
        img_size=img_size,
    )

    model = IndividualLandmarkViT(base_model, num_landmarks=k, num_classes=num_cls,
                                  modulation_type="original")
    return model
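Downstream, the maps output behaves as a soft assignment of each patch to one of k parts plus one extra slot that the classifier drops (consistent with a background slot). A short sketch, not from the commit, of turning the outputs into a hard part segmentation and a single class prediction; the pooling over parts is an assumed readout:

import torch
from models.individual_landmark_vit import pdiscoformer_vit_bb

model = pdiscoformer_vit_bb("vit_base_patch14_reg4_dinov2.lvd142m", img_size=518, num_cls=200, k=8).eval()
with torch.no_grad():
    _, maps, scores, _ = model(torch.randn(1, 3, 518, 518))

part_seg = maps.argmax(dim=1)       # [1, 37, 37] hard part index per patch location
class_logits = scores.mean(dim=-1)  # [1, 200] average the per-part class scores (assumed readout)
print(part_seg.shape, class_logits.shape)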
models/vit_baseline.py
ADDED
@@ -0,0 +1,239 @@
# Composition of the VisionTransformer class from timm with extra features: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
import torch
import torch.nn as nn
from typing import Tuple, Union, Sequence, Any
from timm.layers import trunc_normal_
from timm.models.vision_transformer import Block, Attention
from layers.transformer_layers import BlockWQKVReturn, AttentionWQKVReturn

from utils.misc_utils import compute_attention


class BaselineViT(torch.nn.Module):
    """
    Modifications:
    - Use PDiscoBlock instead of Block
    - Use PDiscoAttention instead of Attention
    - Return the mean of k over heads from attention
    - Option to use only class tokens or only patch tokens or both (concat) for classification
    """

    def __init__(self, init_model: torch.nn.Module, num_classes: int,
                 class_tokens_only: bool = False,
                 patch_tokens_only: bool = False, return_transformer_qkv: bool = False) -> None:
        super().__init__()
        self.num_classes = num_classes
        self.class_tokens_only = class_tokens_only
        self.patch_tokens_only = patch_tokens_only
        self.num_prefix_tokens = init_model.num_prefix_tokens
        self.num_reg_tokens = init_model.num_reg_tokens
        self.has_class_token = init_model.has_class_token
        self.no_embed_class = init_model.no_embed_class
        self.cls_token = init_model.cls_token
        self.reg_token = init_model.reg_token

        self.patch_embed = init_model.patch_embed

        self.pos_embed = init_model.pos_embed
        self.pos_drop = init_model.pos_drop
        self.part_embed = nn.Identity()
        self.patch_prune = nn.Identity()
        self.norm_pre = init_model.norm_pre
        self.blocks = init_model.blocks
        self.norm = init_model.norm

        self.fc_norm = init_model.fc_norm
        if class_tokens_only or patch_tokens_only:
            self.head = nn.Linear(init_model.embed_dim, num_classes)
        else:
            self.head = nn.Linear(init_model.embed_dim * 2, num_classes)

        self.h_fmap = int(self.patch_embed.img_size[0] // self.patch_embed.patch_size[0])
        self.w_fmap = int(self.patch_embed.img_size[1] // self.patch_embed.patch_size[1])

        self.return_transformer_qkv = return_transformer_qkv
        self.convert_blocks_and_attention()
        self._init_weights_head()

    def convert_blocks_and_attention(self):
        for module in self.modules():
            if isinstance(module, Block):
                module.__class__ = BlockWQKVReturn
            elif isinstance(module, Attention):
                module.__class__ = AttentionWQKVReturn

    def _pos_embed(self, x: torch.Tensor) -> torch.Tensor:
        pos_embed = self.pos_embed
        to_cat = []
        if self.cls_token is not None:
            to_cat.append(self.cls_token.expand(x.shape[0], -1, -1))
        if self.reg_token is not None:
            to_cat.append(self.reg_token.expand(x.shape[0], -1, -1))
        if self.no_embed_class:
            # deit-3, updated JAX (big vision)
            # position embedding does not overlap with class token, add then concat
            x = x + pos_embed
            if to_cat:
                x = torch.cat(to_cat + [x], dim=1)
        else:
            # original timm, JAX, and deit vit impl
            # pos_embed has entry for class token, concat then add
            if to_cat:
                x = torch.cat(to_cat + [x], dim=1)
            x = x + pos_embed
        return self.pos_drop(x)

    def _init_weights_head(self):
        trunc_normal_(self.head.weight, std=.02)
        if self.head.bias is not None:
            nn.init.constant_(self.head.bias, 0.)

    def forward(self, x: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, torch.Tensor]:

        x = self.patch_embed(x)

        # Position Embedding
        x = self._pos_embed(x)

        x = self.part_embed(x)
        x = self.patch_prune(x)

        # Forward pass through transformer
        x = self.norm_pre(x)

        if self.return_transformer_qkv:
            # Return keys of last attention layer
            for i, blk in enumerate(self.blocks):
                x, qkv = blk(x, return_qkv=True)
        else:
            x = self.blocks(x)

        x = self.norm(x)

        # Classification head
        x = self.fc_norm(x)
        if self.class_tokens_only:  # only use class token
            x = x[:, 0, :]
        elif self.patch_tokens_only:  # only use patch tokens
            x = x[:, self.num_prefix_tokens:, :].mean(dim=1)
        else:
            x = torch.cat([x[:, 0, :], x[:, self.num_prefix_tokens:, :].mean(dim=1)], dim=1)
        x = self.head(x)
        if self.return_transformer_qkv:
            return x, qkv
        else:
            return x

    def get_specific_intermediate_layer(
            self,
            x: torch.Tensor,
            n: int = 1,
            return_qkv: bool = False,
            return_att_weights: bool = False,
    ):
        num_blocks = len(self.blocks)
        attn_weights = []
        if n >= num_blocks:
            raise ValueError(f"n must be less than {num_blocks}")

        # forward pass
        x = self.patch_embed(x)
        x = self._pos_embed(x)
        x = self.norm_pre(x)

        if n == -1:
            if return_qkv:
                raise ValueError("take_indice cannot be -1 if return_transformer_qkv is True")
            else:
                return x

        for i, blk in enumerate(self.blocks):
            if self.return_transformer_qkv:
                x, qkv = blk(x, return_qkv=True)

                if return_att_weights:
                    attn_weight, _ = compute_attention(qkv)
                    attn_weights.append(attn_weight.detach())
            else:
                x = blk(x)
            if i == n:
                output = x.clone()
                if self.return_transformer_qkv and return_qkv:
                    qkv_output = qkv.clone()
                break
        if self.return_transformer_qkv and return_qkv and return_att_weights:
            return output, qkv_output, attn_weights
        elif self.return_transformer_qkv and return_qkv:
            return output, qkv_output
        elif self.return_transformer_qkv and return_att_weights:
            return output, attn_weights
        else:
            return output

    def _intermediate_layers(
            self,
            x: torch.Tensor,
            n: Union[int, Sequence] = 1,
    ):
        outputs, num_blocks = [], len(self.blocks)
        if self.return_transformer_qkv:
            qkv_outputs = []
        take_indices = set(range(num_blocks - n, num_blocks) if isinstance(n, int) else n)

        # forward pass
        x = self.patch_embed(x)
        x = self._pos_embed(x)
        x = self.norm_pre(x)

        for i, blk in enumerate(self.blocks):
            if self.return_transformer_qkv:
                x, qkv = blk(x, return_qkv=True)
            else:
                x = blk(x)
            if i in take_indices:
                outputs.append(x)
                if self.return_transformer_qkv:
                    qkv_outputs.append(qkv)
        if self.return_transformer_qkv:
            return outputs, qkv_outputs
        else:
            return outputs

    def get_intermediate_layers(
            self,
            x: torch.Tensor,
            n: Union[int, Sequence] = 1,
            reshape: bool = False,
            return_prefix_tokens: bool = False,
            norm: bool = False,
    ) -> tuple[tuple, Any]:
        """ Intermediate layer accessor (NOTE: This is a WIP experiment).
        Inspired by DINO / DINOv2 interface
        """
        # take last n blocks if n is an int, if n is a sequence, select by matching indices
        if self.return_transformer_qkv:
            outputs, qkv = self._intermediate_layers(x, n)
        else:
            outputs = self._intermediate_layers(x, n)

        if norm:
            outputs = [self.norm(out) for out in outputs]
        prefix_tokens = [out[:, 0:self.num_prefix_tokens] for out in outputs]
        outputs = [out[:, self.num_prefix_tokens:] for out in outputs]

        if reshape:
            grid_size = self.patch_embed.grid_size
            outputs = [
                out.reshape(x.shape[0], grid_size[0], grid_size[1], -1).permute(0, 3, 1, 2).contiguous()
                for out in outputs
            ]

        if return_prefix_tokens:
            return_out = tuple(zip(outputs, prefix_tokens))
        else:
            return_out = tuple(outputs)

        if self.return_transformer_qkv:
            return return_out, qkv
        else:
            return return_out
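Note: a sketch of how BaselineViT wraps a timm backbone, assuming a recent timm version that exposes reg_token/num_prefix_tokens and that the repo's BlockWQKVReturn behaves like a plain Block when return_qkv is not requested; the backbone name is an assumption.

# Hypothetical usage sketch for the baseline wrapper.
import torch
from timm import create_model

backbone = create_model("vit_base_patch16_224", pretrained=False, num_classes=0)
model = BaselineViT(backbone, num_classes=200)     # concat of class token + mean patch token
logits = model(torch.rand(2, 3, 224, 224))
print(logits.shape)                                # torch.Size([2, 200])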
requirements.txt
CHANGED
@@ -3,4 +3,8 @@ timm
 colorcet
 matplotlib
 torchvision
-streamlit
+streamlit
+numpy
+pillow
+scikit-image
+huggingface-hub
utils/__init__.py
ADDED
@@ -0,0 +1,6 @@
from .data_utils import *
from .visualize_att_maps import *
from .misc_utils import *
from .get_landmark_coordinates import *
utils/data_utils/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .dataset_utils import *
from .reversible_affine_transform import *
from .transform_utils import *
from .class_balanced_distributed_sampler import *
from .class_balanced_sampler import *
utils/data_utils/class_balanced_distributed_sampler.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch.utils.data import Dataset
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import math
|
| 5 |
+
import torch.distributed as dist
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ClassBalancedDistributedSampler(torch.utils.data.Sampler):
|
| 9 |
+
"""
|
| 10 |
+
A custom sampler that sub-samples a given dataset based on class labels. Based on the DistributedSampler class
|
| 11 |
+
Ref: https://github.com/pytorch/pytorch/blob/04c1df651aa58bea50977f4efcf19b09ce27cefd/torch/utils/data/distributed.py#L13
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
def __init__(self, dataset: Dataset, num_replicas: Optional[int] = None, rank: Optional[int] = None,
|
| 15 |
+
shuffle: bool = True, seed: int = 0, drop_last: bool = False, num_samples_per_class=100) -> None:
|
| 16 |
+
|
| 17 |
+
if not shuffle:
|
| 18 |
+
raise ValueError("ClassBalancedDatasetSubSampler requires shuffling, otherwise use DistributedSampler")
|
| 19 |
+
|
| 20 |
+
# Check if the dataset has a generate_class_balanced_indices method
|
| 21 |
+
if not hasattr(dataset, 'generate_class_balanced_indices'):
|
| 22 |
+
raise ValueError("Dataset does not have a generate_class_balanced_indices method")
|
| 23 |
+
|
| 24 |
+
self.shuffle = shuffle
|
| 25 |
+
self.seed = seed
|
| 26 |
+
if num_replicas is None:
|
| 27 |
+
if not dist.is_available():
|
| 28 |
+
raise RuntimeError("Requires distributed package to be available")
|
| 29 |
+
num_replicas = dist.get_world_size()
|
| 30 |
+
if rank is None:
|
| 31 |
+
if not dist.is_available():
|
| 32 |
+
raise RuntimeError("Requires distributed package to be available")
|
| 33 |
+
rank = dist.get_rank()
|
| 34 |
+
if rank >= num_replicas or rank < 0:
|
| 35 |
+
raise ValueError(
|
| 36 |
+
f"Invalid rank {rank}, rank should be in the interval [0, {num_replicas - 1}]")
|
| 37 |
+
self.dataset = dataset
|
| 38 |
+
self.num_replicas = num_replicas
|
| 39 |
+
self.rank = rank
|
| 40 |
+
self.epoch = 0
|
| 41 |
+
self.drop_last = drop_last
|
| 42 |
+
|
| 43 |
+
# Calculate the number of samples
|
| 44 |
+
g = torch.Generator()
|
| 45 |
+
g.manual_seed(self.seed + self.epoch)
|
| 46 |
+
self.num_samples_per_class = num_samples_per_class
|
| 47 |
+
indices = dataset.generate_class_balanced_indices(torch.Generator(),
|
| 48 |
+
num_samples_per_class=num_samples_per_class)
|
| 49 |
+
dataset_size = len(indices)
|
| 50 |
+
|
| 51 |
+
# If the dataset length is evenly divisible by # of replicas, then there
|
| 52 |
+
# is no need to drop any data, since the dataset will be split equally.
|
| 53 |
+
if self.drop_last and len(self.dataset) % self.num_replicas != 0: # type: ignore[arg-type]
|
| 54 |
+
# Split to nearest available length that is evenly divisible.
|
| 55 |
+
# This is to ensure each rank receives the same amount of data when
|
| 56 |
+
# using this Sampler.
|
| 57 |
+
self.num_samples = math.ceil(
|
| 58 |
+
(dataset_size - self.num_replicas) / self.num_replicas # type: ignore[arg-type]
|
| 59 |
+
)
|
| 60 |
+
else:
|
| 61 |
+
self.num_samples = math.ceil(dataset_size / self.num_replicas) # type: ignore[arg-type]
|
| 62 |
+
self.total_size = self.num_samples * self.num_replicas
|
| 63 |
+
|
| 64 |
+
def __iter__(self):
|
| 65 |
+
# deterministically shuffle based on epoch and seed, here shuffle is assumed to be True
|
| 66 |
+
g = torch.Generator()
|
| 67 |
+
g.manual_seed(self.seed + self.epoch)
|
| 68 |
+
indices = self.dataset.generate_class_balanced_indices(g, num_samples_per_class=self.num_samples_per_class)
|
| 69 |
+
|
| 70 |
+
if not self.drop_last:
|
| 71 |
+
# add extra samples to make it evenly divisible
|
| 72 |
+
padding_size = self.total_size - len(indices)
|
| 73 |
+
if padding_size <= len(indices):
|
| 74 |
+
indices += indices[:padding_size]
|
| 75 |
+
else:
|
| 76 |
+
indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
|
| 77 |
+
else:
|
| 78 |
+
# remove tail of data to make it evenly divisible.
|
| 79 |
+
indices = indices[:self.total_size]
|
| 80 |
+
|
| 81 |
+
# subsample
|
| 82 |
+
indices = indices[self.rank:self.total_size:self.num_replicas]
|
| 83 |
+
|
| 84 |
+
return iter(indices)
|
| 85 |
+
|
| 86 |
+
def __len__(self) -> int:
|
| 87 |
+
return self.num_samples
|
| 88 |
+
|
| 89 |
+
def set_epoch(self, epoch: int) -> None:
|
| 90 |
+
r"""
|
| 91 |
+
Set the epoch for this sampler.
|
| 92 |
+
|
| 93 |
+
When :attr:`shuffle=True`, this ensures all replicas
|
| 94 |
+
use a different random ordering for each epoch. Otherwise, the next iteration of this
|
| 95 |
+
sampler will yield the same ordering.
|
| 96 |
+
|
| 97 |
+
Args:
|
| 98 |
+
epoch (int): Epoch number.
|
| 99 |
+
"""
|
| 100 |
+
self.epoch = epoch
|
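Note: a sketch of the intended DDP wiring, assuming torch.distributed is already initialized and that the training dataset implements generate_class_balanced_indices; the dataset and loop variables below are placeholders.

# Hypothetical DDP usage sketch.
from torch.utils.data import DataLoader

sampler = ClassBalancedDistributedSampler(train_dataset, shuffle=True, num_samples_per_class=50)
loader = DataLoader(train_dataset, batch_size=32, sampler=sampler)
for epoch in range(num_epochs):
    sampler.set_epoch(epoch)   # reshuffle the balanced subset consistently across replicas
    for batch in loader:
        ...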
utils/data_utils/class_balanced_sampler.py
ADDED
@@ -0,0 +1,31 @@
import torch
from torch.utils.data import Dataset


class ClassBalancedRandomSampler(torch.utils.data.Sampler):
    """
    A custom sampler that sub-samples a given dataset based on class labels. Based on the RandomSampler class
    This is essentially the non-ddp version of ClassBalancedDistributedSampler
    Ref: https://github.com/pytorch/pytorch/blob/abe3c55a6a01c5b625eeb4fc9aab1421a5965cd2/torch/utils/data/sampler.py#L117
    """

    def __init__(self, dataset: Dataset, num_samples_per_class=100, seed: int = 0) -> None:
        self.dataset = dataset
        self.seed = seed
        # Calculate the number of samples
        self.generator = torch.Generator()
        self.generator.manual_seed(self.seed)
        self.num_samples_per_class = num_samples_per_class
        indices = dataset.generate_class_balanced_indices(self.generator,
                                                          num_samples_per_class=num_samples_per_class)
        self.num_samples = len(indices)

    def __iter__(self):
        # Change seed for every function call
        seed = int(torch.empty((), dtype=torch.int64).random_().item())
        self.generator.manual_seed(seed)
        indices = self.dataset.generate_class_balanced_indices(self.generator, num_samples_per_class=self.num_samples_per_class)
        return iter(indices)

    def __len__(self) -> int:
        return self.num_samples
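Note: the single-process counterpart of the sketch above, under the same dataset assumption.

# Hypothetical single-process usage sketch.
from torch.utils.data import DataLoader

sampler = ClassBalancedRandomSampler(train_dataset, num_samples_per_class=50, seed=0)
loader = DataLoader(train_dataset, batch_size=32, sampler=sampler, num_workers=4)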
utils/data_utils/dataset_utils.py
ADDED
@@ -0,0 +1,161 @@
from PIL import Image
from torch import Tensor
from typing import List, Optional
import numpy as np
import torchvision
import json


def load_json(path: str):
    """
    Load json file from path and return the data
    :param path: Path to the json file
    :return:
    data: Data in the json file
    """
    with open(path, 'r') as f:
        data = json.load(f)
    return data


def save_json(data: dict, path: str):
    """
    Save data to a json file
    :param data: Data to be saved
    :param path: Path to save the data
    :return:
    """
    with open(path, "w") as f:
        json.dump(data, f)


def pil_loader(path):
    """
    Load image from path using PIL
    :param path: Path to the image
    :return:
    img: PIL Image
    """
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')


def get_dimensions(image: Tensor):
    """
    Get the dimensions of the image
    :param image: Tensor or PIL Image or np.ndarray
    :return:
    h: Height of the image
    w: Width of the image
    """
    if isinstance(image, Tensor):
        _, h, w = image.shape
    elif isinstance(image, np.ndarray):
        h, w, _ = image.shape
    elif isinstance(image, Image.Image):
        w, h = image.size
    else:
        raise ValueError(f"Invalid image type: {type(image)}")
    return h, w


def center_crop_boxes_kps(img: Tensor, output_size: Optional[List[int]] = 448, parts: Optional[Tensor] = None,
                          boxes: Optional[Tensor] = None, num_keypoints: int = 15):
    """
    Calculate the center crop parameters for the bounding boxes and landmarks and update them
    :param img: Image
    :param output_size: Output size of the cropped image
    :param parts: Locations of the landmarks of following format: <part_id> <x> <y> <visible>
    :param boxes: Bounding boxes of the landmarks of following format: <image_id> <x> <y> <width> <height>
    :param num_keypoints: Number of keypoints
    :return:
    cropped_img: Center cropped image
    parts: Updated locations of the landmarks
    boxes: Updated bounding boxes of the landmarks
    """
    if isinstance(output_size, int):
        output_size = (output_size, output_size)
    elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
        output_size = (output_size[0], output_size[0])
    elif isinstance(output_size, (tuple, list)) and len(output_size) == 2:
        output_size = output_size
    else:
        raise ValueError(f"Invalid output size: {output_size}")

    crop_height, crop_width = output_size
    image_height, image_width = get_dimensions(img)
    img = torchvision.transforms.functional.center_crop(img, output_size)

    crop_top, crop_left = _get_center_crop_params_(image_height, image_width, output_size)

    if parts is not None:
        for j in range(num_keypoints):
            # Skip if part is invisible
            if parts[j][-1] == 0:
                continue
            parts[j][1] -= crop_left
            parts[j][2] -= crop_top

            # Skip if part is outside the crop
            if parts[j][1] > crop_width or parts[j][2] > crop_height:
                parts[j][-1] = 0
            if parts[j][1] < 0 or parts[j][2] < 0:
                parts[j][-1] = 0

            parts[j][1] = min(crop_width, parts[j][1])
            parts[j][2] = min(crop_height, parts[j][2])
            parts[j][1] = max(0, parts[j][1])
            parts[j][2] = max(0, parts[j][2])

    if boxes is not None:
        boxes[1] -= crop_left
        boxes[2] -= crop_top
        boxes[1] = max(0, boxes[1])
        boxes[2] = max(0, boxes[2])
        boxes[1] = min(crop_width, boxes[1])
        boxes[2] = min(crop_height, boxes[2])

    return img, parts, boxes


def _get_center_crop_params_(image_height: int, image_width: int, output_size: Optional[List[int]] = 448):
    """
    Get the parameters for center cropping the image
    :param image_height: Height of the image
    :param image_width: Width of the image
    :param output_size: Output size of the cropped image
    :return:
    crop_top: Top coordinate of the cropped image
    crop_left: Left coordinate of the cropped image
    """
    if isinstance(output_size, int):
        output_size = (output_size, output_size)
    elif isinstance(output_size, (tuple, list)) and len(output_size) == 1:
        output_size = (output_size[0], output_size[0])
    elif isinstance(output_size, (tuple, list)) and len(output_size) == 2:
        output_size = output_size
    else:
        raise ValueError(f"Invalid output size: {output_size}")

    crop_height, crop_width = output_size

    if crop_width > image_width or crop_height > image_height:
        padding_ltrb = [
            (crop_width - image_width) // 2 if crop_width > image_width else 0,
            (crop_height - image_height) // 2 if crop_height > image_height else 0,
            (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
            (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
        ]
        crop_top, crop_left = padding_ltrb[1], padding_ltrb[0]
        return crop_top, crop_left

    if crop_width == image_width and crop_height == image_height:
        crop_top = 0
        crop_left = 0
        return crop_top, crop_left

    crop_top = int(round((image_height - crop_height) / 2.0))
    crop_left = int(round((image_width - crop_width) / 2.0))

    return crop_top, crop_left
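Note: an illustrative sketch of center_crop_boxes_kps with random tensors standing in for a real image and its CUB-style keypoint annotations (<part_id> <x> <y> <visible>); all values are made up.

# Illustrative sketch only (random data, not a real annotation file).
import torch

img = torch.rand(3, 500, 600)                                         # C x H x W image
parts = torch.tensor([[j, 300.0, 250.0, 1.0] for j in range(15)])     # 15 visible keypoints
cropped, parts, _ = center_crop_boxes_kps(img, output_size=448, parts=parts, num_keypoints=15)
print(cropped.shape)                                                  # torch.Size([3, 448, 448])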
utils/data_utils/reversible_affine_transform.py
ADDED
@@ -0,0 +1,82 @@
# Description: This file contains the code for the reversible affine transform
import torchvision.transforms as transforms
import torch
from typing import List, Optional, Tuple, Any


def generate_affine_trans_params(
        degrees: List[float],
        translate: Optional[List[float]],
        scale_ranges: Optional[List[float]],
        shears: Optional[List[float]],
        img_size: List[int],
) -> Tuple[float, Tuple[int, int], float, Any]:
    """Get parameters for affine transformation

    Returns:
        params to be passed to the affine transformation
    """
    angle = float(torch.empty(1).uniform_(float(degrees[0]), float(degrees[1])).item())
    if translate is not None:
        max_dx = float(translate[0] * img_size[0])
        max_dy = float(translate[1] * img_size[1])
        tx = int(round(torch.empty(1).uniform_(-max_dx, max_dx).item()))
        ty = int(round(torch.empty(1).uniform_(-max_dy, max_dy).item()))
        translations = (tx, ty)
    else:
        translations = (0, 0)

    if scale_ranges is not None:
        scale = float(torch.empty(1).uniform_(scale_ranges[0], scale_ranges[1]).item())
    else:
        scale = 1.0

    shear_x = shear_y = 0.0
    if shears is not None:
        shear_x = float(torch.empty(1).uniform_(shears[0], shears[1]).item())
        if len(shears) == 4:
            shear_y = float(torch.empty(1).uniform_(shears[2], shears[3]).item())

    shear = (shear_x, shear_y)
    if shear_x == 0.0 and shear_y == 0.0:
        shear = 0.0

    return angle, translations, scale, shear


def rigid_transform(img, angle, translate, scale, invert=False, shear=0,
                    interpolation=transforms.InterpolationMode.BILINEAR):
    """
    Affine transforms input image
    Modified from: https://github.com/robertdvdk/part_detection/blob/eec53f2f40602113f74c6c1f60a2034823b0fcaf/lib.py#L54
    Parameters
    ----------
    img: Tensor
        Input image
    angle: int
        Rotation angle between -180 and 180 degrees
    translate: [int]
        Sequence of horizontal/vertical translations
    scale: float
        How to scale the image
    invert: bool
        Whether to invert the transformation
    shear: float
        Shear angle in degrees
    interpolation: InterpolationMode
        Interpolation mode to calculate output values
    Returns
    ----------
    img: Tensor
        Transformed image

    """
    if not invert:
        img = transforms.functional.affine(img, angle=angle, translate=translate, scale=scale, shear=shear,
                                           interpolation=interpolation)
    else:
        translate = [-t for t in translate]
        img = transforms.functional.affine(img=img, angle=0, translate=translate, scale=1, shear=shear)
        img = transforms.functional.affine(img=img, angle=-angle, translate=[0, 0], scale=1 / scale, shear=shear)

    return img
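Note: a small round-trip sketch of the reversible transform, sampling parameters once, warping, and then undoing the warp; tensor shapes and parameter ranges are illustrative.

# Illustrative round-trip sketch.
import torch

img = torch.rand(1, 3, 224, 224)
angle, translations, scale, shear = generate_affine_trans_params(
    degrees=[-30, 30], translate=[0.1, 0.1], scale_ranges=[0.9, 1.1], shears=None, img_size=[224, 224])

warped = rigid_transform(img, angle, list(translations), scale)
recovered = rigid_transform(warped, angle, list(translations), scale, invert=True)
# `recovered` approximates `img` up to interpolation and border effects.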
utils/data_utils/transform_utils.py
ADDED
@@ -0,0 +1,118 @@
import torch
from torchvision import transforms as transforms
from torchvision.transforms import Compose

from timm.data.constants import \
    IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from timm.data import create_transform


def make_train_transforms(args):
    train_transforms: Compose = transforms.Compose([
        transforms.Resize(size=args.image_size, antialias=True),
        transforms.RandomHorizontalFlip(p=args.hflip),
        transforms.RandomVerticalFlip(p=args.vflip),
        transforms.ColorJitter(),
        transforms.RandomAffine(degrees=90, translate=(0.2, 0.2), scale=(0.8, 1.2)),
        transforms.RandomCrop(args.image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)

    ])
    return train_transforms


def make_test_transforms(args):
    test_transforms: Compose = transforms.Compose([
        transforms.Resize(size=args.image_size, antialias=True),
        transforms.CenterCrop(args.image_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)

    ])
    return test_transforms


def build_transform_timm(args, is_train=True):
    resize_im = args.image_size > 32
    imagenet_default_mean_and_std = args.imagenet_default_mean_and_std
    mean = IMAGENET_INCEPTION_MEAN if not imagenet_default_mean_and_std else IMAGENET_DEFAULT_MEAN
    std = IMAGENET_INCEPTION_STD if not imagenet_default_mean_and_std else IMAGENET_DEFAULT_STD

    if is_train:
        # this should always dispatch to transforms_imagenet_train
        transform = create_transform(
            input_size=args.image_size,
            is_training=True,
            color_jitter=args.color_jitter,
            hflip=args.hflip,
            vflip=args.vflip,
            auto_augment=args.aa,
            interpolation=args.train_interpolation,
            re_prob=args.reprob,
            re_mode=args.remode,
            re_count=args.recount,
            mean=mean,
            std=std,
        )
        if not resize_im:
            transform.transforms[0] = transforms.RandomCrop(
                args.image_size, padding=4)
        return transform

    t = []
    if resize_im:
        # warping (no cropping) when evaluated at 384 or larger
        if args.image_size >= 384:
            t.append(
                transforms.Resize((args.image_size, args.image_size),
                                  interpolation=transforms.InterpolationMode.BICUBIC, antialias=True),
            )
            print(f"Warping {args.image_size} size input images...")
        else:
            if args.crop_pct is None:
                args.crop_pct = 224 / 256
            size = int(args.image_size / args.crop_pct)
            t.append(
                # to maintain same ratio w.r.t. 224 images
                transforms.Resize(size, interpolation=transforms.InterpolationMode.BICUBIC, antialias=True),
            )
            t.append(transforms.CenterCrop(args.image_size))

    t.append(transforms.ToTensor())
    t.append(transforms.Normalize(mean, std))
    return transforms.Compose(t)


def inverse_normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD):
    mean = torch.as_tensor(mean)
    std = torch.as_tensor(std)
    un_normalize = transforms.Normalize((-mean / std).tolist(), (1.0 / std).tolist())
    return un_normalize


def normalize_only(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD):
    normalize = transforms.Normalize(mean=mean, std=std)
    return normalize


def inverse_normalize_w_resize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
                               resize_resolution=(256, 256)):
    mean = torch.as_tensor(mean)
    std = torch.as_tensor(std)
    resize_unnorm = transforms.Compose([
        transforms.Normalize((-mean / std).tolist(), (1.0 / std).tolist()),
        transforms.Resize(size=resize_resolution, antialias=True)])
    return resize_unnorm


def load_transforms(args):
    # Get the transforms and load the dataset
    if args.augmentations_to_use == 'timm':
        train_transforms = build_transform_timm(args, is_train=True)
    elif args.augmentations_to_use == 'cub_original':
        train_transforms = make_train_transforms(args)
    else:
        raise ValueError('Augmentations not supported.')
    test_transforms = make_test_transforms(args)
    return train_transforms, test_transforms
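Note: a sketch of driving load_transforms without the project's argument parser; the Namespace fields below simply mirror the attributes the 'cub_original' path reads, and their values are assumptions.

# Hypothetical usage sketch with a hand-built args object.
from types import SimpleNamespace

args = SimpleNamespace(image_size=224, hflip=0.5, vflip=0.0,
                       augmentations_to_use='cub_original')
train_tf, test_tf = load_transforms(args)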
utils/get_landmark_coordinates.py
ADDED
@@ -0,0 +1,41 @@
# This file contains the function to generate the center coordinates as tensor for the current net.
import torch


def landmark_coordinates(maps, grid_x=None, grid_y=None):
    """
    Generate the center coordinates as tensor for the current net.
    Modified from: https://github.com/robertdvdk/part_detection/blob/eec53f2f40602113f74c6c1f60a2034823b0fcaf/lib.py#L19
    Parameters
    ----------
    maps: torch.Tensor
        Attention map with shape (batch_size, channels, height, width) where channels is the landmark probability
    grid_x: torch.Tensor
        The grid x coordinates
    grid_y: torch.Tensor
        The grid y coordinates
    Returns
    ----------
    loc_x: Tensor
        The centroid x coordinates
    loc_y: Tensor
        The centroid y coordinates
    grid_x: Tensor
    grid_y: Tensor
    """
    return_grid = False
    if grid_x is None or grid_y is None:
        return_grid = True
        grid_x, grid_y = torch.meshgrid(torch.arange(maps.shape[2]),
                                        torch.arange(maps.shape[3]), indexing='ij')
        grid_x = grid_x.unsqueeze(0).unsqueeze(0).contiguous().to(maps.device, non_blocking=True)
        grid_y = grid_y.unsqueeze(0).unsqueeze(0).contiguous().to(maps.device, non_blocking=True)
    map_sums = maps.sum(3).sum(2).detach()
    maps_x = grid_x * maps
    maps_y = grid_y * maps
    loc_x = maps_x.sum(3).sum(2) / map_sums
    loc_y = maps_y.sum(3).sum(2) / map_sums
    if return_grid:
        return loc_x, loc_y, grid_x, grid_y
    else:
        return loc_x, loc_y
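Note: a toy example of the centroid computation on random, softmax-normalized part maps; the shapes are illustrative only.

# Toy shape check for landmark_coordinates.
import torch

maps = torch.rand(2, 9, 14, 14).softmax(dim=1)     # batch of 2, e.g. 8 parts + background
loc_x, loc_y, grid_x, grid_y = landmark_coordinates(maps)
print(loc_x.shape, loc_y.shape)                     # torch.Size([2, 9]) torch.Size([2, 9])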
utils/misc_utils.py
ADDED
@@ -0,0 +1,135 @@
import math
from functools import reduce

import torch
import numpy as np
import os
from pathlib import Path


def factors(n):
    return reduce(list.__add__,
                  ([i, n // i] for i in range(1, int(n ** 0.5) + 1) if n % i == 0))


def file_line_count(filename: str) -> int:
    """Count the number of lines in a file"""
    with open(filename, 'rb') as f:
        return sum(1 for _ in f)


def compute_attention(qkv, scale=None):
    """
    Compute attention matrix (same as in the pytorch scaled dot product attention)
    Ref: https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
    :param qkv: Query, key and value tensors concatenated along the first dimension
    :param scale: Scale factor for the attention computation
    :return:
    """
    if isinstance(qkv, torch.Tensor):
        query, key, value = qkv.unbind(0)
    else:
        query, key, value = qkv
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    L, S = query.size(-2), key.size(-2)
    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
    attn_weight = query @ key.transpose(-2, -1) * scale_factor
    attn_weight += attn_bias
    attn_weight = torch.softmax(attn_weight, dim=-1)
    attn_out = attn_weight @ value
    return attn_weight, attn_out


def compute_dot_product_similarity(a, b):
    scores = a @ b.transpose(-1, -2)
    return scores


def compute_cross_entropy(p, q):
    q = torch.nn.functional.log_softmax(q, dim=-1)
    loss = torch.sum(p * q, dim=-1)
    return - loss.mean()


def rollout(attentions, discard_ratio=0.9, head_fusion="max", device=torch.device("cuda")):
    """
    Perform attention rollout,
    Ref: https://github.com/jacobgil/vit-explain/blob/15a81d355a5aa6128ea4e71bbd56c28888d0f33b/vit_rollout.py#L9C1-L42C16
    Parameters
    ----------
    attentions : list
        List of attention matrices, one for each transformer layer
    discard_ratio : float
        Ratio of lowest attention values to discard
    head_fusion : str
        Type of fusion to use for attention heads. One of "mean", "max", "min"
    device : torch.device
        Device to use for computation
    Returns
    -------
    mask : np.ndarray
        Mask of shape (width, width), where width is the square root of the number of patches
    """
    result = torch.eye(attentions[0].size(-1), device=device)
    attentions = [attention.to(device) for attention in attentions]
    with torch.no_grad():
        for attention in attentions:
            if head_fusion == "mean":
                attention_heads_fused = attention.mean(axis=1)
            elif head_fusion == "max":
                attention_heads_fused = attention.max(axis=1).values
            elif head_fusion == "min":
                attention_heads_fused = attention.min(axis=1).values
            else:
                raise ValueError("Attention head fusion type Not supported")

            # Drop the lowest attentions, but
            # don't drop the class token
            flat = attention_heads_fused.view(attention_heads_fused.size(0), -1)
            _, indices = flat.topk(int(flat.size(-1) * discard_ratio), -1, False)
            indices = indices[indices != 0]
            flat[0, indices] = 0

            I = torch.eye(attention_heads_fused.size(-1), device=device)
            a = (attention_heads_fused + 1.0 * I) / 2
            a = a / a.sum(dim=-1)

            result = torch.matmul(a, result)

    # Normalize the result by max value in each row
    result = result / result.max(dim=-1, keepdim=True)[0]
    return result


def sync_bn_conversion(model: torch.nn.Module):
    """
    Convert BatchNorm to SyncBatchNorm (used for DDP)
    :param model: PyTorch model
    :return:
    model: PyTorch model with SyncBatchNorm layers
    """
    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    return model


def check_snapshot(args):
    """
    Create directory to save training checkpoints, otherwise load the existing checkpoint.
    Additionally, if it is an array training job, create a new directory for each training job.
    :param args: Arguments from the argument parser
    :return:
    """
    # Check if it is an array training job (i.e. training with multiple random seeds on the same settings)
    if args.array_training_job and not args.resume_training:
        args.snapshot_dir = os.path.join(args.snapshot_dir, str(args.seed))
        if not os.path.exists(args.snapshot_dir):
            save_dir = Path(args.snapshot_dir)
            save_dir.mkdir(parents=True, exist_ok=True)
    else:
        # Create directory to save training checkpoints, otherwise load the existing checkpoint
        if not os.path.exists(args.snapshot_dir):
            if ".pt" not in args.snapshot_dir or ".pth" not in args.snapshot_dir:
                save_dir = Path(args.snapshot_dir)
                save_dir.mkdir(parents=True, exist_ok=True)
            else:
                raise ValueError('Snapshot checkpoint does not exist.')
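Note: a shape sketch for compute_attention, with qkv stacked as (3, batch, heads, tokens, head_dim); the sizes are illustrative.

# Shape sketch for compute_attention.
import torch

qkv = torch.rand(3, 2, 12, 197, 64)
attn_weight, attn_out = compute_attention(qkv)
print(attn_weight.shape)   # torch.Size([2, 12, 197, 197])
print(attn_out.shape)      # torch.Size([2, 12, 197, 64])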
utils/visualize_att_maps.py
ADDED
@@ -0,0 +1,135 @@
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import colorcet as cc
import numpy as np
import skimage
from pathlib import Path
import os
import torch

from utils.data_utils.transform_utils import inverse_normalize_w_resize
from utils.misc_utils import factors

# Define the colors to use for the attention maps
colors = cc.glasbey_category10


class VisualizeAttentionMaps:
    def __init__(self, snapshot_dir="", save_resolution=(256, 256), alpha=0.5, sub_path_test="",
                 dataset_name="", bg_label=0, batch_size=32, num_parts=15, plot_ims_separately=False,
                 plot_landmark_amaps=False):
        """
        Plot attention maps and optionally landmark centroids on images.
        :param snapshot_dir: Directory to save the visualization results
        :param save_resolution: Size of the images to save
        :param alpha: The transparency of the attention maps
        :param sub_path_test: The sub-path of the test dataset
        :param dataset_name: The name of the dataset
        :param bg_label: The background label index in the attention maps
        :param batch_size: The batch size
        :param num_parts: The number of parts in the attention maps
        :param plot_ims_separately: Whether to plot the images separately
        :param plot_landmark_amaps: Whether to plot the landmark attention maps
        """
        self.save_resolution = save_resolution
        self.alpha = alpha
        self.sub_path_test = sub_path_test
        self.dataset_name = dataset_name
        self.bg_label = bg_label
        self.snapshot_dir = snapshot_dir

        self.resize_unnorm = inverse_normalize_w_resize(resize_resolution=self.save_resolution)
        self.batch_size = batch_size
        self.nrows = factors(self.batch_size)[-1]
        self.ncols = factors(self.batch_size)[-2]
        self.num_parts = num_parts
        self.req_colors = colors[:num_parts]
        self.plot_ims_separately = plot_ims_separately
        self.plot_landmark_amaps = plot_landmark_amaps
        if self.nrows == 1 and self.ncols == 1:
            self.figs_size = (10, 10)
        else:
            self.figs_size = (self.ncols * 2, self.nrows * 2)

    def recalculate_nrows_ncols(self):
        self.nrows = factors(self.batch_size)[-1]
        self.ncols = factors(self.batch_size)[-2]
        if self.nrows == 1 and self.ncols == 1:
            self.figs_size = (10, 10)
        else:
            self.figs_size = (self.ncols * 2, self.nrows * 2)

    @torch.no_grad()
    def show_maps(self, ims, maps, epoch=0, curr_iter=0, extra_info=""):
        """
        Plot images, attention maps and landmark centroids.
        Parameters
        ----------
        ims: Tensor, [batch_size, 3, width_im, height_im]
            Input images on which to show the attention maps
        maps: Tensor, [batch_size, number of parts + 1, width_map, height_map]
            The attention maps to display
        epoch: int
            The epoch number
        curr_iter: int
            The current iteration number
        extra_info: str
            Any extra information to add to the file name
        """
        ims = self.resize_unnorm(ims)
        if ims.shape[0] != self.batch_size:
            self.batch_size = ims.shape[0]
            self.recalculate_nrows_ncols()
        fig, axs = plt.subplots(nrows=self.nrows, ncols=self.ncols, squeeze=False, figsize=self.figs_size)
        ims = (ims.permute(0, 2, 3, 1).cpu().numpy() * 255).astype(np.uint8)
        map_argmax = torch.nn.functional.interpolate(maps.clone().detach(), size=self.save_resolution,
                                                     mode='bilinear',
                                                     align_corners=True).argmax(dim=1).cpu().numpy()
        for i, ax in enumerate(axs.ravel()):
            curr_map = skimage.color.label2rgb(label=map_argmax[i], image=ims[i], colors=self.req_colors,
                                               bg_label=self.bg_label, alpha=self.alpha)
            ax.imshow(curr_map)
            ax.axis('off')
        save_dir = Path(os.path.join(self.snapshot_dir, 'results_vis_' + self.sub_path_test))
        save_dir.mkdir(parents=True, exist_ok=True)
        save_path = os.path.join(save_dir, f'{epoch}_{curr_iter}_{self.dataset_name}{extra_info}.png')
        fig.tight_layout()
        if self.snapshot_dir != "":
            plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
        else:
            plt.show()
        plt.close('all')

        if self.plot_ims_separately:
            fig, axs = plt.subplots(nrows=self.nrows, ncols=self.ncols, squeeze=False, figsize=self.figs_size)
            for i, ax in enumerate(axs.ravel()):
                ax.imshow(ims[i])
                ax.axis('off')
            save_path = os.path.join(save_dir, f'image_{epoch}_{curr_iter}_{self.dataset_name}{extra_info}.jpg')
            fig.tight_layout()
            if self.snapshot_dir != "":
                plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
            else:
                plt.show()
            plt.close('all')

        if self.plot_landmark_amaps:
            if self.batch_size > 1:
                raise ValueError('Not implemented for batch size > 1')
            for i in range(self.num_parts):
                fig, ax = plt.subplots(1, 1, figsize=self.figs_size)
                divider = make_axes_locatable(ax)
                cax = divider.append_axes('right', size='5%', pad=0.05)
                im = ax.imshow(maps[0, i, ...].detach().cpu().numpy(), cmap='cet_gouldian')
                fig.colorbar(im, cax=cax, orientation='vertical')
                ax.axis('off')
                save_path = os.path.join(save_dir,
                                         f'landmark_{i}_{epoch}_{curr_iter}_{self.dataset_name}{extra_info}.png')
                fig.tight_layout()
                if self.snapshot_dir != "":
                    plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
                else:
                    plt.show()
                plt.close()

        plt.close('all')
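Note: a hypothetical sketch of visualizing part assignment maps with this class; the random tensors stand in for normalized inputs and model-produced maps, and the choice of 8 parts plus a background channel is an assumption.

# Hypothetical usage sketch (random data in place of real model outputs).
import torch

viz = VisualizeAttentionMaps(snapshot_dir="", batch_size=4, num_parts=8, bg_label=8)
ims = torch.rand(4, 3, 224, 224)                 # stands in for a normalized input batch
maps = torch.rand(4, 9, 14, 14).softmax(dim=1)   # 8 parts + background channel
viz.show_maps(ims, maps)                          # shows the overlays since snapshot_dir is empty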