ananthu-aniraj committed
Commit a8d9779
Parent(s): 20239f9

upload initial version

app.py ADDED
@@ -0,0 +1,28 @@
+ import streamlit as st
+ import torch
+ from PIL import Image
+ from models import IndividualLandmarkViT
+ from utils import VisualizeAttentionMaps
+ from utils.data_utils.transform_utils import make_test_transforms
+
+
+ st.title("Pdiscoformer Part Discovery Visualizer")
+ # Set the device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # Load the model
+ model = IndividualLandmarkViT.from_pretrained("ananthu-aniraj/pdiscoformer_cub_k_8").eval().to(device)
+
+ amap_vis = VisualizeAttentionMaps(num_parts=9, bg_label=8)
+
+ image_size = 518
+ test_transforms = make_test_transforms(image_size)
+ image_name = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])  # Upload an image
+ if image_name is not None:
+     image = Image.open(image_name).convert("RGB")
+     image_tensor = test_transforms(image).unsqueeze(0).to(device)
+     with torch.no_grad():
+         maps, scores = model(image_tensor)
+
+     coloured_map = amap_vis.show_maps(image_tensor, maps)
+     st.image(coloured_map, caption="Attention Map", use_column_width=True)
+
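With the packages from requirements.txt installed, the new demo is expected to start locally with the standard Streamlit entry point:

streamlit run app.py

The app then loads the pretrained PDiscoFormer checkpoint, applies the test transforms to an uploaded image, and displays the coloured part-attention overlay.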
models/individual_landmark_vit.py CHANGED
@@ -26,11 +26,10 @@ class IndividualLandmarkViT(torch.nn.Module, PyTorchModelHubMixin,
  part_dropout: float = 0.3, return_transformer_qkv: bool = False,
  modulation_type: str = "original", gumbel_softmax: bool = False,
  gumbel_softmax_temperature: float = 1.0, gumbel_softmax_hard: bool = False,
- modulation_orth: bool = False, classifier_type: str = "linear", noise_variance: float = 0.0) -> None:
+ classifier_type: str = "linear") -> None:
  super().__init__()
  self.num_landmarks = num_landmarks
  self.num_classes = num_classes
- self.noise_variance = noise_variance
  self.num_prefix_tokens = init_model.num_prefix_tokens
  self.num_reg_tokens = init_model.num_reg_tokens
  self.has_class_token = init_model.has_class_token
@@ -75,7 +74,6 @@ class IndividualLandmarkViT(torch.nn.Module, PyTorchModelHubMixin,
  self.modulation = torch.nn.Identity()
  else:
  raise ValueError("modulation_type not implemented")
- self.modulation_orth = modulation_orth
  self.dropout_full_landmarks = torch.nn.Dropout1d(part_dropout)
  self.classifier_type = classifier_type
  if classifier_type == "independent_mlp":
@@ -168,10 +166,6 @@ class IndividualLandmarkViT(torch.nn.Module, PyTorchModelHubMixin,

  # Use maps to get weighted average features per landmark
  all_features = (maps.unsqueeze(1) * x.unsqueeze(2)).contiguous()
- if self.noise_variance > 0.0:
-     all_features += torch.randn_like(all_features,
-                                      device=all_features.device) * x.std().detach() * self.noise_variance
-
  all_features = all_features.mean(-1).mean(-1).contiguous()  # [B, embed_dim, num_landmarks + 1]

  # Modulate the features
@@ -184,10 +178,9 @@ class IndividualLandmarkViT(torch.nn.Module, PyTorchModelHubMixin,
  scores = self.fc_class_landmarks(
      self.dropout_full_landmarks(all_features_mod[..., :-1].permute(0, 2, 1).contiguous())).permute(0, 2,
                                                                                                      1).contiguous()
- if self.modulation_orth:
-     return all_features_mod, maps, scores, dist
- else:
-     return all_features, maps, scores, dist
+ scores = scores.mean(dim=-1)  # [B, num_classes]
+
+ return maps, scores

  def get_specific_intermediate_layer(
      self,
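The forward pass is now trimmed to what the demo needs: it returns only the part attention maps and the class scores averaged over parts. A minimal sketch of the new calling contract, assuming the repository's models package is importable; the checkpoint id and the 518-pixel input size are the ones used in app.py above, and the exact spatial size of maps depends on the backbone's patch grid:

import torch
from models import IndividualLandmarkViT

model = IndividualLandmarkViT.from_pretrained("ananthu-aniraj/pdiscoformer_cub_k_8").eval()
dummy = torch.randn(1, 3, 518, 518)  # one normalized RGB image at the demo resolution
with torch.no_grad():
    maps, scores = model(dummy)
# maps:   [1, num_parts + 1, H, W] soft part-assignment maps (app.py treats index 8 as background)
# scores: [1, num_classes] classification logits, averaged over the discovered parts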
requirements.txt CHANGED
@@ -7,4 +7,5 @@ streamlit
  numpy
  pillow
  scikit-image
- huggingface-hub
+ huggingface-hub
+ opencv-python
utils/data_utils/transform_utils.py CHANGED
@@ -3,29 +3,13 @@ from torchvision import transforms as transforms
  from torchvision.transforms import Compose

  from timm.data.constants import \
-     IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
- from timm.data import create_transform
+     IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD


- def make_train_transforms(args):
-     train_transforms: Compose = transforms.Compose([
-         transforms.Resize(size=args.image_size, antialias=True),
-         transforms.RandomHorizontalFlip(p=args.hflip),
-         transforms.RandomVerticalFlip(p=args.vflip),
-         transforms.ColorJitter(),
-         transforms.RandomAffine(degrees=90, translate=(0.2, 0.2), scale=(0.8, 1.2)),
-         transforms.RandomCrop(args.image_size),
-         transforms.ToTensor(),
-         transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)
-
-     ])
-     return train_transforms
-
-
- def make_test_transforms(args):
+ def make_test_transforms(image_size):
      test_transforms: Compose = transforms.Compose([
-         transforms.Resize(size=args.image_size, antialias=True),
-         transforms.CenterCrop(args.image_size),
+         transforms.Resize(size=image_size, antialias=True),
+         transforms.CenterCrop(image_size),
          transforms.ToTensor(),
          transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)

@@ -33,57 +17,6 @@ def make_test_transforms(args):
      return test_transforms


- def build_transform_timm(args, is_train=True):
-     resize_im = args.image_size > 32
-     imagenet_default_mean_and_std = args.imagenet_default_mean_and_std
-     mean = IMAGENET_INCEPTION_MEAN if not imagenet_default_mean_and_std else IMAGENET_DEFAULT_MEAN
-     std = IMAGENET_INCEPTION_STD if not imagenet_default_mean_and_std else IMAGENET_DEFAULT_STD
-
-     if is_train:
-         # this should always dispatch to transforms_imagenet_train
-         transform = create_transform(
-             input_size=args.image_size,
-             is_training=True,
-             color_jitter=args.color_jitter,
-             hflip=args.hflip,
-             vflip=args.vflip,
-             auto_augment=args.aa,
-             interpolation=args.train_interpolation,
-             re_prob=args.reprob,
-             re_mode=args.remode,
-             re_count=args.recount,
-             mean=mean,
-             std=std,
-         )
-         if not resize_im:
-             transform.transforms[0] = transforms.RandomCrop(
-                 args.image_size, padding=4)
-         return transform
-
-     t = []
-     if resize_im:
-         # warping (no cropping) when evaluated at 384 or larger
-         if args.image_size >= 384:
-             t.append(
-                 transforms.Resize((args.image_size, args.image_size),
-                                   interpolation=transforms.InterpolationMode.BICUBIC, antialias=True),
-             )
-             print(f"Warping {args.image_size} size input images...")
-         else:
-             if args.crop_pct is None:
-                 args.crop_pct = 224 / 256
-             size = int(args.image_size / args.crop_pct)
-             t.append(
-                 # to maintain same ratio w.r.t. 224 images
-                 transforms.Resize(size, interpolation=transforms.InterpolationMode.BICUBIC, antialias=True),
-             )
-         t.append(transforms.CenterCrop(args.image_size))
-
-     t.append(transforms.ToTensor())
-     t.append(transforms.Normalize(mean, std))
-     return transforms.Compose(t)
-
-
  def inverse_normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD):
      mean = torch.as_tensor(mean)
      std = torch.as_tensor(std)
@@ -104,15 +37,3 @@ def inverse_normalize_w_resize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_
          transforms.Normalize((-mean / std).tolist(), (1.0 / std).tolist()),
          transforms.Resize(size=resize_resolution, antialias=True)])
      return resize_unnorm
-
-
- def load_transforms(args):
-     # Get the transforms and load the dataset
-     if args.augmentations_to_use == 'timm':
-         train_transforms = build_transform_timm(args, is_train=True)
-     elif args.augmentations_to_use == 'cub_original':
-         train_transforms = make_train_transforms(args)
-     else:
-         raise ValueError('Augmentations not supported.')
-     test_transforms = make_test_transforms(args)
-     return train_transforms, test_transforms
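make_test_transforms now takes the image size directly instead of an argparse namespace. A minimal sketch of the new call, mirroring app.py; the 518-pixel size is the demo resolution and "bird.jpg" is a placeholder file name, not part of the repository:

from PIL import Image
from utils.data_utils.transform_utils import make_test_transforms

test_transforms = make_test_transforms(518)         # Resize -> CenterCrop -> ToTensor -> Normalize (ImageNet stats)
image = Image.open("bird.jpg").convert("RGB")       # placeholder input image
image_tensor = test_transforms(image).unsqueeze(0)  # [1, 3, 518, 518] normalized batch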
utils/visualize_att_maps.py CHANGED
@@ -1,66 +1,36 @@
- import matplotlib.pyplot as plt
- from mpl_toolkits.axes_grid1 import make_axes_locatable
  import colorcet as cc
  import numpy as np
  import skimage
- from pathlib import Path
- import os
  import torch

  from utils.data_utils.transform_utils import inverse_normalize_w_resize
- from utils.misc_utils import factors

  # Define the colors to use for the attention maps
  colors = cc.glasbey_category10


  class VisualizeAttentionMaps:
-     def __init__(self, snapshot_dir="", save_resolution=(256, 256), alpha=0.5, sub_path_test="",
-                  dataset_name="", bg_label=0, batch_size=32, num_parts=15, plot_ims_separately=False,
-                  plot_landmark_amaps=False):
+     def __init__(self, snapshot_dir="", save_resolution=(256, 256), alpha=0.5, bg_label=0, num_parts=15):
          """
          Plot attention maps and optionally landmark centroids on images.
          :param snapshot_dir: Directory to save the visualization results
          :param save_resolution: Size of the images to save
          :param alpha: The transparency of the attention maps
-         :param sub_path_test: The sub-path of the test dataset
-         :param dataset_name: The name of the dataset
          :param bg_label: The background label index in the attention maps
-         :param batch_size: The batch size
          :param num_parts: The number of parts in the attention maps
-         :param plot_ims_separately: Whether to plot the images separately
-         :param plot_landmark_amaps: Whether to plot the landmark attention maps
          """
          self.save_resolution = save_resolution
          self.alpha = alpha
-         self.sub_path_test = sub_path_test
-         self.dataset_name = dataset_name
          self.bg_label = bg_label
          self.snapshot_dir = snapshot_dir

          self.resize_unnorm = inverse_normalize_w_resize(resize_resolution=self.save_resolution)
-         self.batch_size = batch_size
-         self.nrows = factors(self.batch_size)[-1]
-         self.ncols = factors(self.batch_size)[-2]
          self.num_parts = num_parts
          self.req_colors = colors[:num_parts]
-         self.plot_ims_separately = plot_ims_separately
-         self.plot_landmark_amaps = plot_landmark_amaps
-         if self.nrows == 1 and self.ncols == 1:
-             self.figs_size = (10, 10)
-         else:
-             self.figs_size = (self.ncols * 2, self.nrows * 2)
-
-     def recalculate_nrows_ncols(self):
-         self.nrows = factors(self.batch_size)[-1]
-         self.ncols = factors(self.batch_size)[-2]
-         if self.nrows == 1 and self.ncols == 1:
-             self.figs_size = (10, 10)
-         else:
-             self.figs_size = (self.ncols * 2, self.nrows * 2)
+         self.figs_size = (10, 10)

      @torch.no_grad()
-     def show_maps(self, ims, maps, epoch=0, curr_iter=0, extra_info=""):
+     def show_maps(self, ims, maps):
          """
          Plot images, attention maps and landmark centroids.
          Parameters
@@ -69,67 +39,13 @@ class VisualizeAttentionMaps:
          Input images on which to show the attention maps
          maps: Tensor, [batch_size, number of parts + 1, width_map, height_map]
          The attention maps to display
-         epoch: int
-             The epoch number
-         curr_iter: int
-             The current iteration number
-         extra_info: str
-             Any extra information to add to the file name
          """
          ims = self.resize_unnorm(ims)
-         if ims.shape[0] != self.batch_size:
-             self.batch_size = ims.shape[0]
-             self.recalculate_nrows_ncols()
-         fig, axs = plt.subplots(nrows=self.nrows, ncols=self.ncols, squeeze=False, figsize=self.figs_size)
          ims = (ims.permute(0, 2, 3, 1).cpu().numpy() * 255).astype(np.uint8)
          map_argmax = torch.nn.functional.interpolate(maps.clone().detach(), size=self.save_resolution,
                                                       mode='bilinear',
                                                       align_corners=True).argmax(dim=1).cpu().numpy()
-         for i, ax in enumerate(axs.ravel()):
-             curr_map = skimage.color.label2rgb(label=map_argmax[i], image=ims[i], colors=self.req_colors,
-                                                bg_label=self.bg_label, alpha=self.alpha)
-             ax.imshow(curr_map)
-             ax.axis('off')
-         save_dir = Path(os.path.join(self.snapshot_dir, 'results_vis_' + self.sub_path_test))
-         save_dir.mkdir(parents=True, exist_ok=True)
-         save_path = os.path.join(save_dir, f'{epoch}_{curr_iter}_{self.dataset_name}{extra_info}.png')
-         fig.tight_layout()
-         if self.snapshot_dir != "":
-             plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
-         else:
-             plt.show()
-         plt.close('all')
-
-         if self.plot_ims_separately:
-             fig, axs = plt.subplots(nrows=self.nrows, ncols=self.ncols, squeeze=False, figsize=self.figs_size)
-             for i, ax in enumerate(axs.ravel()):
-                 ax.imshow(ims[i])
-                 ax.axis('off')
-             save_path = os.path.join(save_dir, f'image_{epoch}_{curr_iter}_{self.dataset_name}{extra_info}.jpg')
-             fig.tight_layout()
-             if self.snapshot_dir != "":
-                 plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
-             else:
-                 plt.show()
-             plt.close('all')
-
-         if self.plot_landmark_amaps:
-             if self.batch_size > 1:
-                 raise ValueError('Not implemented for batch size > 1')
-             for i in range(self.num_parts):
-                 fig, ax = plt.subplots(1, 1, figsize=self.figs_size)
-                 divider = make_axes_locatable(ax)
-                 cax = divider.append_axes('right', size='5%', pad=0.05)
-                 im = ax.imshow(maps[0, i, ...].detach().cpu().numpy(), cmap='cet_gouldian')
-                 fig.colorbar(im, cax=cax, orientation='vertical')
-                 ax.axis('off')
-                 save_path = os.path.join(save_dir,
-                                          f'landmark_{i}_{epoch}_{curr_iter}_{self.dataset_name}{extra_info}.png')
-                 fig.tight_layout()
-                 if self.snapshot_dir != "":
-                     plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
-                 else:
-                     plt.show()
-                 plt.close()

-         plt.close('all')
+         curr_map = skimage.color.label2rgb(label=map_argmax[0], image=ims[0], colors=self.req_colors,
+                                            bg_label=self.bg_label, alpha=self.alpha)
+         return curr_map
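show_maps no longer builds a matplotlib grid or writes files to disk; it returns a single RGB overlay (a NumPy array at save_resolution) for the first image in the batch, which the Streamlit app displays directly. A minimal sketch of the new usage, assuming image_tensor and maps are produced as in app.py; num_parts=9 and bg_label=8 are the values the demo uses for the CUB k=8 checkpoint:

from utils import VisualizeAttentionMaps

amap_vis = VisualizeAttentionMaps(num_parts=9, bg_label=8)
overlay = amap_vis.show_maps(image_tensor, maps)  # 256 x 256 RGB array with part colours blended over the image
# st.image(overlay, caption="Attention Map", use_column_width=True)  # as done in app.py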