Spaces:

YouLiXiya
/

Mobile-SAM

Running

App Files Files Community

YouLiXiya committed on Aug 11, 2023

Commit

7dbe662

1 Parent(s): 27d04f5

Upload 22 files

Browse files

Files changed (22) hide show

sam_extension/distillation_models/__init__.py +4 -0
sam_extension/distillation_models/__pycache__/__init__.cpython-38.pyc +0 -0
sam_extension/distillation_models/__pycache__/dino.cpython-38.pyc +0 -0
sam_extension/distillation_models/__pycache__/fastertinyvit.cpython-38.pyc +0 -0
sam_extension/distillation_models/__pycache__/fastervit.cpython-38.pyc +0 -0
sam_extension/distillation_models/__pycache__/sam.cpython-38.pyc +0 -0
sam_extension/distillation_models/dino.py +122 -0
sam_extension/distillation_models/fastertinyvit.py +233 -0
sam_extension/distillation_models/fastervit.py +659 -0
sam_extension/distillation_models/sam.py +369 -0
sam_extension/pipeline/__init__.py +4 -0
sam_extension/pipeline/__pycache__/__init__.cpython-38.pyc +0 -0
sam_extension/pipeline/__pycache__/base.cpython-38.pyc +0 -0
sam_extension/pipeline/__pycache__/groundingdino.cpython-38.pyc +0 -0
sam_extension/pipeline/__pycache__/owlvit.cpython-38.pyc +0 -0
sam_extension/pipeline/__pycache__/sam.cpython-38.pyc +0 -0
sam_extension/pipeline/base.py +20 -0
sam_extension/pipeline/groundingdino.py +97 -0
sam_extension/pipeline/owlvit.py +372 -0
sam_extension/pipeline/sam.py +722 -0
sam_extension/utils/__init__.py +175 -0
sam_extension/utils/__pycache__/__init__.cpython-38.pyc +0 -0

sam_extension/distillation_models/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .dino import DINO
+from .sam import SAMEncoderViT, DINOSAMViT
+from .fastertinyvit import FasterTinyViT
+# from .flashvision_transformer import FlashVisionTransformer

sam_extension/distillation_models/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (322 Bytes). View file

sam_extension/distillation_models/__pycache__/dino.cpython-38.pyc ADDED Viewed

Binary file (4.72 kB). View file

sam_extension/distillation_models/__pycache__/fastertinyvit.cpython-38.pyc ADDED Viewed

Binary file (6.26 kB). View file

sam_extension/distillation_models/__pycache__/fastervit.cpython-38.pyc ADDED Viewed

Binary file (18 kB). View file

sam_extension/distillation_models/__pycache__/sam.cpython-38.pyc ADDED Viewed

Binary file (10.7 kB). View file

sam_extension/distillation_models/dino.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import PIL
+from PIL.Image import Image
+from typing import Union
+from sklearn.decomposition import PCA
+import torch
+from torch import nn
+from torchvision import transforms as tfs
+# Per-channel mean / std handed to tfs.Normalize by the transform() helpers below
+# (these are the values commonly quoted as the ImageNet statistics).
+MEAN = [0.485, 0.456, 0.406]
+STD = [0.229, 0.224, 0.225]
+# torch.hub repository spec that DINO.__init__ loads its backbone from.
+DINO_MODEL_HUB = 'facebookresearch/dino:main'
+# Model names accepted by DINO.__init__ (enforced with an assert).
+DINO_MODEL_TYPE = ['dino_vits16',
+                  'dino_vits8',
+                  'dino_vitb16',
+                  'dino_vitb8',
+                  'dino_xcit_small_12_p16',
+                  'dino_xcit_small_12_p8',
+                  'dino_xcit_medium_24_p16',
+                  'dino_xcit_medium_24_p8',
+                  'dino_resnet50']
+# torch.hub repository spec that DINOV2.__init__ loads its backbone from.
+DINOV2_MODEL_HUB = 'facebookresearch/dinov2:main'
+# Model names accepted by DINOV2.__init__ (enforced with an assert).
+DINOV2_MODEL_TYPE = ['dinov2_vits14',
+                     'dinov2_vitb14',
+                     'dinov2_vitl14',
+                     'dinov2_vitg14']
+class DINO(nn.Module):
+    def __init__(self, model_type, device='cuda', img_size=224, pca_dim=None):
+        super(DINO, self).__init__()
+        assert model_type in DINO_MODEL_TYPE, 'Given DINO model type must in DINO_MODEL_TYPE!'
+        self.model = torch.hub.load(DINO_MODEL_HUB, model_type).to(device)
+        self.device = device
+        for param in self.model.parameters():
+            param.requires_grad = False
+        self.model.eval()
+        self.img_size = img_size
+        self.pca_dim = pca_dim
+        self.pca = self.set_pca(pca_dim) if pca_dim else None
+    def set_pca(self, dim=64):
+        return PCA(n_components=dim)
+    @torch.no_grad()
+    def extract_features(
+            self, img: Union[Image, torch.Tensor], transform=True, size=None
+    ):
+        if transform and isinstance(img, Image):
+            img = self.transform(img, self.img_size).unsqueeze(0)  # Nx3xHxW
+        with torch.no_grad():
+            out = self.model.get_intermediate_layers(img.to(self.device), n=1)[0]
+            out = out[:, 1:, :]  # we discard the [CLS] token
+            h, w = int(img.shape[2] / self.model.patch_embed.patch_size), int(
+                img.shape[3] / self.model.patch_embed.patch_size
+            )
+            dim = out.shape[-1]
+            out = out.reshape(-1, h, w, dim)
+            dtype = out.dtype
+            if size is not None:
+                out = torch.nn.functional.interpolate(out.permute(0, 3, 1, 2), size=size, mode='bilinear').permute(0, 2, 3, 1)
+            if self.pca:
+                B, H, W, C = out.shape
+                out = out.view(-1, C).cpu().numpy()
+                out = self.pca.fit_transform(out)
+                out = torch.tensor(out.reshape(B, H, W, self.pca_dim), dtype=dtype).to(self.device)
+        return out
+    def forward(self, img: Union[Image, torch.Tensor], transform=True, size=None):
+        return self.extract_features(img, transform, size)
+    @staticmethod
+    def transform(img, image_size):
+        transforms = tfs.Compose(
+            [tfs.Resize((image_size, image_size)), tfs.ToTensor(), tfs.Normalize(MEAN, STD)]
+        )
+        img = transforms(img)
+        return img
+class DINOV2(nn.Module):
+    def __init__(self, model_type, device='cuda', img_size=224, pca_dim=None):
+        super(DINOV2, self).__init__()
+        assert model_type in DINOV2_MODEL_TYPE, 'Given DINO model type must in DINO_MODEL_TYPE!'
+        self.model = torch.hub.load(DINOV2_MODEL_HUB, model_type).to(device)
+        self.device = device
+        for param in self.model.parameters():
+            param.requires_grad = False
+        self.model.eval()
+        self.img_size = img_size
+        self.pca_dim = pca_dim
+        self.pca = self.set_pca(pca_dim) if pca_dim else None
+    def set_pca(self, dim=64):
+        return PCA(n_components=dim)
+    @torch.no_grad()
+    def extract_features(
+            self, img: Union[Image, torch.Tensor], transform=True, size=None
+    ):
+        if transform and isinstance(img, Image):
+            img = self.transform(img, self.img_size).unsqueeze(0)  # Nx3xHxW
+        with torch.no_grad():
+            out = self.model.forward_features(img.to(self.device))['x_norm_patchtokens']
+            h, w = int(img.shape[2] / self.model.patch_size), int(
+                img.shape[3] / self.model.patch_size
+            )
+            dim = out.shape[-1]
+            out = out.reshape(-1, h, w, dim)
+            dtype = out.dtype
+            if size is not None:
+                out = torch.nn.functional.interpolate(out.permute(0, 3, 1, 2), size=size, mode='bilinear').permute(0, 2, 3, 1)
+            if self.pca:
+                B, H, W, C = out.shape
+                out = out.view(-1, C).cpu().numpy()
+                out = self.pca.fit_transform(out)
+                out = torch.tensor(out.reshape(B, H, W, self.pca_dim), dtype=dtype).to(self.device)
+        return out
+    def forward(self, img: Union[Image, torch.Tensor], transform=True, size=None):
+        return self.extract_features(img, transform, size)
+    @staticmethod
+    def transform(img, image_size):
+        transforms = tfs.Compose(
+            [tfs.Resize((image_size, image_size)), tfs.ToTensor(), tfs.Normalize(MEAN, STD)]
+        )
+        img = transforms(img)
+        return img

sam_extension/distillation_models/fastertinyvit.py ADDED Viewed

	@@ -0,0 +1,233 @@

+from typing import Tuple, List, Union
+import torch
+from torch import nn
+from torch.utils.checkpoint import checkpoint
+import torch.nn.functional as F
+from timm.models.layers import trunc_normal_
+from sam_extension.distillation_models.fastervit import FasterViTLayer
+from segment_anything.mobile_encoder.tiny_vit_sam import PatchEmbed, Conv2d_BN, LayerNorm2d, MBConv
+class PatchMerging(nn.Module):
+    def __init__(self, input_resolution, dim, out_dim, activation):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.dim = dim
+        self.out_dim = out_dim
+        self.act = activation()
+        self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
+        stride_c=2
+        if(out_dim==320 or out_dim==448 or out_dim==576):#handongshen  576
+            stride_c=1
+        self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim)
+        self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)
+    def forward(self, x):
+        if x.ndim == 3:
+            H, W = self.input_resolution
+            B = len(x)
+            # (B, C, H, W)
+            x = x.view(B, H, W, -1).permute(0, 3, 1, 2)
+        x = self.conv1(x)
+        x = self.act(x)
+        x = self.conv2(x)
+        x = self.act(x)
+        x = self.conv3(x)
+        return x
+class ConvLayer(nn.Module):
+    def __init__(self, dim, input_resolution, depth,
+                 activation,
+                 drop_path=0., downsample=None, use_checkpoint=False,
+                 out_dim=None,
+                 conv_expand_ratio=4.,
+                 ):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+        # build blocks
+        self.blocks = nn.ModuleList([
+            MBConv(dim, dim, conv_expand_ratio, activation,
+                   drop_path[i] if isinstance(drop_path, list) else drop_path,
+                   )
+            for i in range(depth)])
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(
+                input_resolution, dim=dim, out_dim=out_dim, activation=activation)
+        else:
+            self.downsample = None
+    def forward(self, x):
+        for blk in self.blocks:
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x)
+            else:
+                x = blk(x)
+        if self.downsample is not None:
+            x = self.downsample(x)
+        return x
+class FasterTinyViT(nn.Module):
+    def __init__(self, img_size=224,
+                 in_chans=3,
+                 out_chans=256,
+                 embed_dims=[96, 192, 384, 768], depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_sizes=[7, 7, 14, 7],
+                 mlp_ratio=4.,
+                 drop_rate=0.,
+                 drop_path_rate=0.1,
+                 use_checkpoint=False,
+                 mbconv_expand_ratio=4.0,
+                 ct_size=2,
+                 conv=False,
+                 multi_scale=False,
+                 output_shape=None,
+                 ):
+        super().__init__()
+        self.img_size = img_size
+        self.depths = depths
+        self.num_layers = len(depths)
+        self.mlp_ratio = mlp_ratio
+        self.multi_scale = multi_scale
+        self.output_shape = tuple(output_shape) if output_shape else None
+        activation = nn.GELU
+        self.patch_embed = PatchEmbed(in_chans=in_chans,
+                                      embed_dim=embed_dims[0],
+                                      resolution=img_size,
+                                      activation=activation)
+        patches_resolution = self.patch_embed.patches_resolution
+        self.patches_resolution = patches_resolution
+        # stochastic depth
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate,
+                                                sum(depths))]  # stochastic depth decay rule
+        # build layers
+        self.layers = nn.ModuleList()
+        for i_layer in range(self.num_layers):
+            kwargs_0 = dict(dim=embed_dims[i_layer],
+                          input_resolution=(patches_resolution[0] // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)),
+                                            patches_resolution[1] // (2 ** (i_layer - 1 if i_layer == 3 else i_layer))),
+                          #   input_resolution=(patches_resolution[0] // (2 ** i_layer),
+                          #                     patches_resolution[1] // (2 ** i_layer)),
+                          depth=depths[i_layer],
+                          drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                          downsample=PatchMerging if (
+                                  i_layer < self.num_layers - 1) else None,
+                          use_checkpoint=use_checkpoint,
+                          out_dim=embed_dims[min(
+                              i_layer + 1, len(embed_dims) - 1)],
+                          activation=activation,
+                          )
+            kwargs_1 = dict(dim=embed_dims[i_layer],
+                            out_dim=embed_dims[i_layer+1] if (
+                                  i_layer < self.num_layers - 1) else embed_dims[i_layer],
+                            input_resolution=patches_resolution[0] // (2 ** i_layer),
+                            depth=depths[i_layer],
+                            drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                            downsample=True if (i_layer < self.num_layers - 1) else False,
+                            ct_size=ct_size,
+                            conv=conv,
+                            )
+            if i_layer == 0:
+                layer = ConvLayer(
+                    conv_expand_ratio=mbconv_expand_ratio,
+                    **kwargs_0,
+                )
+            else:
+                layer = FasterViTLayer(
+                    num_heads=num_heads[i_layer],
+                    window_size=window_sizes[i_layer],
+                    mlp_ratio=self.mlp_ratio,
+                    drop=drop_rate,
+                    **kwargs_1)
+            self.layers.append(layer)
+        # init weights
+        self.apply(self._init_weights)
+        self.neck = nn.Sequential(
+            nn.Conv2d(
+                sum(embed_dims)+embed_dims[-1] if self.multi_scale and self.output_shape else embed_dims[-1],
+                out_chans,
+                kernel_size=1,
+                bias=False,
+            ),
+            LayerNorm2d(out_chans),
+            nn.Conv2d(
+                out_chans,
+                out_chans,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            LayerNorm2d(out_chans),
+        )
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+    @torch.jit.ignore
+    def no_weight_decay_keywords(self):
+        return {'attention_biases'}
+    def forward_features(self, x):
+        if self.multi_scale and self.output_shape:
+            output_list = []
+            # x: (N, C, H, W)
+            x = self.patch_embed(x)
+            output_list.append(F.interpolate(x, size=self.output_shape, mode='bilinear'))
+            for layer in self.layers:
+                x = layer(x)
+                output_list.append(F.interpolate(x, size=self.output_shape, mode='bilinear'))
+            x = self.neck(torch.cat(output_list, dim=1))
+        else:
+            x = self.patch_embed(x)
+            for layer in self.layers:
+                x = layer(x)
+            x = self.neck(x)
+        return x
+    def forward(self, x):
+        x = self.forward_features(x)
+        return x
+if __name__ == '__main__':
+    # Manual smoke test: build a three-stage FasterTinyViT on the GPU, run one
+    # random 1024x1024 batch through it and print the output shape and the
+    # parameter statistics reported by get_parameter_number.
+    from distillation.utils import get_parameter_number
+    x = torch.randn(1, 3, 1024, 1024).cuda()
+    fastertinyvit = FasterTinyViT(img_size=1024, in_chans=3,
+                embed_dims=[64, 128, 256],
+                depths=[1, 2, 1],
+                num_heads=[2, 4, 8],
+                window_sizes=[8, 8, 8],
+                mlp_ratio=4.,
+                drop_rate=0.,
+                drop_path_rate=0.0,
+                use_checkpoint=False,
+                mbconv_expand_ratio=4.0,
+               multi_scale=False,
+               # an empty string is falsy, so output_shape ends up as None
+               output_shape='').cuda()
+    print(fastertinyvit(x).shape)
+    print(get_parameter_number(fastertinyvit))
+    # torch.save(fastertinyvit, 'fastertinyvit.pt')

sam_extension/distillation_models/fastervit.py ADDED Viewed

	@@ -0,0 +1,659 @@

+import torch
+import numpy as np
+import torch.nn as nn
+from timm.models.layers import DropPath, LayerNorm2d
+def window_partition(x, window_size):
+    B, C, H, W = x.shape
+    x = x.view(B, C, H // window_size, window_size, W // window_size, window_size)
+    windows = x.permute(0, 2, 4, 3, 5, 1).reshape(-1, window_size*window_size, C)
+    return windows
+def window_reverse(windows, window_size, H, W, B):
+    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+    x = x.permute(0, 5, 1, 3, 2, 4).reshape(B, windows.shape[2], H, W)
+    return x
+def ct_dewindow(ct, W, H, window_size):
+    bs = ct.shape[0]
+    N=ct.shape[2]
+    ct2 = ct.view(-1, W//window_size, H//window_size, window_size, window_size, N).permute(0, 5, 1, 3, 2, 4)
+    ct2 = ct2.reshape(bs, N, W*H).transpose(1, 2)
+    return ct2
+def ct_window(ct, W, H, window_size):
+    bs = ct.shape[0]
+    N = ct.shape[2]
+    ct = ct.view(bs, H // window_size, window_size, W // window_size, window_size, N)
+    ct = ct.permute(0, 1, 3, 2, 4, 5)
+    return ct
+class PosEmbMLPSwinv2D(nn.Module):
+    """Relative position bias produced by a small MLP and added to windowed attention logits.
+    Args:
+        window_size: pair (Wh, Ww) giving the attention window extent.
+        pretrained_window_size: pair used to normalize the coordinate table
+            when its first entry is > 0; otherwise ``window_size`` is used.
+        num_heads: number of attention heads (output width of the MLP).
+        seq_length: side length of the zero-initialized ``relative_bias`` buffer.
+        ct_correct: when true and extra (non-window) tokens are present, the
+            padded rows/columns of the bias are filled from sampled entries
+            instead of zeros.
+        no_log: skip the log-spaced rescaling of the coordinate table.
+    """
+    def __init__(self,
+                 window_size,
+                 pretrained_window_size,
+                 num_heads, seq_length,
+                 ct_correct=False,
+                 no_log=False):
+        super().__init__()
+        self.window_size = window_size
+        self.num_heads = num_heads
+        # MLP mapping a 2-D relative coordinate to one bias value per head.
+        self.cpb_mlp = nn.Sequential(nn.Linear(2, 512, bias=True),
+                                     nn.ReLU(inplace=True),
+                                     nn.Linear(512, num_heads, bias=False))
+        relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32)
+        relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32)
+        relative_coords_table = torch.stack(
+            torch.meshgrid([relative_coords_h,
+                            relative_coords_w])).permute(1, 2, 0).contiguous().unsqueeze(0)  # 1, 2*Wh-1, 2*Ww-1, 2
+        # Scale the offsets by (window extent - 1) of the chosen reference window.
+        if pretrained_window_size[0] > 0:
+            relative_coords_table[:, :, :, 0] /= (pretrained_window_size[0] - 1)
+            relative_coords_table[:, :, :, 1] /= (pretrained_window_size[1] - 1)
+        else:
+            relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1)
+            relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1)
+        if not no_log:
+            relative_coords_table *= 8  # normalize to -8, 8
+            # Sign-preserving log2 compression, divided by log2(8).
+            relative_coords_table = torch.sign(relative_coords_table) * torch.log2(
+                torch.abs(relative_coords_table) + 1.0) / np.log2(8)
+        self.register_buffer("relative_coords_table", relative_coords_table)
+        # Pairwise index into the flattened coordinate table for every
+        # (query position, key position) pair inside a window.
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))
+        coords_flatten = torch.flatten(coords, 1)
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+        # Shift offsets to start at 0, then fold (dh, dw) into a single index.
+        relative_coords[:, :, 0] += self.window_size[0] - 1
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)
+        self.register_buffer("relative_position_index", relative_position_index)
+        self.grid_exists = False
+        self.pos_emb = None
+        self.deploy = False
+        relative_bias = torch.zeros(1, num_heads, seq_length, seq_length)
+        self.seq_length = seq_length
+        self.register_buffer("relative_bias", relative_bias)
+        self.ct_correct=ct_correct
+    def switch_to_deploy(self):
+        """Make ``forward`` reuse the stored ``relative_bias`` instead of recomputing it."""
+        self.deploy = True
+    def forward(self, input_tensor, local_window_size):
+        """Add the position bias to ``input_tensor`` in place and return it.
+        Args:
+            input_tensor: attention logits; dim 2 is the token count, which
+                may exceed ``local_window_size`` by some extra tokens.
+            local_window_size: number of tokens that belong to the local window.
+        """
+        if self.deploy:
+            input_tensor += self.relative_bias
+            return input_tensor
+        else:
+            # Outside deploy mode the flag is reset on every call, so the
+            # bias below is recomputed each time.
+            self.grid_exists = False
+        if not self.grid_exists:
+            self.grid_exists = True
+            relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads)
+            relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1],
+                -1)
+            relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
+            # Squash the bias into (0, 16).
+            relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
+            # Tokens beyond the local window (0 when there are none).
+            n_global_feature = input_tensor.shape[2] - local_window_size
+            if n_global_feature > 0 and self.ct_correct:
+                # Pick evenly spaced window positions to stand in for the
+                # extra tokens and gather their bias rows/columns.
+                step_for_ct=self.window_size[0]/(n_global_feature**0.5+1)
+                seq_length = int(n_global_feature ** 0.5)
+                indices = []
+                for i in range(seq_length):
+                    for j in range(seq_length):
+                        ind = (i+1)*step_for_ct*self.window_size[0] + (j+1)*step_for_ct
+                        indices.append(int(ind))
+                top_part = relative_position_bias[:, indices, :]
+                lefttop_part = relative_position_bias[:, indices, :][:, :, indices]
+                left_part = relative_position_bias[:, :, indices]
+            # Zero-pad n_global_feature entries at the start of the last two
+            # dims so the bias matches the full token count.
+            relative_position_bias = torch.nn.functional.pad(relative_position_bias, (n_global_feature,
+                                                                                      0,
+                                                                                      n_global_feature,
+                                                                                      0)).contiguous()
+            if n_global_feature>0 and self.ct_correct:
+                # Zero the whole bias, then fill only the padded rows/columns
+                # with the sampled parts gathered above.
+                relative_position_bias = relative_position_bias*0.0
+                relative_position_bias[:, :n_global_feature, :n_global_feature] = lefttop_part
+                relative_position_bias[:, :n_global_feature, n_global_feature:] = top_part
+                relative_position_bias[:, n_global_feature:, :n_global_feature] = left_part
+            self.pos_emb = relative_position_bias.unsqueeze(0)
+            # Store the result so deploy mode can reuse it.
+            self.relative_bias = self.pos_emb
+        input_tensor += self.pos_emb
+        return input_tensor
+class PosEmbMLPSwinv1D(nn.Module):
+    """Absolute position embedding produced by a small MLP (or 1x1 Conv1d stack) and added to the tokens.
+    Args:
+        dim: embedding width produced by the MLP.
+        rank: 1 for a 1-D coordinate, 2 for a square 2-D coordinate grid.
+        seq_length: length of the zero-initialized ``relative_bias`` buffer.
+        conv: use Conv1d layers; the token count is then read from dim 2
+            of the input instead of dim 1.
+    """
+    def __init__(self,
+                 dim,
+                 rank=2,
+                 seq_length=4,
+                 conv=False):
+        super().__init__()
+        self.rank = rank
+        if not conv:
+            self.cpb_mlp = nn.Sequential(nn.Linear(self.rank, 512, bias=True),
+                                         nn.ReLU(),
+                                         nn.Linear(512, dim, bias=False))
+        else:
+            self.cpb_mlp = nn.Sequential(nn.Conv1d(self.rank, 512, 1,bias=True),
+                                         nn.ReLU(),
+                                         nn.Conv1d(512, dim, 1,bias=False))
+        self.grid_exists = False
+        self.pos_emb = None
+        self.deploy = False
+        relative_bias = torch.zeros(1,seq_length, dim)
+        self.register_buffer("relative_bias", relative_bias)
+        self.conv = conv
+    def switch_to_deploy(self):
+        """Make ``forward`` reuse the stored ``relative_bias`` instead of recomputing it."""
+        self.deploy = True
+    def forward(self, input_tensor):
+        """Return ``input_tensor`` plus the position embedding (a new tensor, not in place)."""
+        seq_length = input_tensor.shape[1] if not self.conv else input_tensor.shape[2]
+        if self.deploy:
+            return input_tensor + self.relative_bias
+        else:
+            # Outside deploy mode the flag is reset on every call, so the
+            # embedding below is recomputed each time.
+            self.grid_exists = False
+        if not self.grid_exists:
+            self.grid_exists = True
+            if self.rank == 1:
+                # 1-D coordinates centred on seq_length // 2 and scaled by it.
+                relative_coords_h = torch.arange(0, seq_length, device=input_tensor.device, dtype = input_tensor.dtype)
+                relative_coords_h -= seq_length//2
+                relative_coords_h /= (seq_length//2)
+                relative_coords_table = relative_coords_h
+                self.pos_emb = self.cpb_mlp(relative_coords_table.unsqueeze(0).unsqueeze(2))
+                self.relative_bias = self.pos_emb
+            else:
+                # Tokens are treated as a square grid: side = sqrt(token count).
+                seq_length = int(seq_length**0.5)
+                relative_coords_h = torch.arange(0, seq_length, device=input_tensor.device, dtype = input_tensor.dtype)
+                relative_coords_w = torch.arange(0, seq_length, device=input_tensor.device, dtype = input_tensor.dtype)
+                relative_coords_table = torch.stack(torch.meshgrid([relative_coords_h, relative_coords_w])).contiguous().unsqueeze(0)
+                relative_coords_table -= seq_length // 2
+                relative_coords_table /= (seq_length // 2)
+                if not self.conv:
+                    # Linear layers expect (1, tokens, 2).
+                    self.pos_emb = self.cpb_mlp(relative_coords_table.flatten(2).transpose(1,2))
+                else:
+                    # Conv1d layers expect (1, 2, tokens).
+                    self.pos_emb = self.cpb_mlp(relative_coords_table.flatten(2))
+                # Store the result so deploy mode can reuse it.
+                self.relative_bias = self.pos_emb
+        input_tensor = input_tensor + self.pos_emb
+        return input_tensor
+class Mlp(nn.Module):
+    """
+    Multi-Layer Perceptron (MLP) block
+    """
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        """
+        Args:
+            in_features: input features dimension.
+            hidden_features: hidden features dimension.
+            out_features: output features dimension.
+            act_layer: activation function.
+            drop: dropout rate.
+        """
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x):
+        x_size = x.size()
+        x = x.view(-1, x_size[-1])
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        x = x.view(x_size)
+        return x
+class Downsample(nn.Module):
+    """
+    Down-sampling block based on: "Hatamizadeh et al.,
+    FasterViT: Fast Vision Transformers with Hierarchical Attention
+    """
+    def __init__(self,
+                 dim,
+                 out_dim,
+                 keep_dim=False,
+                 stride=2,
+                 ):
+        """
+        Args:
+            dim: feature size dimension.
+            norm_layer: normalization layer.
+            keep_dim: bool argument for maintaining the resolution.
+        """
+        super().__init__()
+        if keep_dim:
+            out_dim = dim
+        self.norm = LayerNorm2d(dim)
+        self.reduction = nn.Sequential(
+            nn.Conv2d(dim, out_dim, 3, stride, 1, bias=False),
+        )
+    def forward(self, x):
+        x = self.norm(x)
+        x = self.reduction(x)
+        return x
+class PatchEmbed(nn.Module):
+    """
+    Patch embedding block based on: "Hatamizadeh et al.,
+    FasterViT: Fast Vision Transformers with Hierarchical Attention
+    """
+    def __init__(self, in_chans=3, in_dim=64, dim=96):
+        """
+        Args:
+            in_chans: number of input channels.
+            dim: feature size dimension.
+        """
+        super().__init__()
+        self.proj = nn.Identity()
+        self.conv_down = nn.Sequential(
+            nn.Conv2d(in_chans, in_dim, 3, 2, 1, bias=False),
+            nn.BatchNorm2d(in_dim, eps=1e-4),
+            nn.ReLU(),
+            nn.Conv2d(in_dim, dim, 3, 2, 1, bias=False),
+            nn.BatchNorm2d(dim, eps=1e-4),
+            nn.ReLU()
+            )
+    def forward(self, x):
+        x = self.proj(x)
+        x = self.conv_down(x)
+        return x
+class ConvBlock(nn.Module):
+    """
+    Conv block based on: "Hatamizadeh et al.,
+    FasterViT: Fast Vision Transformers with Hierarchical Attention
+    """
+    def __init__(self, dim,
+                 drop_path=0.,
+                 layer_scale=None,
+                 kernel_size=3):
+        super().__init__()
+        """
+        Args:
+            drop_path: drop path.
+            layer_scale: layer scale coefficient.
+            kernel_size: kernel size.
+        """
+        self.conv1 = nn.Conv2d(dim, dim, kernel_size=kernel_size, stride=1, padding=1)
+        self.norm1 = nn.BatchNorm2d(dim, eps=1e-5)
+        self.act1 = nn.GELU()
+        self.conv2 = nn.Conv2d(dim, dim, kernel_size=kernel_size, stride=1, padding=1)
+        self.norm2 = nn.BatchNorm2d(dim, eps=1e-5)
+        self.layer_scale = layer_scale
+        if layer_scale is not None and type(layer_scale) in [int, float]:
+            self.gamma = nn.Parameter(layer_scale * torch.ones(dim))
+            self.layer_scale = True
+        else:
+            self.layer_scale = False
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+    def forward(self, x, global_feature=None):
+        input = x
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.act1(x)
+        x = self.conv2(x)
+        x = self.norm2(x)
+        if self.layer_scale:
+            x = x * self.gamma.view(1, -1, 1, 1)
+        x = input + self.drop_path(x)
+        return x, global_feature
class WindowAttention(nn.Module):
    """
    Multi-head self-attention over one token sequence, based on
    "Hatamizadeh et al., FasterViT: Fast Vision Transformers with
    Hierarchical Attention".

    The scaled attention logits are passed through ``PosEmbMLPSwinv2D``
    (attention positional bias) before the softmax.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 resolution=0,
                 seq_length=0):
        """
        Args:
            dim: feature size dimension.
            num_heads: number of attention heads.
            qkv_bias: whether the fused query/key/value projection has a bias.
            qk_scale: scale applied to the attention logits; when falsy,
                ``(dim // num_heads) ** -0.5`` is used instead.
            attn_drop: dropout rate applied to the attention weights.
            proj_drop: dropout rate applied after the output projection.
            resolution: window side length; forwarded to the positional bias
                module as both ``window_size`` and ``pretrained_window_size``.
            seq_length: sequence length forwarded to the positional bias module.
        """
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = qk_scale or self.head_dim ** -0.5
        # One linear layer emits queries, keys and values together.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        # attention positional bias
        window = [resolution, resolution]
        self.pos_emb_funct = PosEmbMLPSwinv2D(window_size=window,
                                              pretrained_window_size=[resolution, resolution],
                                              num_heads=num_heads,
                                              seq_length=seq_length)
        self.resolution = resolution

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (B, N, C) and return a tensor of shape (B, N, C)."""
        batch, _, channels = x.shape
        fused = self.qkv(x).reshape(batch, -1, 3, self.num_heads, channels // self.num_heads)
        # -> (3, B, heads, N, head_dim); split along the leading q/k/v axis.
        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)
        logits = (q @ k.transpose(-2, -1)) * self.scale
        logits = self.pos_emb_funct(logits, self.resolution ** 2)
        weights = self.attn_drop(logits.softmax(dim=-1))
        merged = (weights @ v).transpose(1, 2).reshape(batch, -1, channels)
        return self.proj_drop(self.proj(merged))
class HAT(nn.Module):
    """
    Hierarchical attention (HAT) block based on "Hatamizadeh et al.,
    FasterViT: Fast Vision Transformers with Hierarchical Attention".

    With ``sr_ratio > 1`` the block first runs attention + MLP over the
    carrier tokens, prepends them to the window tokens, runs window
    attention + MLP over the joint sequence and splits the two groups apart
    again. Otherwise only the window attention + MLP path runs and the
    carrier tokens are returned untouched.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 sr_ratio=1.,
                 window_size=7,
                 last=False,
                 layer_scale=None,
                 ct_size=1,
                 do_propagation=False):
        """
        Args:
            dim: feature size dimension.
            num_heads: number of attention heads.
            mlp_ratio: MLP hidden size as a multiple of ``dim``.
            qkv_bias: whether the query/key/value projections have a bias.
            qk_scale: optional override for the attention logit scale.
            drop: dropout rate used by the MLPs and the attention output projection.
            attn_drop: attention dropout rate.
            drop_path: ``DropPath`` rate; ``nn.Identity`` is used when it is not > 0.
            act_layer: activation function.
            norm_layer: normalization layer.
            sr_ratio: the carrier-token branch is built only when this is > 1;
                the total carrier-token count is ``ct_size**2 * sr_ratio**2``.
            window_size: window size.
            last: last layer flag.
            layer_scale: layer scale coefficient; learnable per-channel scales
                are created only when it is exactly an ``int`` or ``float``.
            ct_size: spatial dimension of carrier token local window.
            do_propagation: enable carrier token propagation.
        """
        super().__init__()
        hierarchical = sr_ratio > 1
        # A None value has neither type, so no separate None check is required.
        use_layer_scale = type(layer_scale) in (int, float)

        def make_gamma():
            # Learnable per-channel scale, or the neutral factor 1.
            if use_layer_scale:
                return nn.Parameter(layer_scale * torch.ones(dim))
            return 1

        # positional encoding for windowed attention tokens
        self.pos_embed = PosEmbMLPSwinv1D(dim, rank=2, seq_length=window_size ** 2)
        self.norm1 = norm_layer(dim)
        # number of carrier tokens per every window
        ct_per_window = ct_size ** 2 if hierarchical else 0
        # total number of carrier tokens
        ct_total = ct_per_window * sr_ratio * sr_ratio
        self.cr_window = ct_size
        self.attn = WindowAttention(dim,
                                    num_heads=num_heads,
                                    qkv_bias=qkv_bias,
                                    qk_scale=qk_scale,
                                    attn_drop=attn_drop,
                                    proj_drop=drop,
                                    resolution=window_size,
                                    seq_length=window_size ** 2 + ct_per_window)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=hidden_dim, act_layer=act_layer, drop=drop)
        self.window_size = window_size
        self.gamma3 = make_gamma()
        self.gamma4 = make_gamma()
        self.sr_ratio = sr_ratio
        if hierarchical:
            # if do hierarchical attention, this part is for carrier tokens
            self.hat_norm1 = norm_layer(dim)
            self.hat_norm2 = norm_layer(dim)
            self.hat_attn = WindowAttention(
                dim,
                num_heads=num_heads,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                attn_drop=attn_drop,
                proj_drop=drop,
                resolution=int(ct_total ** 0.5),
                seq_length=ct_total)
            self.hat_mlp = Mlp(in_features=dim, hidden_features=hidden_dim, act_layer=act_layer, drop=drop)
            self.hat_drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
            self.hat_pos_embed = PosEmbMLPSwinv1D(dim, rank=2, seq_length=ct_total)
            self.gamma1 = make_gamma()
            self.gamma2 = make_gamma()
            self.upsampler = nn.Upsample(size=window_size, mode='nearest')
        # keep track for the last block to explicitly add carrier tokens to feature maps
        self.last = last
        self.do_propagation = do_propagation

    def forward(self, x, carrier_tokens):
        """
        Run the block on window tokens ``x`` (3-D tensor) and ``carrier_tokens``.

        Returns:
            Tuple ``(x, ct)`` of updated window tokens and carrier tokens. When
            ``sr_ratio`` is not > 1, ``ct`` is ``carrier_tokens`` unchanged.
        """
        B, _, N = x.shape
        ct = carrier_tokens
        hierarchical = self.sr_ratio > 1
        x = self.pos_embed(x)
        if hierarchical:
            # do hierarchical attention via carrier tokens
            # first do attention for carrier tokens
            Bg, Ng, Hg = ct.shape
            grid = self.cr_window * self.sr_ratio
            # ct are located quite differently
            ct = ct_dewindow(ct, grid, grid, self.cr_window)
            # positional bias for carrier tokens
            ct = self.hat_pos_embed(ct)
            # attention plus mlp
            ct = ct + self.hat_drop_path(self.gamma1 * self.hat_attn(self.hat_norm1(ct)))
            ct = ct + self.hat_drop_path(self.gamma2 * self.hat_mlp(self.hat_norm2(ct)))
            # ct are put back to windows
            ct = ct_window(ct, grid, grid, self.cr_window)
            ct = ct.reshape(x.shape[0], -1, N)
            # concatenate carrier_tokens to the windowed tokens
            x = torch.cat((ct, x), dim=1)
        # window attention together with carrier tokens
        x = x + self.drop_path(self.gamma3 * self.attn(self.norm1(x)))
        x = x + self.drop_path(self.gamma4 * self.mlp(self.norm2(x)))
        if hierarchical:
            # for hierarchical attention we need to split carrier tokens and window tokens back
            tokens_per_window = self.window_size * self.window_size
            ctr, x = x.split([x.shape[1] - tokens_per_window, tokens_per_window], dim=1)
            ct = ctr.reshape(Bg, Ng, Hg)  # reshape carrier tokens.
            if self.last and self.do_propagation:
                # propagate carrier token information into the image
                ctr_image_space = ctr.transpose(1, 2).reshape(B, N, self.cr_window, self.cr_window)
                upsampled = self.upsampler(ctr_image_space.to(dtype=torch.float32))
                x = x + self.gamma1 * upsampled.flatten(2).transpose(1, 2).to(dtype=x.dtype)
        return x, ct
class TokenInitializer(nn.Module):
    """
    Carrier token initializer based on "Hatamizadeh et al.,
    FasterViT: Fast Vision Transformers with Hierarchical Attention".

    A depthwise 3x3 convolution followed by average pooling reduces the
    feature map; the pooled map is then regrouped into token sequences.
    """

    def __init__(self,
                 dim,
                 input_resolution,
                 window_size,
                 ct_size=1):
        """
        Args:
            dim: feature size dimension.
            input_resolution: input image resolution.
            window_size: window size.
            ct_size: spatial dimension of carrier token local window
        """
        super().__init__()
        # Side length of the pooled map, then the pooling geometry that
        # produces exactly that many outputs per side.
        pooled_side = int(ct_size * input_resolution / window_size)
        stride = int(input_resolution / pooled_side)
        kernel = input_resolution - (pooled_side - 1) * stride
        self.pos_embed = nn.Conv2d(dim, dim, 3, padding=1, groups=dim)
        pool = nn.AvgPool2d(kernel_size=kernel, stride=stride)
        # The same conv instance is registered as `pos_embed` and as the
        # "pos" stage of the sequential pipeline.
        self.to_global_feature = nn.Sequential()
        self.to_global_feature.add_module("pos", self.pos_embed)
        self.to_global_feature.add_module("pool", pool)
        self.window_size = ct_size

    def forward(self, x):
        """Pool ``x`` of shape (B, C, H, W) and return tokens of shape (-1, H'*W', C), H'/W' being the pooled size."""
        pooled = self.to_global_feature(x)
        B, C, H, W = pooled.shape
        ws = self.window_size
        grouped = pooled.view(B, C, H // ws, ws, W // ws, ws)
        return grouped.permute(0, 2, 4, 3, 5, 1).reshape(-1, H * W, C)
class FasterViTLayer(nn.Module):
    """
    One stage of the network: a stack of ``ConvBlock`` (conv stage) or ``HAT``
    (transformer stage) blocks followed by a ``Downsample`` module.

    Transformer stages partition the feature map into windows before the
    blocks and reverse the partition afterwards; when hierarchical attention
    is active, carrier tokens are created by a ``TokenInitializer``.
    """

    def __init__(self,
                 dim,
                 out_dim,
                 depth,
                 input_resolution,
                 num_heads,
                 window_size,
                 ct_size=1,
                 conv=False,
                 downsample=True,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 layer_scale=None,
                 layer_scale_conv=None,
                 only_local=False,
                 hierarchy=True,
                 do_propagation=False
                 ):
        """
        Args:
            dim: feature size dimension.
            out_dim: output feature size passed to the downsample module.
            depth: layer depth.
            input_resolution: input resolution.
            num_heads: number of attention head.
            window_size: window size.
            ct_size: spatial dimension of carrier token local window.
            conv: conv_based stage flag.
            downsample: when true the downsample module uses stride 2, otherwise stride 1.
            mlp_ratio: MLP ratio.
            qkv_bias: bool argument for query, key, value learnable bias.
            qk_scale: optional override for the attention logit scale.
            drop: dropout rate.
            attn_drop: attention dropout rate.
            drop_path: drop path rate, either one value or a list with one rate per block.
            layer_scale: layer scale coefficient.
            layer_scale_conv: conv layer scale coefficient.
            only_local: local attention flag.
            hierarchy: hierarchical attention flag.
            do_propagation: enable carrier token propagation.
        """
        super().__init__()
        self.conv = conv

        def _rate(index):
            # Per-block stochastic-depth rate.
            return drop_path[index] if isinstance(drop_path, list) else drop_path

        if conv:
            stage_blocks = [
                ConvBlock(dim=dim,
                          drop_path=_rate(i),
                          layer_scale=layer_scale_conv)
                for i in range(depth)
            ]
        else:
            sr_ratio = 1 if only_local else input_resolution // window_size
            stage_blocks = [
                HAT(dim=dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop,
                    attn_drop=attn_drop,
                    drop_path=_rate(i),
                    sr_ratio=sr_ratio,
                    window_size=window_size,
                    last=(i == depth - 1),
                    layer_scale=layer_scale,
                    ct_size=ct_size,
                    do_propagation=do_propagation,
                    )
                for i in range(depth)
            ]
        self.blocks = nn.ModuleList(stage_blocks)
        self.transformer_block = not conv
        self.downsample = Downsample(dim=dim, out_dim=out_dim, stride=2 if downsample else 1)
        # Carrier tokens are only needed by non-empty, hierarchical,
        # non-local transformer stages spanning more than one window.
        needs_tokenizer = (len(self.blocks) and not only_local
                           and input_resolution // window_size > 1
                           and hierarchy and not self.conv)
        self.do_gt = bool(needs_tokenizer)
        if self.do_gt:
            self.global_tokenizer = TokenInitializer(dim,
                                                     input_resolution,
                                                     window_size,
                                                     ct_size=ct_size)
        self.window_size = window_size

    def forward(self, x):
        """Run the stage on ``x`` of shape (B, C, H, W) and return the downsample module's output."""
        ct = self.global_tokenizer(x) if self.do_gt else None
        B, C, H, W = x.shape
        windowed = self.transformer_block
        if windowed:
            x = window_partition(x, self.window_size)
        for blk in self.blocks:
            x, ct = blk(x, ct)
        if windowed:
            x = window_reverse(x, self.window_size, H, W, B)
        return x if self.downsample is None else self.downsample(x)

sam_extension/distillation_models/sam.py ADDED Viewed

	@@ -0,0 +1,369 @@

+import os
+import functools
+import torch
+from torch import nn
+import torch.nn.functional as F
+from huggingface_hub import hf_hub_download
+from typing import Optional, List, Union, Tuple, Type
+from segment_anything import build_sam
+from segment_anything.mobile_encoder.tiny_vit_sam import TinyViT
+from segment_anything.modeling import PromptEncoder, MaskDecoder, TwoWayTransformer
+from segment_anything.modeling.image_encoder import ImageEncoderViT, LayerNorm2d, PatchEmbed, Block, Attention
+from segment_anything.mobile_encoder.setup_mobile_sam import load_mobile_sam
+from segment_anything.modeling.sam import Sam
+from sam_extension.distillation_models.fastertinyvit import FasterTinyViT
+from sam_extension.distillation_models.dino import DINO
+# from sam_extension.distillation_models.flashvision_transformer import FlashVisionTransformer
# Hugging Face Hub repository id used for checkpoint downloads in this module.
SAM_REPO_ID = 'YouLiXiya/YL-SAM'
# `hf_hub_download` pre-bound to that repository; callers pass `filename`
# and `local_dir`.
hf_sam_download = functools.partial(
    hf_hub_download,
    repo_id=SAM_REPO_ID,
    local_dir_use_symlinks=True,
)
class SAMImageEncoder(nn.Module):
    """Wrapper exposing only the image encoder of a model built with ``build_sam``."""

    def __init__(self,
                 sam_checkpoint_path,
                 device='cuda'):
        """
        Args:
            sam_checkpoint_path: checkpoint path handed to ``build_sam``.
            device: device the full model is moved to before the encoder is extracted.
        """
        super().__init__()
        full_model = build_sam(sam_checkpoint_path).to(device)
        self.image_encoder = full_model.image_encoder
        # Drop the local reference to the full model, then release cached CUDA memory.
        del full_model
        torch.cuda.empty_cache()

    def forward(self, x):
        """Return ``self.image_encoder(x)``."""
        return self.image_encoder(x)
class MobileSAMImageEncoder(nn.Module):
    """Wrapper exposing only the image encoder of a model loaded with ``load_mobile_sam``."""

    def __init__(self,
                 sam_checkpoint_path,
                 device='cuda'):
        """
        Args:
            sam_checkpoint_path: checkpoint path handed to ``load_mobile_sam``.
            device: device argument handed to ``load_mobile_sam``.
        """
        super().__init__()
        full_model = load_mobile_sam(sam_checkpoint_path, device)
        self.image_encoder = full_model.image_encoder
        # Drop the local reference to the full model, then release cached CUDA memory.
        del full_model
        torch.cuda.empty_cache()

    def forward(self, x):
        """Return ``self.image_encoder(x)``."""
        return self.image_encoder(x)
class SAMEncoderViT(nn.Module):
    """
    ViT image encoder assembled from the ``segment_anything`` building blocks
    (``PatchEmbed``, ``Block``, ``LayerNorm2d``) with an optional multi-scale
    neck input.
    """

    def __init__(
        self,
        img_size: int = 1024,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        out_chans: int = 256,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_abs_pos: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        global_attn_indexes: Tuple[int, ...] = (),
        multi_scale: bool = False,
        output_shape: Union[Tuple, List] = None
    ) -> None:
        """
        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            out_chans (int): Number of channels produced by the neck.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
            multi_scale (bool): If True and ``output_shape`` is given, the output of
                every block is resized to ``output_shape`` and all of them are
                concatenated along channels before the neck.
            output_shape (tuple | list): Spatial size used by the multi-scale path.
        """
        super().__init__()
        grid = img_size // patch_size
        self.img_size = img_size
        self.multi_scale = multi_scale
        self.output_shape = tuple(output_shape) if output_shape else None

        patch = (patch_size, patch_size)
        self.patch_embed = PatchEmbed(
            kernel_size=patch,
            stride=patch,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        self.pos_embed: Optional[nn.Parameter] = None
        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            self.pos_embed = nn.Parameter(torch.zeros(1, grid, grid, embed_dim))

        # Blocks listed in `global_attn_indexes` get window size 0.
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=0 if i in global_attn_indexes else window_size,
                input_size=(grid, grid),
            )
            for i in range(depth)
        ])

        # The multi-scale path feeds the neck one feature map per block.
        if self.multi_scale and self.output_shape:
            neck_in_chans = embed_dim * depth
        else:
            neck_in_chans = embed_dim
        self.neck = nn.Sequential(
            nn.Conv2d(neck_in_chans, out_chans, kernel_size=1, bias=False),
            LayerNorm2d(out_chans),
            nn.Conv2d(out_chans, out_chans, kernel_size=3, padding=1, bias=False),
            LayerNorm2d(out_chans),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed ``x``, run every block and return the neck output."""
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            x = x + self.pos_embed

        if not (self.multi_scale and self.output_shape):
            for blk in self.blocks:
                x = blk(x)
            return self.neck(x.permute(0, 3, 1, 2))

        per_block = []
        for blk in self.blocks:
            x = blk(x)
            channels_first = x.permute(0, 3, 1, 2)
            per_block.append(F.interpolate(channels_first, size=self.output_shape, mode='bilinear'))
        return self.neck(torch.cat(per_block, dim=1))
class SAMEncoderAdaptor(nn.Module):
    """
    Stack of ``segment_anything`` transformer ``Block`` modules plus a
    convolutional neck, applied to an already computed feature map.

    It mirrors an image encoder without the patch embedding: the positional
    embedding and the blocks operate directly on a channels-last feature map
    of spatial size ``input_size``.
    """

    def __init__(self,
                 img_size: int,
                 input_size: Optional[Tuple[int, int]],
                 embed_dim: int = 768,
                 depth: int = 12,
                 num_heads: int = 12,
                 mlp_ratio: float = 4.0,
                 out_chans: int = 256,
                 qkv_bias: bool = True,
                 norm_layer: Type[nn.Module] = nn.LayerNorm,
                 act_layer: Type[nn.Module] = nn.GELU,
                 use_abs_pos: bool = True,
                 use_rel_pos: bool = False,
                 rel_pos_zero_init: bool = True,
                 window_size: int = 0,
                 global_attn_indexes: Tuple[int, ...] = (),
                 multi_scale: bool = False,
                 output_shape: Union[Tuple, List] = None):
        """
        Args:
            img_size: stored on the instance as ``img_size``; not otherwise used here.
            input_size: (height, width) of the incoming feature map; sizes the
                positional embedding and is forwarded to every block.
            embed_dim: channel count of the incoming feature map.
            depth: number of transformer blocks.
            num_heads: number of attention heads in each block.
            mlp_ratio: ratio of mlp hidden dim to embedding dim.
            out_chans: number of channels produced by the neck.
            qkv_bias: if True, add a learnable bias to query, key, value.
            norm_layer: normalization layer.
            act_layer: activation layer.
            use_abs_pos: if True, add a learnable absolute positional embedding.
            use_rel_pos: if True, add relative positional embeddings to the attention map.
            rel_pos_zero_init: if True, zero initialize relative positional parameters.
            window_size: window size for window attention blocks.
            global_attn_indexes: indexes of blocks that get window size 0.
            multi_scale: if True and ``output_shape`` is given, the output of every
                block is resized to ``output_shape`` and all of them are
                concatenated along channels before the neck.
            output_shape: spatial size used by the multi-scale path.
        """
        super(SAMEncoderAdaptor, self).__init__()
        self.img_size = img_size
        self.multi_scale = multi_scale
        self.output_shape = tuple(output_shape) if output_shape else None
        self.pos_embed: Optional[nn.Parameter] = None
        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            self.pos_embed = nn.Parameter(
                torch.zeros(1, input_size[0], input_size[1], embed_dim)
            )
        self.blocks = nn.ModuleList()
        for i in range(depth):
            block = Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=window_size if i not in global_attn_indexes else 0,
                input_size=input_size,
            )
            self.blocks.append(block)
        self.neck = nn.Sequential(
            nn.Conv2d(
                embed_dim * depth if self.multi_scale and self.output_shape else embed_dim,
                out_chans,
                kernel_size=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
            nn.Conv2d(
                out_chans,
                out_chans,
                kernel_size=3,
                padding=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
        )

    def _fit_to_original_sizes(self, x: torch.Tensor, original_size) -> torch.Tensor:
        """Resize each (C, H, W) sample to its original aspect ratio, zero-pad to (H, W), return channels-last."""
        sizes = torch.LongTensor(original_size)
        if sizes.ndim == 1:
            sizes = sizes[None, ...]
        num_sizes = sizes.shape[0]
        batch = x.shape[0]
        if batch > 1 and batch != num_sizes:
            raise ValueError(
                f'original_size describes {num_sizes} image(s) but x holds {batch} sample(s)'
            )
        grid_h, grid_w = x.shape[-2:]
        fitted = []
        for i in range(num_sizes):
            h, w = sizes[i]
            # The longer image side fills the grid; the shorter one is scaled
            # by the same factor.
            if h > w:
                new_h = grid_h
                new_w = int(w * new_h / h)
            else:
                new_w = grid_w
                new_h = int(h * new_w / w)
            # Use the i-th sample for the i-th size. The original code always
            # read sample 0, so every output row was built from the first
            # image; a single-sample batch is still reused for every size.
            sample = x[i if batch > 1 else 0].unsqueeze(0)
            sample = F.interpolate(sample, size=(new_h, new_w), mode='bilinear')
            # Pad on the right and bottom back to the full grid.
            sample = F.pad(sample, (0, grid_w - new_w, 0, grid_h - new_h))
            fitted.append(sample)
        return torch.cat(fitted, dim=0).permute(0, 2, 3, 1)

    def forward(self, x: torch.Tensor, original_size: Union[Tuple, List] = None) -> torch.Tensor:
        """
        Args:
            x: feature map. Channels-first (B, C, H, W) when ``original_size``
                is given, otherwise channels-last (B, H, W, C).
            original_size: optional (h, w) pair, or one pair per sample. When
                given, each sample is resized to that aspect ratio and
                zero-padded back to (H, W) before the blocks run.

        Returns:
            The neck output (channels-first).

        Raises:
            ValueError: if ``x`` holds several samples and ``original_size``
                describes a different number of images.
        """
        if original_size:
            x = self._fit_to_original_sizes(x, original_size)
        if self.pos_embed is not None:
            x = x + self.pos_embed
        if self.multi_scale and self.output_shape:
            output_list = []
            for blk in self.blocks:
                x = blk(x)
                output_list.append(F.interpolate(x.permute(0, 3, 1, 2), size=self.output_shape, mode='bilinear'))
            x = self.neck(torch.cat(output_list, dim=1))
        else:
            for blk in self.blocks:
                x = blk(x)
            x = self.neck(x.permute(0, 3, 1, 2))
        return x
class DINOSAMViT(nn.Module):
    """
    Image encoder that extracts features with ``DINO`` and feeds them through
    a ``SAMEncoderAdaptor``.
    """

    def __init__(self,
                 dino_model_type,
                 device='cuda',
                 pca_dim=None,
                 **kwargs
                 ):
        """
        Args:
            dino_model_type: model identifier passed to ``DINO``.
            device: device passed to ``DINO`` and used for the adaptor.
            pca_dim: passed to ``DINO``; any falsy value is treated as ``None``.
                When set it also becomes the adaptor's ``embed_dim``, otherwise
                ``self.dino.model.embed_dim`` is used.
            **kwargs: ``SAMEncoderAdaptor`` arguments. ``img_size`` and
                ``output_shape`` are required; ``input_size`` and ``embed_dim``
                are overwritten here.
        """
        super().__init__()
        self.img_size = kwargs['img_size']
        pca_dim = pca_dim or None
        self.dino = DINO(dino_model_type, device, self.img_size, pca_dim)
        # The adaptor works at the resolution given by `output_shape`.
        self.input_size = tuple(kwargs['output_shape'])
        embed_dim = self.dino.model.embed_dim if pca_dim is None else pca_dim
        adaptor_kwargs = dict(kwargs, input_size=self.input_size, embed_dim=embed_dim)
        self.adaptor = SAMEncoderAdaptor(**adaptor_kwargs).to(device)

    def extract_dino_features(self, x, transform=False, size = None):
        """Return ``self.dino.extract_features(x, transform, size)``."""
        return self.dino.extract_features(x, transform, size)

    def forward(self, x, transform=False, size = None):
        """Extract DINO features, normalize along dim 3, resize to ``input_size`` and run the adaptor."""
        features = F.normalize(self.extract_dino_features(x, transform, size), dim=3)
        # Interpolation needs channels-first; the adaptor takes channels-last.
        resized = F.interpolate(features.permute(0, 3, 1, 2), size=self.input_size, mode='bilinear')
        return self.adaptor(resized.permute(0, 2, 3, 1))
def setup_model(model_config):
    """
    Build a ``Sam`` model around the encoder described by ``model_config``.

    Args:
        model_config: mapping with a ``'type'`` entry naming the model class to
            instantiate; every other entry is passed to that class as a keyword
            argument. The mapping is not modified.

    Returns:
        A ``Sam`` instance. If the configured model is a ``SAMEncoderAdaptor``
        it is passed as ``adaptor`` and the image encoder comes from the
        MobileSAM checkpoint ``weights/sam/mobile_sam.pt``; otherwise the
        configured model itself is the image encoder and ``adaptor`` is None.
    """
    prompt_embed_dim = 256
    image_size = 1024
    vit_patch_size = 16
    image_embedding_size = image_size // vit_patch_size

    # Work on a copy so the caller's config keeps its 'type' entry.
    model_kwargs = dict(model_config)
    model_type = model_kwargs.pop('type')
    # SECURITY NOTE(review): `eval` executes whatever string the config holds.
    # The config is read from checkpoint files, so only load checkpoints from
    # trusted sources; consider an explicit name -> class table instead.
    model = eval(model_type)(**model_kwargs)

    if model.__class__.__name__ == 'SAMEncoderAdaptor':
        adaptor = model
        image_encoder = load_sam('weights/sam/mobile_sam.pt', 'mobile_sam', 'cpu').image_encoder
    else:
        adaptor = None
        image_encoder = model

    sam = Sam(
        image_encoder=image_encoder,
        prompt_encoder=PromptEncoder(
            embed_dim=prompt_embed_dim,
            image_embedding_size=(image_embedding_size, image_embedding_size),
            input_image_size=(image_size, image_size),
            mask_in_chans=16,
        ),
        mask_decoder=MaskDecoder(
            num_multimask_outputs=3,
            transformer=TwoWayTransformer(
                depth=2,
                embedding_dim=prompt_embed_dim,
                mlp_dim=2048,
                num_heads=8,
            ),
            transformer_dim=prompt_embed_dim,
            iou_head_depth=3,
            iou_head_hidden_dim=256,
        ),
        adaptor=adaptor,
        pixel_mean=[123.675, 116.28, 103.53],
        pixel_std=[58.395, 57.12, 57.375],
    )
    return sam
def load_distillation_sam(distillation_sam_ckpt_path,
                          device='cuda'):
    """
    Load a distilled SAM model from a checkpoint.

    Args:
        distillation_sam_ckpt_path: checkpoint file holding a ``'model_config'``
            entry (passed to ``setup_model``) and a ``'model'`` state dict.
        device: device the finished model is moved to.

    Returns:
        The model with the checkpoint weights loaded, on ``device``.
    """
    # Deserialize onto the CPU so a checkpoint saved from a CUDA process also
    # loads on hosts without a GPU; the model is moved to `device` at the end.
    # NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
    ckpt = torch.load(distillation_sam_ckpt_path, map_location='cpu')
    sam = setup_model(ckpt['model_config'])
    sam.load_state_dict(ckpt['model'])
    return sam.to(device)
def load_sam(sam_ckpt_path, sam_version, device):
    """
    Load a SAM variant, downloading the checkpoint first if it is missing.

    Args:
        sam_ckpt_path: local checkpoint path. If it does not exist, a file with
            the same base name is downloaded via ``hf_sam_download`` into the
            path's parent directory.
        sam_version: one of ``'sam'``, ``'mobile_sam'``, ``'distillation_sam'``.
        device: device for the loaded model.

    Returns:
        The loaded model.

    Raises:
        ValueError: if ``sam_version`` is not one of the supported values.
    """
    # Reject an unknown version up front, before any directory is created or
    # any download is started.
    if sam_version not in ('sam', 'mobile_sam', 'distillation_sam'):
        raise ValueError('sam version error, please give sam version in [sam, mobile_sam, distillation_sam]')
    if not os.path.exists(sam_ckpt_path):
        # A bare file name has an empty dirname; os.makedirs('') would raise,
        # so fall back to the current directory.
        parent_dir = os.path.dirname(sam_ckpt_path) or '.'
        os.makedirs(parent_dir, exist_ok=True)
        hf_sam_download(filename=os.path.basename(sam_ckpt_path), local_dir=parent_dir)
    if sam_version == 'sam':
        return build_sam(sam_ckpt_path).to(device)
    if sam_version == 'mobile_sam':
        return load_mobile_sam(sam_ckpt_path, device)
    return load_distillation_sam(sam_ckpt_path, device)
if __name__ == '__main__':
    # Smoke test: build a small encoder on the GPU, push one random image
    # through it and print the output shape and the parameter count.
    from distillation.utils import get_parameter_number
    encoder_kwargs = dict(
        depth=3,
        embed_dim=256,
        img_size=512,
        mlp_ratio=4,
        num_heads=16,
        patch_size=8,
        qkv_bias=True,
        use_rel_pos=True,
        global_attn_indexes=[1],
        window_size=16,
        out_chans=256,
        multi_scale=False,
        output_shape='',
    )
    vit = SAMEncoderViT(**encoder_kwargs).cuda()
    x = torch.randn((1, 3, 512, 512)).cuda()
    print(vit(x).shape)
    print(get_parameter_number(vit))

sam_extension/pipeline/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .base import Pipeline
+from .sam import SAMEncoderPipeline, SAMDecoderPipeline
+from .owlvit import OwlViTVisionEncoderPipeline, OwlViTDecoderPipeline
+from .groundingdino import GroundingDinoPipeline

sam_extension/pipeline/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (421 Bytes). View file

sam_extension/pipeline/__pycache__/base.cpython-38.pyc ADDED Viewed

Binary file (1.14 kB). View file

sam_extension/pipeline/__pycache__/groundingdino.cpython-38.pyc ADDED Viewed

Binary file (3.28 kB). View file

sam_extension/pipeline/__pycache__/owlvit.cpython-38.pyc ADDED Viewed

Binary file (10.8 kB). View file

sam_extension/pipeline/__pycache__/sam.cpython-38.pyc ADDED Viewed

Binary file (19.6 kB). View file

sam_extension/pipeline/base.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import torch
+from torch import nn
+from typing import Union, Dict
+from dataclasses import dataclass
@dataclass(repr=True)
class Output:
    """Field-less dataclass; presumably the common base for pipeline result containers."""
class Pipeline(nn.Module):
    """
    Base class for pipelines.

    It stores the constructor arguments; ``from_pretrained`` and ``forward``
    are placeholders that return ``None``.
    """

    def __init__(self, *args, **kwargs):
        """Keep the positional and keyword arguments on ``self.args`` / ``self.kwargs``."""
        super().__init__()
        self.args = args
        self.kwargs = kwargs

    @classmethod
    def from_pretrained(cls, ckpt_path, device='cuda', *args, **kwargs):
        """Placeholder alternate constructor; does nothing and returns ``None``."""
        return None

    def forward(self, *args, **kwargs):
        """Placeholder forward pass; does nothing and returns ``None``."""
        return None

sam_extension/pipeline/groundingdino.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import os
+import functools
+import PIL
+from PIL.Image import Image
+import numpy as np
+from typing import List, Union
+import supervision as sv
+import torch
+import torchvision
+from huggingface_hub import hf_hub_download
+from sam_extension.pipeline import Pipeline
+from groundingdino.util.inference import Model
# Default GroundingDINO config file path and checkpoint file name.
GROUNDING_DINO_CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
GROUNDING_DINO_CHECKPOINT_PATH = "groundingdino_swint_ogc.pth"
# Hub repository id and local directory used for checkpoint downloads.
SAM_REPO_ID = 'YouLiXiya/YL-SAM'
LOCAL_DIR = "weights/groundingdino"
# `hf_hub_download` pre-bound to that repository and directory; callers
# pass only `filename`.
hf_sam_download = functools.partial(
    hf_hub_download,
    repo_id=SAM_REPO_ID,
    local_dir=LOCAL_DIR,
    local_dir_use_symlinks=True,
)
class GroundingDinoPipeline(Pipeline):
    """
    Pipeline around the GroundingDINO inference ``Model`` for detecting the
    classes in a given list, plus a helper that draws the detections.
    """

    def __init__(self,
                 grounding_dino_config_path,
                 grounfing_dino_ckpt_path,
                 grounding_dino_model,
                 device,
                 *args,
                 **kwargs):
        """
        Args:
            grounding_dino_config_path: path of the GroundingDINO config file.
            grounfing_dino_ckpt_path: path of the GroundingDINO checkpoint.
            grounding_dino_model: ready-to-use GroundingDINO ``Model`` instance.
            device: device the model was created on.
        """
        super().__init__(*args, **kwargs)
        self.grounding_dino_config_path = grounding_dino_config_path
        self.grounfing_dino_ckpt_path = grounfing_dino_ckpt_path
        self.grounding_dino_model = grounding_dino_model
        self.device = device

    @classmethod
    def from_pretrained(cls, grounding_dino_config_path, grounfing_dino_ckpt_path, device='cuda', *args, **kwargs):
        """
        Build the pipeline, downloading the checkpoint if the path is missing.

        A missing checkpoint is fetched through ``hf_sam_download`` by its base
        name, and the model is then loaded from the path the download returns.
        The original code kept loading from the missing path, which only
        worked when that path already pointed into the download directory.
        """
        if not os.path.exists(grounfing_dino_ckpt_path):
            grounfing_dino_ckpt_path = hf_sam_download(
                filename=os.path.basename(grounfing_dino_ckpt_path))
        grounding_dino_model = Model(model_config_path=grounding_dino_config_path,
                                     model_checkpoint_path=grounfing_dino_ckpt_path,
                                     device=device)
        return cls(grounding_dino_config_path,
                   grounfing_dino_ckpt_path,
                   grounding_dino_model,
                   device,
                   *args,
                   **kwargs)

    @staticmethod
    def _to_bgr_array(img):
        """Convert a PIL image to a uint8 array with reversed channel order; pass arrays through."""
        if isinstance(img, Image):
            return np.uint8(img)[:, :, ::-1]
        return img

    def visualize_results(self,
                          img: Union[Image, np.ndarray],
                          class_list: List[str],
                          box_threshold: float = 0.25,
                          text_threshold: float = 0.25,
                          nms_threshold: float = 0.8,
                          pil: bool = True):
        """
        Detect ``class_list`` in ``img``, apply NMS and draw the kept boxes.

        Args:
            img: PIL image, or array in the channel order the model expects.
            class_list: class names to detect; also used for the box labels.
            box_threshold: box confidence threshold passed to the model.
            text_threshold: text confidence threshold passed to the model.
            nms_threshold: IoU threshold for ``torchvision.ops.nms``.
            pil: if True return a PIL image (channel order reversed back),
                otherwise the annotated array.

        Returns:
            Tuple ``(annotated image, detections)``; ``detections`` holds only
            the boxes kept by NMS.
        """
        # Convert once up front so the annotator below always gets an array.
        # Previously a PIL input reached `annotate` and the `[:, :, ::-1]`
        # slice unconverted.
        img = self._to_bgr_array(img)
        detections = self.forward(img, class_list, box_threshold, text_threshold)
        keep = torchvision.ops.nms(
            torch.from_numpy(detections.xyxy),
            torch.from_numpy(detections.confidence),
            nms_threshold
        ).numpy().tolist()
        detections.xyxy = detections.xyxy[keep]
        detections.confidence = detections.confidence[keep]
        detections.class_id = detections.class_id[keep]
        # Read the two label fields directly, so the loop does not depend on
        # how many values `Detections.__iter__` yields per item.
        labels = [
            f"{class_list[class_id]} {confidence:0.2f}"
            for class_id, confidence
            in zip(detections.class_id, detections.confidence)]
        annotated_frame = sv.BoxAnnotator().annotate(
            scene=img.copy(), detections=detections, labels=labels)
        if pil:
            return PIL.Image.fromarray(annotated_frame[:, :, ::-1]), detections
        return annotated_frame, detections

    @torch.no_grad()
    def forward(self,
                img: Union[Image, np.ndarray],
                class_list: List[str],
                box_threshold: float = 0.25,
                text_threshold: float = 0.25
                ) -> sv.Detections:
        """
        Run ``predict_with_classes`` of the wrapped model.

        A PIL image is first converted to a uint8 array with its channel order
        reversed; arrays are passed through unchanged.
        """
        img = self._to_bgr_array(img)
        detections = self.grounding_dino_model.predict_with_classes(
            image=img,
            classes=class_list,
            box_threshold=box_threshold,
            text_threshold=text_threshold
        )
        return detections

sam_extension/pipeline/owlvit.py ADDED Viewed

	@@ -0,0 +1,372 @@

+from typing import Optional, Tuple, Union, List
+import numpy as np
+import PIL
+from PIL.Image import Image
+import supervision as sv
+import torch
+from torch import nn
+from transformers import OwlViTProcessor, OwlViTForObjectDetection, OwlViTVisionModel
+from transformers.models.owlvit.modeling_owlvit import center_to_corners_format, box_iou, generalized_box_iou, OwlViTObjectDetectionOutput
+from sam_extension.pipeline.base import Pipeline, Output
+class OwlViTVisionEncoderPipeline(Pipeline):
+    def __init__(self,
+                 vision_model,
+                 layer_norm,
+                 processor,
+                 device='cuda',
+                 *args,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.vision_model = vision_model
+        self.layer_norm = layer_norm
+        self.processor = processor
+        self.device = device
+        torch.cuda.empty_cache()
+    @classmethod
+    def from_pretrained(cls, model_type, device='cuda', *args, **kwargs):
+        owlvit_for_object_detection = OwlViTForObjectDetection.from_pretrained(model_type).to(device)
+        processor = OwlViTProcessor.from_pretrained(model_type)
+        return cls(owlvit_for_object_detection.owlvit.vision_model,
+                   owlvit_for_object_detection.layer_norm,
+                   processor,
+                   device,
+                   *args,
+                   **kwargs)
+    def process_image(self, image:Image):
+        image = self.processor(images=image, return_tensors="pt").pixel_values.to(self.device)
+        return image
+    @torch.no_grad()
+    def forward(
+        self,
+        pixel_values: Union[torch.FloatTensor, Image] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> torch.FloatTensor:
+        if isinstance(pixel_values, Image):
+            pixel_values = self.process_image(pixel_values)
+        pixel_values = pixel_values.to(self.device)
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Get image embeddings
+        last_hidden_state = vision_outputs[0]
+        image_embeds = self.vision_model.post_layernorm(last_hidden_state)
+        new_size = tuple(np.array(image_embeds.shape) - np.array((0, 1, 0)))
+        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], new_size)
+        # Merge image embedding with class tokens
+        image_embeds = image_embeds[:, 1:, :] * class_token_out
+        image_embeds = self.layer_norm(image_embeds)
+        # Resize to [batch_size, num_patches, num_patches, hidden_size]
+        new_size = (
+            image_embeds.shape[0],
+            int(np.sqrt(image_embeds.shape[1])),
+            int(np.sqrt(image_embeds.shape[1])),
+            image_embeds.shape[-1],
+        )
+        image_embeds = image_embeds.reshape(new_size)
+        return image_embeds
+class OwlViTDecoderPipeline(Pipeline):
+    prompt_template: str = 'a photo of a '
+    def __init__(self,
+                 owlvit_text,
+                 text_projection,
+                 class_head,
+                 box_head,
+                 processor,
+                 device='cuda',
+                 *args,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.owlvit_text = owlvit_text
+        self.text_projection = text_projection
+        self.class_head = class_head
+        self.box_head = box_head
+        self.sigmoid = nn.Sigmoid()
+        self.processor = processor
+        self.device = device
+        torch.cuda.empty_cache()
+    @classmethod
+    def from_pretrained(cls, model_type, device='cuda', *args, **kwargs):
+        owlvit_for_object_detection = OwlViTForObjectDetection.from_pretrained(model_type).to(device)
+        processor = OwlViTProcessor.from_pretrained(model_type)
+        return cls(owlvit_for_object_detection.owlvit.text_model,
+                   owlvit_for_object_detection.owlvit.text_projection,
+                   owlvit_for_object_detection.class_head,
+                   owlvit_for_object_detection.box_head,
+                   processor,
+                   device,
+                   *args,
+                   **kwargs)
+    def set_template(self, template: str):
+        self.prompt_template = template  # prefix that process_text prepends to each query; shadows the class-level default on this instance
+    def process_text(self, text:List, use_template:bool = True):
+        if use_template:
+            text = [[self.prompt_template+i for i in text[0]]]
+        inputs = self.processor(text=text, return_tensors="pt")
+        return inputs
+    def normalize_grid_corner_coordinates(self, feature_map: torch.FloatTensor):
+        # Computes normalized xy corner coordinates from feature_map.
+        if not feature_map.ndim == 4:
+            raise ValueError("Expected input shape is [batch_size, num_patches, num_patches, hidden_dim]")
+        device = feature_map.device
+        num_patches = feature_map.shape[1]
+        box_coordinates = np.stack(
+            np.meshgrid(np.arange(1, num_patches + 1), np.arange(1, num_patches + 1)), axis=-1
+        ).astype(np.float32)
+        box_coordinates /= np.array([num_patches, num_patches], np.float32)
+        # Flatten (h, w, 2) -> (h*w, 2)
+        box_coordinates = box_coordinates.reshape(
+            box_coordinates.shape[0] * box_coordinates.shape[1], box_coordinates.shape[2]
+        )
+        box_coordinates = torch.from_numpy(box_coordinates).to(device)
+        return box_coordinates
+    def compute_box_bias(self, feature_map: torch.FloatTensor) -> torch.FloatTensor:
+        # The box center is biased to its position on the feature grid
+        box_coordinates = self.normalize_grid_corner_coordinates(feature_map)
+        box_coordinates = torch.clip(box_coordinates, 0.0, 1.0)
+        # Unnormalize xy
+        box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4)
+        # The box size is biased to the patch size
+        box_size = torch.full_like(box_coord_bias, 1.0 / feature_map.shape[-2])
+        box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4)
+        # Compute box bias
+        box_bias = torch.cat([box_coord_bias, box_size_bias], dim=-1)
+        return box_bias
+    def box_predictor(
+        self,
+        image_feats: torch.FloatTensor,
+        feature_map: torch.FloatTensor,
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            image_feats:
+                Features extracted from the image, returned by the `image_text_embedder` method.
+            feature_map:
+                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
+        Returns:
+            pred_boxes:
+                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
+        """
+        # Bounding box detection head [batch_size, num_boxes, 4].
+        pred_boxes = self.box_head(image_feats)
+        # Compute the location of each token on the grid and use it to compute a bias for the bbox prediction
+        pred_boxes += self.compute_box_bias(feature_map)
+        pred_boxes = self.sigmoid(pred_boxes)
+        return pred_boxes
+    def class_predictor(
+        self,
+        image_feats: torch.FloatTensor,
+        query_embeds: Optional[torch.FloatTensor] = None,
+        query_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.FloatTensor]:
+        """
+        Args:
+            image_feats:
+                Features extracted from the `image_text_embedder`.
+            query_embeds:
+                Text query embeddings.
+            query_mask:
+                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
+        """
+        (pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask)
+        return (pred_logits, image_class_embeds)
+    def image_text_embedder(
+        self,
+        input_ids: torch.Tensor,
+        image_embeds: torch.FloatTensor,
+        attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> Tuple[torch.FloatTensor]:
+        # Encode text and image
+        text_outputs = self.owlvit_text(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+        text_embeds = text_outputs[1]
+        text_embeds = self.text_projection(text_embeds)
+        text_embeds = text_embeds / torch.linalg.norm(text_embeds, ord=2, dim=-1, keepdim=True)
+        return (text_embeds, image_embeds, text_outputs)
+    def embed_image_query(
+        self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        _, class_embeds = self.class_predictor(query_image_features)
+        pred_boxes = self.box_predictor(query_image_features, query_feature_map)
+        pred_boxes_as_corners = center_to_corners_format(pred_boxes)
+        # Loop over query images
+        best_class_embeds = []
+        best_box_indices = []
+        pred_boxes_device = pred_boxes_as_corners.device
+        for i in range(query_image_features.shape[0]):
+            each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device)
+            each_query_pred_boxes = pred_boxes_as_corners[i]
+            ious, _ = box_iou(each_query_box, each_query_pred_boxes)
+            # If there are no overlapping boxes, fall back to generalized IoU
+            if torch.all(ious[0] == 0.0):
+                ious = generalized_box_iou(each_query_box, each_query_pred_boxes)
+            # Use an adaptive threshold to include all boxes within 80% of the best IoU
+            iou_threshold = torch.max(ious) * 0.8
+            selected_inds = (ious[0] >= iou_threshold).nonzero()
+            if selected_inds.numel():
+                selected_embeddings = class_embeds[i][selected_inds[0]]
+                mean_embeds = torch.mean(class_embeds[i], axis=0)
+                mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings)
+                best_box_ind = selected_inds[torch.argmin(mean_sim)]
+                best_class_embeds.append(class_embeds[i][best_box_ind])
+                best_box_indices.append(best_box_ind)
+        if best_class_embeds:
+            query_embeds = torch.stack(best_class_embeds)
+            box_indices = torch.stack(best_box_indices)
+        else:
+            query_embeds, box_indices = None, None
+        return query_embeds, box_indices, pred_boxes
+    @torch.no_grad()
+    def forward(
+        self,
+        image_embeds: torch.FloatTensor,
+        input_ids: Optional[torch.Tensor] = None,
+        text: Optional[List] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> OwlViTObjectDetectionOutput:
+        if text is not None:
+            inputs = self.process_text(text)
+            input_ids = inputs.input_ids.to(self.device)
+            attention_mask = inputs.attention_mask.to(self.device)
+        input_ids = input_ids.to(self.device)
+        image_embeds = image_embeds.to(self.device)
+        attention_mask = attention_mask.to(self.device)
+        output_attentions = output_attentions if output_attentions is not None else False
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else False
+        )
+        return_dict = return_dict if return_dict is not None else True
+        # Embed images and text queries
+        query_embeds, feature_map, text_outputs = self.image_text_embedder(
+            input_ids=input_ids,
+            image_embeds=image_embeds,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        # Text and vision model outputs
+        batch_size, num_patches, num_patches, hidden_dim = feature_map.shape
+        image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim))
+        # Reshape from [batch_size * max_text_queries, hidden_dim] -> [batch_size, max_text_queries, hidden_dim]
+        max_text_queries = input_ids.shape[0] // batch_size
+        query_embeds = query_embeds.reshape(batch_size, max_text_queries, query_embeds.shape[-1])
+        # If first token is 0, then this is a padded query [batch_size, num_queries].
+        input_ids = input_ids.reshape(batch_size, max_text_queries, input_ids.shape[-1])
+        query_mask = input_ids[..., 0] > 0
+        # Predict object classes [batch_size, num_patches, num_queries+1]
+        (pred_logits, class_embeds) = self.class_predictor(image_feats, query_embeds, query_mask)
+        # Predict object boxes
+        pred_boxes = self.box_predictor(image_feats, feature_map)
+        if not return_dict:
+            output = (
+                pred_logits,
+                pred_boxes,
+                query_embeds,
+                feature_map,
+                class_embeds,
+                text_outputs.to_tuple(),
+                None,
+            )
+            output = tuple(x for x in output if x is not None)
+            return output
+        return OwlViTObjectDetectionOutput(
+            image_embeds=feature_map,
+            text_embeds=query_embeds,
+            pred_boxes=pred_boxes.cpu(),
+            logits=pred_logits.cpu(),
+            class_embeds=class_embeds,
+            text_model_output=text_outputs,
+            vision_model_output=None,
+        )
+    def owlvit_visualize(self,
+                         image: Image,
+                         texts: List,
+                         owlvit_objectdetection_output: OwlViTObjectDetectionOutput,
+                         score_threshold: float = 0.1,
+                         pil=True):
+        target_sizes = torch.Tensor([image.size[::-1]])
+        # Convert outputs (bounding boxes and class logits) to COCO API
+        results = self.processor.post_process(outputs=owlvit_objectdetection_output, target_sizes=target_sizes)
+        text = texts[0]
+        boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
+        boxes_np = []
+        labels_list = []
+        # Print detected objects and rescaled box coordinates
+        for box, score, label in zip(boxes, scores, labels):
+            box = [int(i) for i in box.tolist()]
+            if score >= score_threshold:
+                labels_list.append(f"{text[label]} {round(score.item(), 3)}")
+                boxes_np.append(box)
+                print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+        boxes_np = np.array(boxes_np)
+        detections = sv.Detections(xyxy=boxes_np)
+        image_np = np.uint8(image)[:, :, ::-1]
+        box_annotator = sv.BoxAnnotator()
+        annotated_frame = box_annotator.annotate(scene=image_np.copy(), detections=detections, labels=labels_list)
+        if pil:
+            return PIL.Image.fromarray(annotated_frame[:, :, ::-1])
+        else:
+            return annotated_frame[:, :, ::-1]

sam_extension/pipeline/sam.py ADDED Viewed

	@@ -0,0 +1,722 @@

+import functools
+from dataclasses import dataclass
+import PIL
+from PIL.Image import Image
+import numpy as np
+from typing import Union, Tuple, List, Optional, Callable
+from sklearn.decomposition import PCA
+import supervision as sv
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torchvision
+import torchvision.transforms as T
+from segment_anything.utils.transforms import ResizeLongestSide
+from segment_anything.predictor import preprocess, postprocess_masks
+from segment_anything import build_sam, load_mobile_sam
+from sam_extension.utils import add_prompts_tag, get_empty_detections, transform_coords
+from sam_extension.pipeline.base import Pipeline, Output
+from sam_extension.pipeline.groundingdino import GroundingDinoPipeline
+from sam_extension.distillation_models.sam import load_distillation_sam, load_sam
+from sam_extension.distillation_models import *
+ORIGINAL_SAM_IMG_SIZE: int = 1024  # side length used for ResizeLongestSide and as the default encoder input size below
+PIXEL_MEAN = torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1)  # three per-channel means, reshaped to (3, 1, 1); presumably on a 0-255 scale - confirm
+PIXEL_STD = torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1)  # three per-channel stds, reshaped to (3, 1, 1)
+PREPROCESS = functools.partial(preprocess, ORIGINAL_SAM_IMG_SIZE, PIXEL_MEAN, PIXEL_STD)  # binds the first three positional args of `preprocess`; the tensors are created on the default device
+POSTPROCESS_MASKS = functools.partial(postprocess_masks, ORIGINAL_SAM_IMG_SIZE)  # binds the first positional arg of `postprocess_masks`
+@dataclass(repr=True)
+class SAMEncoderOutput(Output):
+    features: torch.Tensor
+    interm_features: List[torch.Tensor]
+    original_size: Tuple
+    input_size: Tuple
+@dataclass(repr=True)
+class SAMEncoderProcesImgOutput(Output):
+    input_image: torch.Tensor
+    original_size: Tuple
+    input_size: Tuple
+@dataclass(repr=True)
+class SAMDecoderPredictOutput(Output):
+    masks_np: np.ndarray
+    iou_predictions_np: np.ndarray
+    low_res_masks_np: np.ndarray
+@dataclass(repr=True)
+class SAMDecoderPredictTorchOutput(Output):
+    masks: torch.Tensor
+    iou_predictions: torch.Tensor
+    low_res_masks: torch.Tensor
+class SAMEncoderPipeline(Pipeline):
+    def __init__(self,
+                 encoder: nn.Module,
+                 input_img_size: Tuple,
+                 multi_output: bool,
+                 preprocess: Callable,
+                 transform: ResizeLongestSide,
+                 device: str,
+                 *args,
+                 **kwargs):
+        super(SAMEncoderPipeline, self).__init__(*args, **kwargs)
+        self.encoder = encoder
+        self.input_img_size = input_img_size
+        self.multi_output = multi_output
+        self.preprocess = preprocess
+        self.transform = transform
+        self.device = device
+    @classmethod
+    def from_pretrained(cls, ckpt_path, device='cuda', *args, **kwargs):
+        if 'sam_version' not in kwargs.keys():
+            sam_version = 'sam'
+        else:
+            sam_version = kwargs['sam_version']
+        sam = load_sam(ckpt_path, sam_version, device)
+        encoder = sam.image_encoder
+        encoder_type = encoder.__class__.__name__
+        if encoder_type in ['TinyViT', 'FasterTinyViT', 'SAMEncoderViT', 'DINOSAMViT', 'FlashVisionTransformer']:
+            multi_output = False
+            if encoder_type in ['FasterTinyViT', 'SAMEncoderViT', 'DINOSAMViT', 'FlashVisionTransformer']:
+                input_img_size = (encoder.img_size, encoder.img_size)
+                if encoder_type == 'DINOSAMViT':
+                    encoder = encoder.dino
+            else:
+                input_img_size = (ORIGINAL_SAM_IMG_SIZE, ORIGINAL_SAM_IMG_SIZE)
+        else:
+            multi_output = True
+            input_img_size = (ORIGINAL_SAM_IMG_SIZE, ORIGINAL_SAM_IMG_SIZE)
+        if sam.adaptor is None:
+            transform = ResizeLongestSide(ORIGINAL_SAM_IMG_SIZE)
+            preprocess_ = functools.partial(preprocess, ORIGINAL_SAM_IMG_SIZE, PIXEL_MEAN.to(device), PIXEL_STD.to(device))
+        else:
+            transform = T.Compose([
+                T.Resize(input_img_size),
+                T.ToTensor(),
+                T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
+            ])
+            preprocess_ = None
+        pipeline = cls(encoder=encoder,
+                       input_img_size=input_img_size,
+                       multi_output=multi_output,
+                       preprocess=preprocess_,
+                       transform=transform,
+                       device=device)
+        del sam, encoder
+        torch.cuda.empty_cache()
+        return pipeline
+    def process_img(self, img: Union[Image, np.ndarray]) -> SAMEncoderProcesImgOutput:
+        if self.preprocess is not None:
+            if isinstance(img, Image):
+                img = np.uint8(img)
+            input_image = self.transform.apply_image(img)
+            input_image_torch = torch.as_tensor(input_image, device=self.device)
+            input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[None, :, :, :]
+            original_size = tuple(img.shape[:2])
+            input_size = tuple(input_image_torch.shape[-2:])
+            input_image = F.interpolate(self.preprocess(input_image_torch), size=self.input_img_size, mode='bilinear')
+        else:
+            if isinstance(img, np.ndarray):
+                img = PIL.Image.fromarray(img)
+            original_size = (img.size[1], img.size[0])
+            if original_size[0] > original_size[1]:
+                input_h = 1024
+                input_w = int((1024 / original_size[0]) * original_size[1])
+            else:
+                input_w = 1024
+                input_h = int((1024 / original_size[1]) * original_size[0])
+            input_size = (input_h, input_w)
+            input_image = self.transform(img)[None, ...].to(self.device)
+        return SAMEncoderProcesImgOutput(input_image, original_size, input_size)
+    @torch.no_grad()
+    def get_visual_feature(self, x: Union[torch.Tensor, Image, np.ndarray]=None, **kwargs):
+        pca_rgb = PCA(n_components=3)
+        if 'sam_feature' in kwargs.keys() and 'original_size' in kwargs.keys():
+            sam_feature = kwargs['sam_feature']
+            original_size = kwargs['original_size']
+        else:
+            assert x is not None, 'please give x type Union[torch.Tensor, Image, np.ndarray] !'
+            sam_encoder_output = self.forward(x, **kwargs)
+            sam_feature = sam_encoder_output.features
+            original_size = sam_encoder_output.original_size
+            assert original_size is not None, 'please give original_size!'
+        sam_feature = F.interpolate(sam_feature, size=original_size, mode='bilinear').permute(0, 2, 3, 1)
+        b, h, w, c = sam_feature.shape
+        sam_feature = sam_feature.view(-1, c).cpu().numpy()
+        sam_feature = pca_rgb.fit_transform(sam_feature)
+        sam_feature = torch.Tensor(sam_feature.reshape(h, w, 3))
+        min_f, _ = sam_feature.min(-1)
+        max_f, _ = sam_feature.max(-1)
+        sam_feature = (sam_feature - min_f[..., None]) / (max_f[..., None] - min_f[..., None])
+        sam_feature = sam_feature.cpu().numpy()
+        sam_feature_image = PIL.Image.fromarray((sam_feature * 255).astype(np.uint8))
+        return sam_feature_image
+    def forward(self, x: Union[torch.Tensor, Image, np.ndarray], **kwargs) -> SAMEncoderOutput:
+        if isinstance(x, (Image, np.ndarray)):
+            process_img_output = self.process_img(x)
+            x = process_img_output.input_image
+            original_size = process_img_output.original_size
+            input_size = process_img_output.input_size
+        else:
+            original_size = kwargs.pop('original_size') if 'original_size' in kwargs.keys() else None
+            input_size = x.shape[-2:]
+        with torch.no_grad():
+            if self.multi_output:
+                features, interm_features = self.encoder(x, **kwargs)
+            else:
+                features = self.encoder(x, **kwargs)
+                if self.encoder.__class__.__name__ == 'DINO':
+                    features = features.permute(0, 3, 1, 2)
+                interm_features = None
+        return SAMEncoderOutput(features, interm_features, original_size, input_size)
+class SAMDecoderPipeline(Pipeline):
+    def __init__(self,
+                 prompt_encoder: nn.Module,
+                 mask_decoder: nn.Module,
+                 adaptor: nn.Module,
+                 mask_threshold: float,
+                 transform: ResizeLongestSide,
+                 postprocess_masks: Callable,
+                 img_size: int,
+                 device: str,
+                 *args,
+                 **kwargs):
+        super(SAMDecoderPipeline, self).__init__(*args, **kwargs)
+        self.prompt_encoder = prompt_encoder
+        self.mask_decoder = mask_decoder
+        self.adaptor = adaptor
+        self.mask_threshold = mask_threshold
+        self.transform = transform
+        self.postprocess_masks = postprocess_masks
+        self.img_size = img_size
+        self.device = device
+    @classmethod
+    def from_pretrained(cls, ckpt_path, device='cuda', *args, **kwargs):
+        if 'sam_version' not in kwargs.keys():
+            sam_version = 'sam'
+        else:
+            sam_version = kwargs['sam_version']
+        sam = load_sam(ckpt_path, sam_version, device)
+        if sam.image_encoder.__class__.__name__ == 'DINOSAMViT':
+            adaptor = sam.image_encoder.adaptor
+        elif sam.adaptor is not None:
+            adaptor = sam.adaptor
+        else:
+            adaptor = None
+        img_size = sam.image_encoder.img_size
+        prompt_encoder = sam.prompt_encoder
+        mask_decoder = sam.mask_decoder
+        transform = ResizeLongestSide(ORIGINAL_SAM_IMG_SIZE)
+        pipeline = cls(prompt_encoder=prompt_encoder,
+                       mask_decoder=mask_decoder,
+                       adaptor=adaptor,
+                       mask_threshold=sam.mask_threshold,
+                       transform=transform,
+                       postprocess_masks=POSTPROCESS_MASKS,
+                       img_size=img_size,
+                       device=device)
+        del sam, prompt_encoder, mask_decoder
+        torch.cuda.empty_cache()
+        return pipeline
+    def visualize_prompt(self,
+                         img: Union[Image, np.ndarray],
+                         des_img: Union[Image, np.ndarray] = None,
+                         point_labels: Union[List[int], np.ndarray] = None,
+                         point_coords: Union[List[List[int]], np.ndarray] = None,
+                         boxes: Union[List[List[int]], np.ndarray] = None,
+                         pil: bool = False
+                         ) -> Union[Image, np.ndarray]:
+        if des_img is not None:
+            if isinstance(des_img, np.ndarray):
+                des_shape = tuple(des_img.shape[:2])
+            else:
+                des_shape = (des_img.size[1], des_img.size[0])
+            src_shape = (img.size[1], img.size[0])
+            point_coords, boxes = transform_coords(src_shape, des_shape, point_coords, boxes)
+            return add_prompts_tag(des_img, point_labels, point_coords, boxes, pil)
+        else:
+            return add_prompts_tag(img, point_labels, point_coords, boxes, pil)
+    def visualize_results(self,
+                          img: Union[Image, np.ndarray],
+                          des_img: Union[Image, np.ndarray] = None,
+                          sam_encoder_output: Optional[SAMEncoderOutput] = None,
+                          features: Optional[torch.Tensor] = None,
+                          interm_features: Optional[List[torch.Tensor]] = None,
+                          original_size: Optional[Tuple] = None,
+                          input_size: Optional[Tuple] = None,
+                          point_coords: Optional[np.ndarray] = None,
+                          point_labels: Optional[np.ndarray] = None,
+                          boxes: Optional[np.ndarray] = None,
+                          texts: Optional[List] = None,
+                          grounding_dino_pipeline: GroundingDinoPipeline = None,
+                          box_threshold: float = 0.25,
+                          text_threshold: float = 0.25,
+                          nms_threshold: float = 0.8,
+                          detections: Optional[sv.Detections] = None,
+                          multimask_output: bool = True,
+                          visualize_promts: bool = True,
+                          pil: bool = False):
+        if isinstance(img, Image):
+            img = np.uint8(img)
+        if des_img is not None:
+            if isinstance(des_img, np.ndarray):
+                des_shape = tuple(des_img.shape[:2])
+            else:
+                des_shape = (des_img.size[1], des_img.size[0])
+            src_shape = img.shape[:2]
+            if point_coords is not None or boxes is not None:
+                des_point_coords, des_boxes = transform_coords(src_shape, des_shape, point_coords, boxes)
+            else:
+                des_point_coords = None
+                des_boxes = None
+        else:
+            des_point_coords = None
+            des_boxes = None
+            src_shape = None
+            des_shape = None
+        detections = get_empty_detections() if detections is None else detections
+        mask_annotator = sv.MaskAnnotator()
+        result_list = []
+        mask_result_list = []
+        mask_list = []
+        if boxes is None and point_coords is None and point_labels is None and texts is None or \
+                (point_coords is not None and point_labels is not None and point_coords.shape[0] != point_labels.shape[0]):
+            print('no prompt given!')
+            result_list.append(img)
+            return result_list
+        # if boxes is not None and point_coords is not None and point_labels is not None:
+        #     multimask_output = False
+        def get_annotated_image(mask_annotator,
+                                detections,
+                                img,
+                                point_labels=None,
+                                point_coords=None,
+                                boxes=None,
+                                visualize_promts=True,
+                                pil=False):
+            annotated_image = mask_annotator.annotate(scene=img.copy(), detections=detections)
+            if visualize_promts:
+                annotated_image = add_prompts_tag(annotated_image, point_labels, point_coords, boxes=boxes, pil=pil)
+            else:
+                if pil:
+                    annotated_image = PIL.Image.fromarray(annotated_image)
+            return annotated_image
+        def get_masked_image(img,
+                             masks,
+                             pil=True):
+            masked_image_list = []
+            for i in range(masks.shape[0]):
+                object_rgb = img * (masks[i].reshape(img.shape[0], img.shape[1], 1))
+                object_rgb = object_rgb.astype(np.uint8)
+                bkgd_mask = np.where(object_rgb == 0, 1, 0)
+                bkgd_mask *= 255
+                bkgd_mask = bkgd_mask.astype(np.uint8)
+                object_rgb += bkgd_mask
+                if pil:
+                    masked_image_list.append(PIL.Image.fromarray(object_rgb))
+                else:
+                    masked_image_list.append(object_rgb)
+            return masked_image_list
+        def interpolate_mask(mask_np, des_shape):
+            mask_tensor = torch.tensor(mask_np, dtype=torch.float32).unsqueeze(0)
+            mask_interpolate = F.interpolate(mask_tensor, size=des_shape, mode='bilinear')
+            mask_interpolate = (mask_interpolate+0.5).long()
+            mask_np = mask_interpolate.squeeze(0).numpy().astype(bool)
+            return mask_np
+        if point_coords is not None and point_labels is not None:
+            if src_shape is not None:
+                point_result = self.forward(sam_encoder_output,
+                                            features,
+                                            interm_features,
+                                            original_size,
+                                            input_size,
+                                            des_point_coords,
+                                            point_labels)
+                masks_np = interpolate_mask(point_result.masks_np, src_shape)
+            else:
+                point_result = self.forward(sam_encoder_output,
+                                            features,
+                                            interm_features,
+                                            original_size,
+                                            input_size,
+                                            point_coords,
+                                            point_labels)
+                masks_np = point_result.masks_np
+            if multimask_output:
+                for i in range(masks_np.shape[0]):
+                    detections.mask = masks_np[i][None, ...]
+                    mask_list.append(masks_np[i])
+                    result_list.append(get_annotated_image(mask_annotator,
+                                                           detections,
+                                                           img,
+                                                           point_labels=point_labels,
+                                                           point_coords=point_coords,
+                                                           visualize_promts=visualize_promts,
+                                                           pil=pil))
+                    mask_result_list += get_masked_image(img,
+                                                         detections.mask,
+                                                         pil=pil)
+            else:
+                index = np.argmax(point_result.iou_predictions_np)
+                detections.mask = masks_np[index][None, ...]
+                mask_list.append(masks_np[index])
+                result_list.append(get_annotated_image(mask_annotator,
+                                                       detections,
+                                                       img,
+                                                       point_labels=point_labels,
+                                                       point_coords=point_coords,
+                                                       visualize_promts=visualize_promts,
+                                                       pil=pil))
+                mask_result_list += get_masked_image(img,
+                                                     detections.mask,
+                                                     pil=pil)
+        if boxes is not None:
+            result_masks = []
+            if src_shape is not None:
+                boxes_ = des_boxes
+            else:
+                boxes_ = boxes
+            if boxes_.shape[0] > 1:
+                for i in range(len(boxes)):
+                    box_result = self.forward(sam_encoder_output,
+                                            features,
+                                            interm_features,
+                                            original_size,
+                                            input_size,
+                                            box=boxes_[i])
+                    index = np.argmax(box_result.iou_predictions_np)
+                    result_masks.append(box_result.masks_np[index])
+                mask = np.array(result_masks)
+                if src_shape is not None:
+                    masks_np = interpolate_mask(mask, src_shape)
+                else:
+                    masks_np = mask
+                mask_list.append(masks_np)
+                detections.mask = masks_np
+                result_list.append(get_annotated_image(mask_annotator,
+                                                       detections,
+                                                       img,
+                                                       boxes=boxes,
+                                                       visualize_promts=visualize_promts,
+                                                       pil=pil))
+                mask_result_list += get_masked_image(img,
+                                                     detections.mask,
+                                                     pil=pil)
+            else:
+                box_result = self.forward(sam_encoder_output,
+                                          features,
+                                          interm_features,
+                                          original_size,
+                                          input_size,
+                                          box=boxes_)
+                if src_shape is not None:
+                    masks_np = interpolate_mask(box_result.masks_np, src_shape)
+                else:
+                    masks_np = box_result.masks_np
+                if multimask_output:
+                    for i in range(masks_np.shape[0]):
+                        detections.mask = masks_np[i][None, ...]
+                        mask_list.append(masks_np[i])
+                        result_list.append(get_annotated_image(mask_annotator,
+                                                               detections,
+                                                               img,
+                                                               boxes=boxes,
+                                                               visualize_promts=visualize_promts,
+                                                               pil=pil))
+                        mask_result_list += get_masked_image(img,
+                                                             detections.mask,
+                                                             pil=pil)
+                else:
+                    index = np.argmax(box_result.iou_predictions_np)
+                    detections.mask = masks_np[index][None, ...]
+                    mask_list.append(masks_np[index])
+                    result_list.append(get_annotated_image(mask_annotator, detections, img, boxes=boxes, pil=pil))
+                    mask_result_list += get_masked_image(img,
+                                                         detections.mask,
+                                                         pil=pil)
+        if texts is not None and grounding_dino_pipeline is not None:
+            detections = grounding_dino_pipeline(img[:, :, ::-1], texts, box_threshold, text_threshold)
+            box_annotator = sv.BoxAnnotator()
+            nms_idx = torchvision.ops.nms(
+                torch.from_numpy(detections.xyxy),
+                torch.from_numpy(detections.confidence),
+                nms_threshold
+            ).numpy().tolist()
+            detections.xyxy = detections.xyxy[nms_idx]
+            detections.confidence = detections.confidence[nms_idx]
+            detections.class_id = detections.class_id[nms_idx]
+            labels = [
+                f"{texts[class_id]} {confidence:0.2f}"
+                for _, _, confidence, class_id, _
+                in detections]
+            result_masks = []
+            if src_shape is not None:
+                _, boxes_ = transform_coords(src_shape, des_shape, boxes=detections.xyxy)
+            else:
+                boxes_ = detections.xyxy
+            for box in boxes_:
+                box_result = self.forward(sam_encoder_output,
+                                          features,
+                                          interm_features,
+                                          original_size,
+                                          input_size,
+                                          box=box)
+                index = np.argmax(box_result.iou_predictions_np)
+                result_masks.append(box_result.masks_np[index])
+            mask = np.array(result_masks)
+            if src_shape is not None:
+                detections.mask = interpolate_mask(mask, src_shape)
+            else:
+                detections.mask = mask
+            for i in range(detections.mask.shape[0]):
+                mask_list.append(detections.mask[i, ...])
+            if visualize_promts:
+                annotated_image = mask_annotator.annotate(scene=img[:, :, ::-1].copy(), detections=detections)
+                annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
+            else:
+                annotated_image = mask_annotator.annotate(scene=img[:, :, ::-1].copy(), detections=detections)
+            if pil:
+                result_list.append(PIL.Image.fromarray(annotated_image[:, :, ::-1]))
+            else:
+                result_list.append(annotated_image[:, :, ::-1])
+            mask_result_list += get_masked_image(img,
+                                                 detections.mask,
+                                                 pil=pil)
+        return result_list, mask_result_list, mask_list
+    def predict(
+            self,
+            features: torch.Tensor,
+            interm_features: List[torch.Tensor],
+            original_size: Tuple,
+            input_size: Tuple,
+            point_coords: Optional[np.ndarray] = None,
+            point_labels: Optional[np.ndarray] = None,
+            box: Optional[np.ndarray] = None,
+            mask_input: Optional[np.ndarray] = None,
+            multimask_output: bool = True,
+            return_logits: bool = False,
+            hq_token_only: bool = False,
+    ) -> SAMDecoderPredictOutput:
+        """
+        Predict masks for numpy prompts against precomputed image features.
+
+        Point and box prompts are mapped into the model input frame with
+        ``self.transform``, every prompt is turned into a torch tensor on
+        ``self.device`` with a leading batch axis, and the call is delegated to
+        ``predict_torch``. The first batch element of each result is returned
+        as a numpy array.
+        Arguments:
+          features (torch.Tensor): Image features, forwarded unchanged to
+            ``predict_torch``.
+          interm_features (List[torch.Tensor]): Intermediate features,
+            forwarded unchanged to ``predict_torch``.
+          original_size (Tuple): Size of the original image; used to rescale
+            point and box prompts and forwarded to ``predict_torch``.
+            Presumably (H, W) -- confirm against the encoder pipeline.
+          input_size (Tuple): Size of the model input, forwarded to
+            ``predict_torch``.
+          point_coords (np.ndarray or None): A Nx2 array of point prompts to the
+            model. Each point is in (X,Y) in pixels.
+          point_labels (np.ndarray or None): A length N array of labels for the
+            point prompts. 1 indicates a foreground point and 0 indicates a
+            background point. Required whenever point_coords is given.
+          box (np.ndarray or None): A length 4 array given a box prompt to the
+            model, in XYXY format.
+          mask_input (np.ndarray): A low resolution mask input to the model, typically
+            coming from a previous prediction iteration. Has form 1xHxW, where
+            for SAM, H=W=256.
+          multimask_output (bool): If true, the model will return three masks.
+            For ambiguous input prompts (such as a single click), this will often
+            produce better masks than a single prediction. If only a single
+            mask is needed, the model's predicted quality score can be used
+            to select the best mask. For non-ambiguous prompts, such as multiple
+            input prompts, multimask_output=False can give better results.
+          return_logits (bool): If true, returns un-thresholded masks logits
+            instead of a binary mask.
+          hq_token_only (bool): Forwarded unchanged to ``predict_torch``.
+        Returns:
+          SAMDecoderPredictOutput built, in this order, from:
+          (np.ndarray): The output masks in CxHxW format, where C is the
+            number of masks, and (H, W) is the original image size.
+          (np.ndarray): An array of length C containing the model's
+            predictions for the quality of each mask.
+          (np.ndarray): An array of shape CxHxW, where C is the number
+            of masks and H=W=256. These low resolution logits can be passed to
+            a subsequent iteration as mask input.
+        Raises:
+          AssertionError: if point_coords is given without point_labels.
+        """
+        # Transform input prompts
+        coords_torch, labels_torch, box_torch, mask_input_torch = None, None, None, None
+        if point_coords is not None:
+            assert (
+                    point_labels is not None
+            ), "point_labels must be supplied if point_coords is supplied."
+            point_coords = self.transform.apply_coords(point_coords, original_size)
+            coords_torch = torch.as_tensor(point_coords, dtype=torch.float, device=self.device)
+            labels_torch = torch.as_tensor(point_labels, dtype=torch.int, device=self.device)
+            # Add the batch axis expected by predict_torch.
+            coords_torch, labels_torch = coords_torch[None, :, :], labels_torch[None, :]
+        if box is not None:
+            box = self.transform.apply_boxes(box, original_size)
+            box_torch = torch.as_tensor(box, dtype=torch.float, device=self.device)
+            box_torch = box_torch[None, :]
+        if mask_input is not None:
+            mask_input_torch = torch.as_tensor(mask_input, dtype=torch.float, device=self.device)
+            mask_input_torch = mask_input_torch[None, :, :, :]
+        sam_decoder_predict_torch_output = self.predict_torch(
+            features,
+            interm_features,
+            original_size,
+            input_size,
+            coords_torch,
+            labels_torch,
+            box_torch,
+            mask_input_torch,
+            multimask_output,
+            return_logits=return_logits,
+            hq_token_only=hq_token_only,
+        )
+        # Drop the batch axis again and move the results to host memory.
+        masks_np = sam_decoder_predict_torch_output.masks[0].detach().cpu().numpy()
+        iou_predictions_np = sam_decoder_predict_torch_output.iou_predictions[0].detach().cpu().numpy()
+        low_res_masks_np = sam_decoder_predict_torch_output.low_res_masks[0].detach().cpu().numpy()
+        return SAMDecoderPredictOutput(masks_np, iou_predictions_np, low_res_masks_np)
+    @torch.no_grad()
+    def predict_torch(
+            self,
+            features: torch.Tensor,
+            interm_features: List[torch.Tensor],
+            original_size: Tuple,
+            input_size: Tuple,
+            point_coords: Optional[torch.Tensor],
+            point_labels: Optional[torch.Tensor],
+            boxes: Optional[torch.Tensor] = None,
+            mask_input: Optional[torch.Tensor] = None,
+            multimask_output: bool = True,
+            return_logits: bool = False,
+            hq_token_only: bool = False,
+    ) -> SAMDecoderPredictTorchOutput:
+        """
+        Predict masks for batched torch prompts against precomputed image features.
+        Input prompts are batched torch tensors and are expected to already be
+        transformed to the input frame using ResizeLongestSide.
+        Runs with gradient tracking disabled (``torch.no_grad``).
+        Arguments:
+          features (torch.Tensor): Passed to the mask decoder as
+            ``image_embeddings``.
+          interm_features (List[torch.Tensor]): Passed to the mask decoder as
+            ``interm_embeddings``.
+          original_size (Tuple): Passed to ``self.postprocess_masks`` as the
+            target size of the upscaled masks.
+          input_size (Tuple): Passed to ``self.postprocess_masks`` together
+            with ``original_size``.
+          point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
+            model. Each point is in (X,Y) in pixels.
+          point_labels (torch.Tensor or None): A BxN array of labels for the
+            point prompts. 1 indicates a foreground point and 0 indicates a
+            background point.
+          boxes (torch.Tensor or None): A Bx4 array given a box prompt to the
+            model, in XYXY format.
+          mask_input (torch.Tensor or None): A low resolution mask input to the model, typically
+            coming from a previous prediction iteration. Has form Bx1xHxW, where
+            for SAM, H=W=256. Masks returned by a previous iteration of the
+            predict method do not need further transformation.
+          multimask_output (bool): If true, the model will return three masks.
+            For ambiguous input prompts (such as a single click), this will often
+            produce better masks than a single prediction. If only a single
+            mask is needed, the model's predicted quality score can be used
+            to select the best mask. For non-ambiguous prompts, such as multiple
+            input prompts, multimask_output=False can give better results.
+          return_logits (bool): If true, returns un-thresholded masks logits
+            instead of a binary mask.
+          hq_token_only (bool): Forwarded unchanged to the mask decoder.
+        Returns:
+          SAMDecoderPredictTorchOutput built, in this order, from:
+          (torch.Tensor): The output masks in BxCxHxW format, where C is the
+            number of masks, and (H, W) is the original image size.
+          (torch.Tensor): An array of shape BxC containing the model's
+            predictions for the quality of each mask.
+          (torch.Tensor): An array of shape BxCxHxW, where C is the number
+            of masks and H=W=256. These low res logits can be passed to
+            a subsequent iteration as mask input.
+        """
+        if point_coords is not None:
+            points = (point_coords, point_labels)
+        else:
+            points = None
+        # Embed prompts
+        sparse_embeddings, dense_embeddings = self.prompt_encoder(
+            points=points,
+            boxes=boxes,
+            masks=mask_input,
+        )
+        # Predict masks
+        low_res_masks, iou_predictions = self.mask_decoder(
+            image_embeddings=features,
+            image_pe=self.prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embeddings,
+            dense_prompt_embeddings=dense_embeddings,
+            multimask_output=multimask_output,
+            hq_token_only=hq_token_only,
+            interm_embeddings=interm_features,
+        )
+        # Upscale the masks to the original image resolution
+        # masks = self.model.postprocess_masks(low_res_masks, self.input_size, self.original_size)
+        masks = self.postprocess_masks(low_res_masks, input_size, original_size)
+        if not return_logits:
+            # Binarize the mask logits with the pipeline-wide threshold.
+            masks = masks > self.mask_threshold
+        return SAMDecoderPredictTorchOutput(masks, iou_predictions, low_res_masks)
+    def forward(self,
+                sam_encoder_output: Optional[SAMEncoderOutput]=None,
+                features: Optional[torch.Tensor]=None,
+                interm_features: Optional[List[torch.Tensor]]=None,
+                original_size: Optional[Tuple]=None,
+                input_size: Optional[Tuple]=None,
+                point_coords: Optional[np.ndarray] = None,
+                point_labels: Optional[np.ndarray] = None,
+                box: Optional[np.ndarray] = None,
+                mask_input: Optional[np.ndarray] = None,
+                multimask_output: bool = True,
+                return_logits: bool = False,
+                hq_token_only: bool = False,
+                dino: bool = False
+    ) -> SAMDecoderPredictOutput:
+        """
+        Decode masks from encoder results and numpy prompts.
+
+        The image information can be supplied either as ``sam_encoder_output``
+        or as the explicit ``features`` / ``interm_features`` /
+        ``original_size`` / ``input_size`` arguments. When
+        ``sam_encoder_output`` is truthy, its fields replace the explicit
+        arguments. If ``self.adaptor`` is not None the features are passed
+        through it before decoding; with ``dino=True`` they are first
+        normalized along dim 1, resized bilinearly to 64x64 and permuted to
+        channels-last.
+        All prompt arguments and flags are forwarded unchanged to ``predict``,
+        whose result is returned.
+        Raises:
+          AssertionError: if neither ``sam_encoder_output`` nor all of
+            ``features``, ``original_size`` and ``input_size`` are given.
+        """
+        # NOTE(review): the message speaks of four necessary inputs, but only
+        # features, original_size and input_size are checked; interm_features
+        # may still be None here.
+        assert sam_encoder_output or (features is not None and original_size is not None and input_size is not None), 'one of sam_encoder_output and four necessary inputs must be given!'
+        if sam_encoder_output:
+            features = sam_encoder_output.features
+            interm_features = sam_encoder_output.interm_features
+            original_size = sam_encoder_output.original_size
+            input_size = sam_encoder_output.input_size
+        if self.adaptor is not None:
+            if dino:
+                features = F.interpolate(F.normalize(features, dim=1), size=(64, 64), mode='bilinear').permute(0, 2, 3, 1)
+            features = self.adaptor(features)
+            #
+            # else:
+            #     features = self.adaptor(features, original_size)
+        return self.predict(features,
+                            interm_features,
+                            original_size,
+                            input_size,
+                            point_coords,
+                            point_labels,
+                            box,
+                            mask_input,
+                            multimask_output,
+                            return_logits,
+                            hq_token_only)
+'''
+class SAMPipeline(Pipeline):
+    @classmethod
+    def from_pretrained(cls, ckpt_path, device='cuda', *args, **kwargs):
+        sam_encoder_pipeline = SAMEncoderPipeline(ckpt_path, device, *args, **kwargs)
+        sam_decoder_pipeline = SAMDecoderPipeline(ckpt_path, device, *args, **kwargs)
+        pipeline = cls(**dict(sam_encoder_pipeline=sam_encoder_pipeline,
+                              sam_decoder_pipeline=sam_decoder_pipeline,
+                              device=device))
+        return pipeline
+'''

sam_extension/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import math
+import cv2
+import PIL
+import torch
+from PIL.Image import Image
+from typing import Union, Tuple, List, Optional
+import numpy as np
+import supervision as sv
+from sklearn.decomposition import PCA
+# def add_points_tag(img: Union[Image, np.ndarray],
+#                    point_labels: Union[List[int], np.ndarray] = None,
+#                    point_coords: Union[List[List[int]], np.ndarray] = None,
+#                    pil: bool = False):
+#     if point_labels is None or point_coords is None or \
+#        not isinstance(point_labels, (List, np.ndarray)) or \
+#        not isinstance(point_coords, (List, np.ndarray)):
+#         return img
+#     if len(point_labels) != len(point_coords):
+#         print('length of point_label and point_coordinate must be same!')
+#         return img
+#     if isinstance(img, Image):
+#         img = np.uint8(img)
+#     start_angle = 40
+#     x = 8
+#     y = 2
+#     def get_point(angle, d, base):
+#         angle = angle / 180.0 * math.pi
+#         _x, _y = math.cos(angle) * d, math.sin(angle) * d
+#         return [base[0] + _x, base[1] - _y]
+#     # assert len(point_labels) == len(point_coords), ''
+#     for i in range(len(point_labels)):
+#         points = []
+#         for j in range(5):
+#             _x, _y = math.cos(start_angle), math.sin(start_angle)
+#             points.append(get_point(start_angle, x, point_coords[i]))
+#             start_angle -= 36
+#             points.append(get_point(start_angle, y, point_coords[i]))
+#             start_angle -= 36
+#         points = np.array([points], np.int32)
+#         color = (255, 0, 0) if point_labels[i] == 0 else (0, 255, 0)
+#         cv2.fillPoly(img, points, color, cv2.LINE_AA)
+#     if pil:
+#         img = PIL.Image.fromarray(img)
+#     return img
+def add_points_tag(img: Union[Image, np.ndarray],
+                   point_labels: Union[List[int], np.ndarray] = None,
+                   point_coords: Union[List[List[int]], np.ndarray] = None,
+                   pil: bool = False):
+    if point_labels is None or point_coords is None or \
+       not isinstance(point_labels, (List, np.ndarray)) or \
+       not isinstance(point_coords, (List, np.ndarray)):
+        return img
+    if len(point_labels) != len(point_coords):
+        print('length of point_label and point_coordinate must be same!')
+        return img
+    if isinstance(img, Image):
+        img = np.array(img)
+    # img.flags.writeable = True
+    h, w = img.shape[:2]
+    x_start_list, x_end_list = np.where((point_coords[:, 0] - 4) > 0, point_coords[:, 0] - 4, 0), np.where((point_coords[:, 0] + 4) < w, point_coords[:, 0] + 4, w)
+    y_start_list, y_end_list = np.where((point_coords[:, 1] - 4) > 0, point_coords[:, 1] - 4, 0), np.where((point_coords[:, 1] + 4) < h, point_coords[:, 1] + 4, h)
+    for i in range(len(point_labels)):
+        x_start, x_end = x_start_list[i], x_end_list[i]
+        y_start, y_end = y_start_list[i], y_end_list[i]
+        label = point_labels[i]
+        color = [0, 255, 0] if int(label) == 1 else [255, 0, 0]
+        for x in range(x_start, x_end):
+            for y in range(y_start, y_end):
+                img[y, x, :] = color
+    if pil:
+        img = PIL.Image.fromarray(img)
+    return img
+def add_boxes_tag(img: Union[Image, np.ndarray],
+                  boxes: Union[List[List[int]], np.ndarray] = None,
+                  pil: bool = False):
+    if boxes is None or not isinstance(boxes, (List, np.ndarray)):
+        return img
+    # if isinstance(boxes, np.ndarray):
+    #     if not boxes.all():
+    #         return img
+    # else:
+    #     if not boxes:
+    #         return img
+    if isinstance(img, Image):
+        img = np.uint8(img)
+    thickness = 2
+    for i in range(len(boxes)):
+        color = (0, 255, 0)
+        img = cv2.rectangle(img, (boxes[i][0], boxes[i][1]), (boxes[i][2], boxes[i][3]), color, thickness)
+    if pil:
+        img = PIL.Image.fromarray(img)
+    return img
+def add_prompts_tag(img: Union[Image, np.ndarray],
+                    point_labels: Union[List[int], np.ndarray] = None,
+                    point_coords: Union[List[List[int]], np.ndarray] = None,
+                    boxes: Union[List[List[int]], np.ndarray] = None,
+                    pil: bool = False):
+    img = add_points_tag(img, point_labels, point_coords, pil=pil)
+    img = add_boxes_tag(img, boxes, pil=pil)
+    return img
+def get_empty_detections():
+    detections = sv.Detections(xyxy=np.array([0, 0, 0, 0]).reshape(1, 4))
+    detections.xyxy = None
+    return detections
+def pca_feature(feature: torch.Tensor, dim: int = 3, return_np: bool = True):
+    pca = PCA(n_components=dim)
+    H, W, C = feature.shape
+    feature = feature.view(-1, C).cpu().numpy()
+    feature = pca.fit_transform(feature)
+    feature = torch.tensor(feature.reshape(H, W, dim))
+    if return_np:
+        return feature.numpy()
+    else:
+        return feature
+def visual_feature_rgb(feature: torch.Tensor, pil:bool = True):
+    assert feature.ndim >= 3, 'the dim of feature must >= 3!'
+    if feature.ndim == 4:
+        feature = feature.squeeze(0)
+    if feature.shape[-1] != 3:
+        feature = pca_feature(feature, 3, False)
+    max_f, _ = feature.max(-1)
+    min_f, _ = feature.min(-1)
+    feature = (feature - min_f[..., None]) / (max_f[..., None] - min_f[..., None])
+    feature = np.uint8((feature*255).cpu().numpy())
+    if pil:
+        return PIL.Image.fromarray(feature)
+    else:
+        return feature
+def transform_coords(src_shape, des_shape, points = None, boxes = None):
+    assert points is not None or boxes is not None, 'one of points and boxes must be given!'
+    scale_h = des_shape[0] / src_shape[0]
+    scale_w = des_shape[1] / src_shape[1]
+    if points is not None:
+        new_points = np.full_like(points, 0)
+        new_points[:, 0] = points[:, 0] * scale_w
+        new_points[:, 1] = points[:, 1] * scale_h
+        new_points.astype(np.int64)
+    else:
+        new_points = None
+    if boxes is not None:
+        new_boxes = np.full_like(boxes, 0)
+        new_boxes[:, 0] = boxes[:, 0] * scale_w
+        new_boxes[:, 1] = boxes[:, 1] * scale_h
+        new_boxes[:, 2] = boxes[:, 2] * scale_w
+        new_boxes[:, 3] = boxes[:, 3] * scale_h
+        new_boxes.astype(np.int64)
+    else:
+        new_boxes = None
+    return new_points, new_boxes
+def mask2greyimg(mask_list, pil=True):
+    grey_img_list = []
+    for mask in mask_list:
+        if pil:
+            grey_img_list.append(PIL.Image.fromarray(np.uint8(mask*255)))
+        else:
+            grey_img_list.append(np.uint8(mask * 255))
+    return grey_img_list
+if __name__ == '__main__':
+    src_shape = (100,100)
+    des_shape = (200,200)
+    points = np.array([[20,20],[40,40]])
+    boxes = np.array([[10,10,20,20]])
+    new_points, new_boxes = transform_coords(src_shape, des_shape, points, boxes)
+    print(new_points, new_boxes)

sam_extension/utils/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (4.51 kB). View file