Spaces:

nasa-cisto-data-science-group
/

satvision-base-demo

Sleeping

satvision-base-demo / pytorch-caney /pytorch_caney /network /attention.py

Caleb Spradlin

initial commit

ab687e7 over 1 year ago

7.48 kB

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import numpy as np


	class WindowAttention(nn.Module):
	    """
	    Window based multi-head self attention (W-MSA) module with a
	    continuous relative position bias and cosine attention. The optional
	    additive ``mask`` of ``forward`` lets the same module serve shifted
	    and non-shifted windows.

	    Args:
	        dim (int): Number of input channels.
	        window_size (tuple[int]): The height and width of the window.
	        num_heads (int): Number of attention heads.
	        qkv_bias (bool, optional): If True, add a learnable bias to query
	            and value (the key bias is always zero). Default: True
	        attn_drop (float, optional): Dropout ratio of attention weight.
	            Default: 0.0
	        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
	        pretrained_window_size (tuple[int]): The height and width of the
	            window in pre-training. When its first entry is > 0 it is
	            used (instead of ``window_size``) to normalise the relative
	            coordinate table.
	    """

	    # NOTE: the list default is never mutated; it is kept as a list so that
	    # ``extra_repr`` output stays identical for existing callers.
	    def __init__(self,
	                 dim,
	                 window_size,
	                 num_heads,
	                 qkv_bias=True,
	                 attn_drop=0.,
	                 proj_drop=0.,
	                 pretrained_window_size=[0, 0]):  # noqa: B006

	        super().__init__()

	        self.dim = dim
	        self.window_size = window_size  # Wh, Ww
	        self.pretrained_window_size = pretrained_window_size
	        self.num_heads = num_heads

	        # Learnable per-head temperature (stored in log space) for the
	        # cosine attention logits.
	        self.logit_scale = nn.Parameter(
	            torch.log(10 * torch.ones((num_heads, 1, 1))),
	            requires_grad=True)

	        # mlp to generate continuous relative position bias
	        self.cpb_mlp = nn.Sequential(
	            nn.Linear(2, 512, bias=True),
	            nn.ReLU(inplace=True),
	            nn.Linear(512, num_heads, bias=False))

	        # 1, 2Wh-1, 2Ww-1, 2
	        self.register_buffer(
	            "relative_coords_table",
	            self._build_relative_coords_table(
	                self.window_size, pretrained_window_size))

	        # WhWw, WhWw
	        self.register_buffer(
	            "relative_position_index",
	            self._build_relative_position_index(self.window_size))

	        # The qkv projection carries no bias of its own; the bias is
	        # assembled in ``forward`` from q_bias / v_bias.
	        self.qkv = nn.Linear(dim, dim * 3, bias=False)

	        if qkv_bias:
	            self.q_bias = nn.Parameter(torch.zeros(dim))
	            self.v_bias = nn.Parameter(torch.zeros(dim))
	        else:
	            self.q_bias = None
	            self.v_bias = None

	        self.attn_drop = nn.Dropout(attn_drop)
	        self.proj = nn.Linear(dim, dim)
	        self.proj_drop = nn.Dropout(proj_drop)
	        self.softmax = nn.Softmax(dim=-1)

	    @staticmethod
	    def _build_relative_coords_table(window_size, pretrained_window_size):
	        """Build the log-spaced relative coordinate table fed to cpb_mlp.

	        Returns a float tensor of shape (1, 2*Wh-1, 2*Ww-1, 2).
	        """
	        relative_coords_h = torch.arange(
	            -(window_size[0] - 1), window_size[0], dtype=torch.float32)
	        relative_coords_w = torch.arange(
	            -(window_size[1] - 1), window_size[1], dtype=torch.float32)

	        # "ij" is the historical default of torch.meshgrid; spelling it out
	        # avoids the deprecation warning and pins the axis order.
	        table = torch.stack(
	            torch.meshgrid(relative_coords_h, relative_coords_w,
	                           indexing="ij"))
	        table = table.permute(1, 2, 0).contiguous().unsqueeze(0)

	        if pretrained_window_size[0] > 0:
	            norm_size = pretrained_window_size
	        else:
	            norm_size = window_size

	        # A window side of length 1 would give a divisor of 0 (0 / 0 ->
	        # NaN); its only relative offset is 0, so dividing by 1 is exact.
	        table[:, :, :, 0] /= max(norm_size[0] - 1, 1)
	        table[:, :, :, 1] /= max(norm_size[1] - 1, 1)

	        table *= 8  # normalize to -8, 8

	        # Signed log2 compression of the normalised offsets.
	        return torch.sign(table) * torch.log2(
	            torch.abs(table) + 1.0) / float(np.log2(8))

	    @staticmethod
	    def _build_relative_position_index(window_size):
	        """Build the pair-wise relative position index inside a window.

	        Returns an integer tensor of shape (Wh*Ww, Wh*Ww) whose entries
	        index the flattened (2*Wh-1) * (2*Ww-1) bias table.
	        """
	        coords_h = torch.arange(window_size[0])
	        coords_w = torch.arange(window_size[1])

	        # 2, Wh, Ww
	        coords = torch.stack(
	            torch.meshgrid(coords_h, coords_w, indexing="ij"))
	        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww

	        # 2, WhWw, WhWw
	        relative_coords = \
	            coords_flatten[:, :, None] - coords_flatten[:, None, :]
	        # WhWw, WhWw, 2
	        relative_coords = relative_coords.permute(1, 2, 0).contiguous()

	        # shift both offsets to start from 0
	        relative_coords[:, :, 0] += window_size[0] - 1
	        relative_coords[:, :, 1] += window_size[1] - 1
	        # Row-major flattening: scale the row offset by the number of
	        # distinct column offsets (2*Ww - 1) so that row + col is unique.
	        relative_coords[:, :, 0] *= 2 * window_size[1] - 1

	        return relative_coords.sum(-1)  # WhWw, WhWw

	    def forward(self, x, mask=None):
	        """
	        Args:
	            x: input features with shape of (num_windows*B, N, C)
	            mask: (0/-inf) mask with shape of (num_windows, WhWw, WhWw)
	                or None

	        Returns:
	            Tensor with the same shape as ``x``: (num_windows*B, N, C).
	        """
	        B_, N, C = x.shape

	        qkv_bias = None
	        if self.q_bias is not None:
	            # Bias layout matches the q | k | v split of the projection;
	            # the key slot is a constant zero vector.
	            qkv_bias = torch.cat((
	                self.q_bias,
	                torch.zeros_like(self.v_bias, requires_grad=False),
	                self.v_bias))
	        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
	        qkv = qkv.reshape(B_, N, 3, self.num_heads, -1).permute(
	            2, 0, 3, 1, 4)
	        # make torchscript happy (cannot use tensor as tuple)
	        q, k, v = qkv[0], qkv[1], qkv[2]

	        # cosine attention
	        attn = (F.normalize(q, dim=-1) @
	                F.normalize(k, dim=-1).transpose(-2, -1))
	        # Cap the learned temperature at 1 / 0.01 = 100. The bound is
	        # created on the parameter's device so no cross-device operand
	        # is involved.
	        max_log_scale = torch.log(
	            torch.tensor(1. / 0.01, device=self.logit_scale.device))
	        logit_scale = torch.clamp(
	            self.logit_scale, max=max_log_scale).exp()
	        attn = attn * logit_scale

	        relative_position_bias_table = self.cpb_mlp(
	            self.relative_coords_table).view(-1, self.num_heads)
	        relative_position_bias = relative_position_bias_table[
	            self.relative_position_index.view(-1)].view(
	                self.window_size[0] * self.window_size[1],
	                self.window_size[0] * self.window_size[1],
	                -1)  # WhWw, WhWw, nH
	        relative_position_bias = relative_position_bias.permute(
	            2, 0, 1).contiguous()  # nH, WhWw, WhWw

	        # Squash the bias into the open interval (0, 16).
	        relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
	        attn = attn + relative_position_bias.unsqueeze(0)

	        if mask is not None:
	            # Broadcast the per-window mask over batch and heads.
	            nW = mask.shape[0]
	            attn = attn.view(B_ // nW, nW, self.num_heads, N,
	                             N) + mask.unsqueeze(1).unsqueeze(0)
	            attn = attn.view(-1, self.num_heads, N, N)

	        attn = self.softmax(attn)
	        attn = self.attn_drop(attn)

	        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
	        x = self.proj(x)
	        x = self.proj_drop(x)
	        return x

	    def extra_repr(self) -> str:
	        """Return the constructor settings shown in the module repr."""
	        return f'dim={self.dim}, window_size={self.window_size}, ' \
	            f'pretrained_window_size={self.pretrained_window_size}, ' \
	            f'num_heads={self.num_heads}'

	    def flops(self, N):
	        """Return the flop count for 1 window with token length of N.

	        Counts the qkv projection, both attention matmuls and the output
	        projection.
	        """
	        head_dim = self.dim // self.num_heads
	        flops = 0
	        # qkv = self.qkv(x)
	        flops += N * self.dim * 3 * self.dim
	        # attn = (q @ k.transpose(-2, -1))
	        flops += self.num_heads * N * head_dim * N
	        # x = (attn @ v)
	        flops += self.num_heads * N * N * head_dim
	        # x = self.proj(x)
	        flops += N * self.dim * self.dim
	        return flops