| """ | |
| Copyright (c) 2022, salesforce.com, inc. | |
| All rights reserved. | |
| SPDX-License-Identifier: BSD-3-Clause | |
| For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause | |
| Based on https://github.com/mlfoundations/open_clip | |
| """ | |
| """ timm model adapter | |
| Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model. | |
| """ | |
| import math | |
| import warnings | |
| from collections import OrderedDict | |
| from typing import List, Optional, Tuple, Union | |
| import torch | |
| import torch.nn as nn | |
| from torch import nn as nn | |
| try: | |
| import timm | |
| from timm.models.layers import Mlp, to_2tuple | |
| # from timm.models.layers.attention_pool2d import RotAttentionPool2d | |
| # from timm.models.layers.attention_pool2d import ( | |
| # AttentionPool2d as AbsAttentionPool2d, | |
| # ) | |
| except ImportError as e: | |
| timm = None | |
| from lavis.models.clip_models.utils import freeze_batch_norm_2d | |


class TimmModel(nn.Module):
    """timm model adapter
    # FIXME this adapter is a work in progress, may change in ways that break weight compat
    """

    def __init__(
        self,
        model_name,
        embed_dim,
        image_size=224,
        pool="avg",
        proj="linear",
        drop=0.0,
        pretrained=False,
    ):
        super().__init__()
        if timm is None:
            raise RuntimeError("Please `pip install timm` to use timm models.")

        self.image_size = to_2tuple(image_size)
        self.trunk = timm.create_model(model_name, pretrained=pretrained)
        feat_size = self.trunk.default_cfg.get("pool_size", None)
        feature_ndim = 1 if not feat_size else 2
        if pool in ("abs_attn", "rot_attn"):
            assert feature_ndim == 2
            # if attn pooling used, remove both classifier and default pool
            self.trunk.reset_classifier(0, global_pool="")
        else:
            # reset global pool if pool config set, otherwise leave as network default
            reset_kwargs = dict(global_pool=pool) if pool else {}
            self.trunk.reset_classifier(0, **reset_kwargs)
        prev_chs = self.trunk.num_features

        head_layers = OrderedDict()
        if pool == "abs_attn":
            head_layers["pool"] = AttentionPool2d(
                prev_chs, feat_size=feat_size, out_features=embed_dim
            )
            prev_chs = embed_dim
        elif pool == "rot_attn":
            head_layers["pool"] = RotAttentionPool2d(prev_chs, out_features=embed_dim)
            prev_chs = embed_dim
        else:
            assert proj, "projection layer needed if non-attention pooling is used."

        # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used
        if proj == "linear":
            head_layers["drop"] = nn.Dropout(drop)
            head_layers["proj"] = nn.Linear(prev_chs, embed_dim)
        elif proj == "mlp":
            head_layers["mlp"] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop)

        self.head = nn.Sequential(head_layers)

    def lock(self, unlocked_groups=0, freeze_bn_stats=False):
        """lock modules
        Args:
            unlocked_groups (int): leave last n layer groups unlocked (default: 0)
        """
        if not unlocked_groups:
            # lock full model
            for param in self.trunk.parameters():
                param.requires_grad = False
            if freeze_bn_stats:
                freeze_batch_norm_2d(self.trunk)
        else:
            # NOTE: partial freeze requires latest timm (master) branch and is subject to change
            try:
                # FIXME import here until API stable and in an official release
                from timm.models.helpers import group_modules, group_parameters
            except ImportError:
                raise RuntimeError(
                    "Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`"
                )
            matcher = self.trunk.group_matcher()
            gparams = group_parameters(self.trunk, matcher)
            max_layer_id = max(gparams.keys())
            max_layer_id = max_layer_id - unlocked_groups
            for group_idx in range(max_layer_id + 1):
                group = gparams[group_idx]
                for param in group:
                    self.trunk.get_parameter(param).requires_grad = False
            if freeze_bn_stats:
                gmodules = group_modules(self.trunk, matcher, reverse=True)
                gmodules = {k for k, v in gmodules.items() if v <= max_layer_id}
                freeze_batch_norm_2d(self.trunk, gmodules)

    def forward(self, x):
        x = self.trunk(x)
        x = self.head(x)
        return x
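
# NOTE: a minimal usage sketch (illustrative only; the backbone name and sizes
# below are assumptions, not values prescribed by this file). Any timm model with
# a compatible `default_cfg` should work. The trunk maps an image batch to
# features and the head projects them to the CLIP embedding dimension:
#
#     vision_tower = TimmModel(
#         "resnet50",         # hypothetical choice of timm backbone
#         embed_dim=512,      # CLIP joint embedding dimension
#         image_size=224,
#         pool="avg",         # or "abs_attn" / "rot_attn" for attention pooling
#         proj="linear",
#         pretrained=False,
#     )
#     feats = vision_tower(torch.randn(2, 3, 224, 224))  # -> shape (2, 512)
#     vision_tower.lock(unlocked_groups=0, freeze_bn_stats=True)  # freeze the trunk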


class RotAttentionPool2d(nn.Module):
    """Attention based 2D feature pooling w/ rotary (relative) pos embedding.
    This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.

    Adapted from the AttentionPool2d in CLIP w/ rotary embedding instead of learned embed.
    https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py

    NOTE: While this impl does not require a fixed feature size, performance at differing resolutions from
    train varies widely and falls off dramatically. I'm not sure if there is a way around this... -RW
    """

    def __init__(
        self,
        in_features: int,
        out_features: int = None,
        embed_dim: int = None,
        num_heads: int = 4,
        qkv_bias: bool = True,
    ):
        super().__init__()
        embed_dim = embed_dim or in_features
        out_features = out_features or in_features
        self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(embed_dim, out_features)
        self.num_heads = num_heads
        assert embed_dim % num_heads == 0
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim**-0.5
        self.pos_embed = RotaryEmbedding(self.head_dim)

        trunc_normal_(self.qkv.weight, std=in_features**-0.5)
        nn.init.zeros_(self.qkv.bias)

    def forward(self, x):
        B, _, H, W = x.shape
        N = H * W
        x = x.reshape(B, -1, N).permute(0, 2, 1)

        x = torch.cat([x.mean(1, keepdim=True), x], dim=1)

        x = (
            self.qkv(x)
            .reshape(B, N + 1, 3, self.num_heads, self.head_dim)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = x[0], x[1], x[2]

        qc, q = q[:, :, :1], q[:, :, 1:]
        sin_emb, cos_emb = self.pos_embed.get_embed((H, W))
        q = apply_rot_embed(q, sin_emb, cos_emb)
        q = torch.cat([qc, q], dim=2)

        kc, k = k[:, :, :1], k[:, :, 1:]
        k = apply_rot_embed(k, sin_emb, cos_emb)
        k = torch.cat([kc, k], dim=2)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        x = (attn @ v).transpose(1, 2).reshape(B, N + 1, -1)
        x = self.proj(x)
        return x[:, 0]
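
# NOTE: a minimal usage sketch (illustrative only; sizes here are assumed).
# The pool consumes a channel-first feature map and returns one pooled vector per
# sample, taken from the mean-token query position:
#
#     pool = RotAttentionPool2d(in_features=2048, out_features=512, num_heads=4)
#     pooled = pool(torch.randn(2, 2048, 7, 7))  # -> shape (2, 512)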


class AttentionPool2d(nn.Module):
    """Attention based 2D feature pooling w/ learned (absolute) pos embedding.
    This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.

    It was based on impl in CLIP by OpenAI
    https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py

    NOTE: This requires feature size upon construction and will prevent adaptive sizing of the network.
    """

    def __init__(
        self,
        in_features: int,
        feat_size: Union[int, Tuple[int, int]],
        out_features: int = None,
        embed_dim: int = None,
        num_heads: int = 4,
        qkv_bias: bool = True,
    ):
        super().__init__()
        embed_dim = embed_dim or in_features
        out_features = out_features or in_features
        assert embed_dim % num_heads == 0
        self.feat_size = to_2tuple(feat_size)
        self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(embed_dim, out_features)
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim**-0.5

        spatial_dim = self.feat_size[0] * self.feat_size[1]
        self.pos_embed = nn.Parameter(torch.zeros(spatial_dim + 1, in_features))
        trunc_normal_(self.pos_embed, std=in_features**-0.5)
        trunc_normal_(self.qkv.weight, std=in_features**-0.5)
        nn.init.zeros_(self.qkv.bias)

    def forward(self, x):
        B, _, H, W = x.shape
        N = H * W
        assert self.feat_size[0] == H
        assert self.feat_size[1] == W
        x = x.reshape(B, -1, N).permute(0, 2, 1)
        x = torch.cat([x.mean(1, keepdim=True), x], dim=1)
        x = x + self.pos_embed.unsqueeze(0).to(x.dtype)

        x = (
            self.qkv(x)
            .reshape(B, N + 1, 3, self.num_heads, self.head_dim)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = x[0], x[1], x[2]
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        x = (attn @ v).transpose(1, 2).reshape(B, N + 1, -1)
        x = self.proj(x)
        return x[:, 0]
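
# NOTE: a minimal usage sketch (illustrative only; sizes here are assumed).
# Unlike RotAttentionPool2d, the spatial size is fixed at construction, so the
# input feature map must match `feat_size` exactly:
#
#     pool = AttentionPool2d(in_features=2048, feat_size=7, out_features=512)
#     pooled = pool(torch.randn(2, 2048, 7, 7))  # -> shape (2, 512)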


def pixel_freq_bands(
    num_bands: int,
    max_freq: float = 224.0,
    linear_bands: bool = True,
    dtype: torch.dtype = torch.float32,
    device: Optional[torch.device] = None,
):
    if linear_bands:
        bands = torch.linspace(
            1.0, max_freq / 2, num_bands, dtype=dtype, device=device
        )
    else:
        bands = 2 ** torch.linspace(
            0, math.log(max_freq, 2) - 1, num_bands, dtype=dtype, device=device
        )
    return bands * torch.pi


def inv_freq_bands(
    num_bands: int,
    temperature: float = 100000.0,
    step: int = 2,
    dtype: torch.dtype = torch.float32,
    device: Optional[torch.device] = None,
) -> torch.Tensor:
    inv_freq = 1.0 / (
        temperature
        ** (torch.arange(0, num_bands, step, dtype=dtype, device=device) / num_bands)
    )
    return inv_freq
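
# NOTE: an illustrative comparison of the two band helpers (arguments here are
# assumed, not prescribed). `pixel_freq_bands` spaces frequencies for a
# pixel-normalized [-1, 1] grid, while `inv_freq_bands` follows the classic
# transformer 1 / temperature**(i / num_bands) schedule:
#
#     pixel_freq_bands(4, max_freq=224.0)             # -> 4 bands, scaled by pi
#     inv_freq_bands(8, temperature=10000.0, step=2)  # -> 4 inverse-frequency bands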


def build_sincos2d_pos_embed(
    feat_shape: List[int],
    dim: int = 64,
    temperature: float = 10000.0,
    reverse_coord: bool = False,
    interleave_sin_cos: bool = False,
    dtype: torch.dtype = torch.float32,
    device: Optional[torch.device] = None,
) -> torch.Tensor:
    """
    Args:
        feat_shape: spatial shape of the feature map, e.g. (H, W)
        dim: embedding dimension, must be divisible by 4
        temperature: frequency temperature for the sin/cos bands
        reverse_coord: stack grid order W, H instead of H, W
        interleave_sin_cos: sin, cos, sin, cos stack instead of sin, sin, cos, cos
        dtype: output dtype
        device: output device

    Returns:
        Tensor of shape (H * W, dim) with sin-cos position embeddings.
    """
    assert (
        dim % 4 == 0
    ), "Embed dimension must be divisible by 4 for sin-cos 2D position embedding"
    pos_dim = dim // 4
    bands = inv_freq_bands(
        pos_dim, temperature=temperature, step=1, dtype=dtype, device=device
    )

    if reverse_coord:
        feat_shape = feat_shape[::-1]  # stack W, H instead of H, W
    grid = (
        torch.stack(
            torch.meshgrid(
                [torch.arange(s, device=device, dtype=dtype) for s in feat_shape]
            )
        )
        .flatten(1)
        .transpose(0, 1)
    )
    pos2 = grid.unsqueeze(-1) * bands.unsqueeze(0)
    # FIXME add support for unflattened spatial dim?
    stack_dim = (
        2 if interleave_sin_cos else 1
    )  # stack sin, cos, sin, cos instead of sin sin cos cos
    pos_emb = torch.stack([torch.sin(pos2), torch.cos(pos2)], dim=stack_dim).flatten(1)
    return pos_emb
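
# NOTE: an illustrative shape check (feature-map size and dim here are assumed):
#
#     emb = build_sincos2d_pos_embed([7, 7], dim=64)  # -> shape (49, 64)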


def build_fourier_pos_embed(
    feat_shape: List[int],
    bands: Optional[torch.Tensor] = None,
    num_bands: int = 64,
    max_res: int = 224,
    linear_bands: bool = False,
    include_grid: bool = False,
    concat_out: bool = True,
    in_pixels: bool = True,
    dtype: torch.dtype = torch.float32,
    device: Optional[torch.device] = None,
) -> List[torch.Tensor]:
    if bands is None:
        if in_pixels:
            bands = pixel_freq_bands(
                num_bands,
                float(max_res),
                linear_bands=linear_bands,
                dtype=dtype,
                device=device,
            )
        else:
            bands = inv_freq_bands(num_bands, step=1, dtype=dtype, device=device)
    else:
        if device is None:
            device = bands.device
        if dtype is None:
            dtype = bands.dtype

    if in_pixels:
        grid = torch.stack(
            torch.meshgrid(
                [
                    torch.linspace(-1.0, 1.0, steps=s, device=device, dtype=dtype)
                    for s in feat_shape
                ]
            ),
            dim=-1,
        )
    else:
        grid = torch.stack(
            torch.meshgrid(
                [torch.arange(s, device=device, dtype=dtype) for s in feat_shape]
            ),
            dim=-1,
        )
    grid = grid.unsqueeze(-1)
    pos = grid * bands

    pos_sin, pos_cos = pos.sin(), pos.cos()
    out = (grid, pos_sin, pos_cos) if include_grid else (pos_sin, pos_cos)
    # FIXME torchscript doesn't like multiple return types, probably need to always cat?
    if concat_out:
        out = torch.cat(out, dim=-1)
    return out
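
# NOTE: an illustrative shape check (feature-map size and band count are assumed):
#
#     out = build_fourier_pos_embed([7, 7], num_bands=64)  # concat_out=True
#     # -> shape (7, 7, 2, 128): sin and cos of 64 bands per grid coordinate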


class FourierEmbed(nn.Module):
    def __init__(
        self,
        max_res: int = 224,
        num_bands: int = 64,
        concat_grid=True,
        keep_spatial=False,
    ):
        super().__init__()
        self.max_res = max_res
        self.num_bands = num_bands
        self.concat_grid = concat_grid
        self.keep_spatial = keep_spatial
        # frequency bands for the pixel-normalized grid: pixel_freq_bands(num_bands, max_freq)
        self.register_buffer(
            "bands", pixel_freq_bands(num_bands, float(max_res)), persistent=False
        )

    def forward(self, x):
        B, C = x.shape[:2]
        feat_shape = x.shape[2:]
        emb = build_fourier_pos_embed(
            feat_shape,
            self.bands,
            include_grid=self.concat_grid,
            dtype=x.dtype,
            device=x.device,
        )
        emb = emb.transpose(-1, -2).flatten(len(feat_shape))
        batch_expand = (B,) + (-1,) * (x.ndim - 1)

        # FIXME support nD
        if self.keep_spatial:
            x = torch.cat(
                [x, emb.unsqueeze(0).expand(batch_expand).permute(0, 3, 1, 2)], dim=1
            )
        else:
            x = torch.cat(
                [x.permute(0, 2, 3, 1), emb.unsqueeze(0).expand(batch_expand)], dim=-1
            )
            x = x.reshape(B, feat_shape.numel(), -1)

        return x
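
# NOTE: a minimal usage sketch (illustrative only; sizes here are assumed).
# FourierEmbed concatenates Fourier position features onto a 2D feature map; with
# keep_spatial=False the spatial dims are flattened into a token axis:
#
#     fe = FourierEmbed(max_res=224, num_bands=64)
#     out = fe(torch.randn(2, 16, 7, 7))
#     # -> shape (2, 49, 274): 16 input channels + 2 * (2 * 64 + 1) position features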


def rot(x):
    return torch.stack([-x[..., 1::2], x[..., ::2]], -1).reshape(x.shape)


def apply_rot_embed(x: torch.Tensor, sin_emb, cos_emb):
    return x * cos_emb + rot(x) * sin_emb


def apply_rot_embed_list(x: List[torch.Tensor], sin_emb, cos_emb):
    if isinstance(x, torch.Tensor):
        x = [x]
    return [t * cos_emb + rot(t) * sin_emb for t in x]


def apply_rot_embed_split(x: torch.Tensor, emb):
    split = emb.shape[-1] // 2
    return x * emb[:, :split] + rot(x) * emb[:, split:]


def build_rotary_pos_embed(
    feat_shape: List[int],
    bands: Optional[torch.Tensor] = None,
    dim: int = 64,
    max_freq: float = 224,
    linear_bands: bool = False,
    dtype: torch.dtype = torch.float32,
    device: Optional[torch.device] = None,
):
    """
    NOTE: shape arg should include spatial dim only
    """
    feat_shape = torch.Size(feat_shape)

    sin_emb, cos_emb = build_fourier_pos_embed(
        feat_shape,
        bands=bands,
        num_bands=dim // 4,
        max_res=max_freq,
        linear_bands=linear_bands,
        concat_out=False,
        device=device,
        dtype=dtype,
    )
    N = feat_shape.numel()
    sin_emb = sin_emb.reshape(N, -1).repeat_interleave(2, -1)
    cos_emb = cos_emb.reshape(N, -1).repeat_interleave(2, -1)
    return sin_emb, cos_emb
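
# NOTE: an illustrative shape check (grid size and dim are assumed). For an
# (H, W) grid and embedding dim `dim`, the sin/cos tables cover H * W tokens:
#
#     sin_emb, cos_emb = build_rotary_pos_embed([7, 7], dim=64)  # each -> shape (49, 64)
#     q_rot = apply_rot_embed(torch.randn(2, 4, 49, 64), sin_emb, cos_emb)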


class RotaryEmbedding(nn.Module):
    """Rotary position embedding
    NOTE: This is my initial attempt at impl rotary embedding for spatial use, it has not
    been well tested, and will likely change. It will be moved to its own file.
    The following impl/resources were referenced for this impl:
    * https://github.com/lucidrains/vit-pytorch/blob/6f3a5fcf0bca1c5ec33a35ef48d97213709df4ba/vit_pytorch/rvt.py
    * https://blog.eleuther.ai/rotary-embeddings/
    """

    def __init__(self, dim, max_res=224, linear_bands: bool = False):
        super().__init__()
        self.dim = dim
        self.register_buffer(
            "bands",
            pixel_freq_bands(dim // 4, max_res, linear_bands=linear_bands),
            persistent=False,
        )

    def get_embed(self, shape: List[int]):
        return build_rotary_pos_embed(shape, self.bands)

    def forward(self, x):
        # assuming channel-first tensor where spatial dim are >= 2
        sin_emb, cos_emb = self.get_embed(x.shape[2:])
        return apply_rot_embed(x, sin_emb, cos_emb)
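
# NOTE: a minimal usage sketch (illustrative only; sizes here are assumed). In this
# file the module is used through `get_embed`, which builds sin/cos tables for a
# given spatial shape (see RotAttentionPool2d.forward above):
#
#     rope = RotaryEmbedding(dim=64)
#     sin_emb, cos_emb = rope.get_embed((7, 7))  # each -> shape (49, 64)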


def _no_grad_trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    with torch.no_grad():
        # Values are generated by using a truncated uniform distribution and
        # then using the inverse CDF for the normal distribution.
        # Get upper and lower cdf values
        l = norm_cdf((a - mean) / std)
        u = norm_cdf((b - mean) / std)

        # Uniformly fill tensor with values from [l, u], then translate to
        # [2l-1, 2u-1].
        tensor.uniform_(2 * l - 1, 2 * u - 1)

        # Use inverse cdf transform for normal distribution to get truncated
        # standard normal
        tensor.erfinv_()

        # Transform to proper mean, std
        tensor.mul_(std * math.sqrt(2.0))
        tensor.add_(mean)

        # Clamp to ensure it's in the proper range
        tensor.clamp_(min=a, max=b)
        return tensor


def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.
    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    Examples:
        >>> w = torch.empty(3, 5)
        >>> nn.init.trunc_normal_(w)
    """
    return _no_grad_trunc_normal_(tensor, mean, std, a, b)