Spaces:

obvious-research
/

OnlyFlow

Running

App Files Files Community

OnlyFlow / onlyflow /models /attention_processor.py

arlaz

initial commit

9bb001a 3 months ago

raw

history blame contribute delete

20.4 kB

	import inspect
	import logging
	from typing import Optional

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.nn.init as init
	from diffusers.models.attention_processor import Attention as AttentionBase
	from diffusers.models.attention_processor import AttnProcessor2_0 as AttnProcessor2_0_Base, SpatialNorm, AttnProcessor
	from diffusers.models.attention_processor import IPAdapterAttnProcessor2_0 as IPAdapterAttnProcessor2_0_Base
	from diffusers.utils.torch_utils import maybe_allow_in_graph

	# Module-level logger, named after this module's import path.
	logger = logging.getLogger(__name__)


	@maybe_allow_in_graph
	class Attention(AttentionBase):
	    r"""
	    A cross attention layer.

	    The constructor initializes the instance through `nn.Module.__init__` and builds every sub-module
	    itself. `forward` hands every entry of `**cross_attention_kwargs` to the selected processor without
	    filtering.

	    Parameters:
	        query_dim (`int`):
	            The number of channels in the query.
	        cross_attention_dim (`int`, optional):
	            The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
	        heads (`int`, optional, defaults to 8):
	            The number of heads to use for multi-head attention.
	        kv_heads (`int`, optional, defaults to `None`):
	            The number of key and value heads to use for multi-head attention. Defaults to `heads`. If
	            `kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use
	            Multi Query Attention (MQA) otherwise GQA is used.
	        dim_head (`int`, optional, defaults to 64):
	            The number of channels in each head.
	        dropout (`float`, optional, defaults to 0.0):
	            The dropout probability to use.
	        bias (`bool`, optional, defaults to False):
	            Set to `True` for the query, key, and value linear layers to contain a bias parameter.
	        upcast_attention (`bool`, optional, defaults to False):
	            Set to `True` to upcast the attention computation to `float32`.
	        upcast_softmax (`bool`, optional, defaults to False):
	            Set to `True` to upcast the softmax computation to `float32`.
	        cross_attention_norm (`str`, optional, defaults to `None`):
	            The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or
	            `group_norm`.
	        cross_attention_norm_num_groups (`int`, optional, defaults to 32):
	            The number of groups to use for the group norm in the cross attention.
	        qk_norm (`str`, optional, defaults to `None`):
	            Selects the `norm_q` / `norm_k` modules created for the query and key. Can be `None`,
	            `layer_norm`, `fp32_layer_norm`, `layer_norm_across_heads` or `rms_norm`.
	        added_kv_proj_dim (`int`, optional, defaults to `None`):
	            The number of channels to use for the added key and value projections. If `None`, no projection is
	            used.
	        added_proj_bias (`bool`, optional, defaults to `True`):
	            Whether the `add_k_proj`, `add_v_proj` and `add_q_proj` linear layers contain a bias parameter.
	        norm_num_groups (`int`, optional, defaults to `None`):
	            The number of groups to use for the group norm in the attention.
	        spatial_norm_dim (`int`, optional, defaults to `None`):
	            The number of channels to use for the spatial normalization.
	        out_bias (`bool`, optional, defaults to `True`):
	            Set to `True` to use a bias in the output linear layer.
	        scale_qk (`bool`, optional, defaults to `True`):
	            Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
	        only_cross_attention (`bool`, optional, defaults to `False`):
	            Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
	            `added_kv_proj_dim` is not `None`.
	        eps (`float`, optional, defaults to 1e-5):
	            An additional value added to the denominator in group normalization that is used for numerical
	            stability.
	        rescale_output_factor (`float`, optional, defaults to 1.0):
	            A factor to rescale the output by dividing it with this value.
	        residual_connection (`bool`, optional, defaults to `False`):
	            Set to `True` to add the residual connection to the output.
	        _from_deprecated_attn_block (`bool`, optional, defaults to `False`):
	            Set to `True` if the attention block is loaded from a deprecated state dict.
	        processor (`AttnProcessor`, optional, defaults to `None`):
	            The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used
	            and `AttnProcessor` otherwise.
	        out_dim (`int`, optional, defaults to `None`):
	            If given, it is used as the inner projection width (instead of `dim_head * heads`) and as the
	            output width (instead of `query_dim`); the number of heads becomes `out_dim // dim_head`.
	        context_pre_only (`bool`, optional, defaults to `None`):
	            If not `None` and `added_kv_proj_dim` is set, an `add_q_proj` layer is created. If it is `False`,
	            a `to_add_out` layer is created as well.
	        pre_only (`bool`, optional, defaults to `False`):
	            If `True`, the `to_out` projection and dropout are not created.
	    """

	    def __init__(
	        self,
	        query_dim: int,
	        cross_attention_dim: Optional[int] = None,
	        heads: int = 8,
	        kv_heads: Optional[int] = None,
	        dim_head: int = 64,
	        dropout: float = 0.0,
	        bias: bool = False,
	        upcast_attention: bool = False,
	        upcast_softmax: bool = False,
	        cross_attention_norm: Optional[str] = None,
	        cross_attention_norm_num_groups: int = 32,
	        qk_norm: Optional[str] = None,
	        added_kv_proj_dim: Optional[int] = None,
	        added_proj_bias: Optional[bool] = True,
	        norm_num_groups: Optional[int] = None,
	        spatial_norm_dim: Optional[int] = None,
	        out_bias: bool = True,
	        scale_qk: bool = True,
	        only_cross_attention: bool = False,
	        eps: float = 1e-5,
	        rescale_output_factor: float = 1.0,
	        residual_connection: bool = False,
	        _from_deprecated_attn_block: bool = False,
	        processor: Optional["AttnProcessor"] = None,
	        out_dim: Optional[int] = None,
	        context_pre_only: Optional[bool] = None,
	        pre_only: bool = False,
	    ):
	        # Only the `nn.Module` machinery is initialized here; every attribute and
	        # sub-module is created explicitly below.
	        nn.Module.__init__(self)

	        # To prevent circular import.
	        from diffusers.models.normalization import FP32LayerNorm, RMSNorm

	        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
	        self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
	        self.query_dim = query_dim
	        self.use_bias = bias
	        self.is_cross_attention = cross_attention_dim is not None
	        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
	        self.upcast_attention = upcast_attention
	        self.upcast_softmax = upcast_softmax
	        self.rescale_output_factor = rescale_output_factor
	        self.residual_connection = residual_connection
	        self.dropout = dropout
	        self.fused_projections = False
	        self.out_dim = out_dim if out_dim is not None else query_dim
	        self.context_pre_only = context_pre_only
	        self.pre_only = pre_only

	        # we make use of this private variable to know whether this class is loaded
	        # with a deprecated state dict so that we can convert it on the fly
	        self._from_deprecated_attn_block = _from_deprecated_attn_block

	        self.scale_qk = scale_qk
	        self.scale = dim_head ** -0.5 if self.scale_qk else 1.0

	        self.heads = out_dim // dim_head if out_dim is not None else heads
	        # for slice_size > 0 the attention score computation
	        # is split across the batch axis to save memory
	        # You can set slice_size with `set_attention_slice`
	        self.sliceable_head_dim = heads

	        self.added_kv_proj_dim = added_kv_proj_dim
	        self.only_cross_attention = only_cross_attention

	        if self.added_kv_proj_dim is None and self.only_cross_attention:
	            raise ValueError(
	                "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
	            )

	        if norm_num_groups is not None:
	            self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
	        else:
	            self.group_norm = None

	        if spatial_norm_dim is not None:
	            self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
	        else:
	            self.spatial_norm = None

	        if qk_norm is None:
	            self.norm_q = None
	            self.norm_k = None
	        elif qk_norm == "layer_norm":
	            self.norm_q = nn.LayerNorm(dim_head, eps=eps)
	            self.norm_k = nn.LayerNorm(dim_head, eps=eps)
	        elif qk_norm == "fp32_layer_norm":
	            self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
	            self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
	        elif qk_norm == "layer_norm_across_heads":
	            # Lumina applies qk norm across all heads
	            self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps)
	            # `kv_heads=None` means "same as `heads`"; multiplying by `None` would raise a TypeError.
	            norm_k_heads = heads if kv_heads is None else kv_heads
	            self.norm_k = nn.LayerNorm(dim_head * norm_k_heads, eps=eps)
	        elif qk_norm == "rms_norm":
	            self.norm_q = RMSNorm(dim_head, eps=eps)
	            self.norm_k = RMSNorm(dim_head, eps=eps)
	        else:
	            raise ValueError(
	                f"unknown qk_norm: {qk_norm}. Should be None, 'layer_norm', 'fp32_layer_norm', "
	                "'layer_norm_across_heads' or 'rms_norm'"
	            )

	        if cross_attention_norm is None:
	            self.norm_cross = None
	        elif cross_attention_norm == "layer_norm":
	            self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
	        elif cross_attention_norm == "group_norm":
	            if self.added_kv_proj_dim is not None:
	                # The given `encoder_hidden_states` are initially of shape
	                # (batch_size, seq_len, added_kv_proj_dim) before being projected
	                # to (batch_size, seq_len, cross_attention_dim). The norm is applied
	                # before the projection, so we need to use `added_kv_proj_dim` as
	                # the number of channels for the group norm.
	                norm_cross_num_channels = added_kv_proj_dim
	            else:
	                norm_cross_num_channels = self.cross_attention_dim

	            self.norm_cross = nn.GroupNorm(
	                num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
	            )
	        else:
	            raise ValueError(
	                f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
	            )

	        self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)

	        if not self.only_cross_attention:
	            # only relevant for the `AddedKVProcessor` classes
	            self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
	            self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
	        else:
	            self.to_k = None
	            self.to_v = None

	        self.added_proj_bias = added_proj_bias
	        if self.added_kv_proj_dim is not None:
	            self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
	            self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
	            if self.context_pre_only is not None:
	                self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)

	        if not self.pre_only:
	            self.to_out = nn.ModuleList([])
	            self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
	            self.to_out.append(nn.Dropout(dropout))

	        if self.context_pre_only is not None and not self.context_pre_only:
	            self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)

	        # Define both attributes unconditionally so reading them never raises
	        # AttributeError, even for a `qk_norm` value that has no "added" variant below.
	        self.norm_added_q = None
	        self.norm_added_k = None
	        if qk_norm is not None and added_kv_proj_dim is not None:
	            if qk_norm == "fp32_layer_norm":
	                self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
	                self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
	            elif qk_norm == "rms_norm":
	                self.norm_added_q = RMSNorm(dim_head, eps=eps)
	                self.norm_added_k = RMSNorm(dim_head, eps=eps)

	        # set attention processor
	        # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
	        # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
	        # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
	        # `AttnProcessor2_0` is looked up at call time, so it resolves to the class of
	        # that name defined in this module.
	        if processor is None:
	            processor = (
	                AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
	            )
	        self.set_processor(processor)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        encoder_hidden_states: Optional[torch.Tensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        **cross_attention_kwargs,
	    ) -> torch.Tensor:
	        r"""
	        The forward method of the `Attention` class.

	        Args:
	            hidden_states (`torch.Tensor`):
	                The hidden states of the query.
	            encoder_hidden_states (`torch.Tensor`, optional):
	                The hidden states of the encoder.
	            attention_mask (`torch.Tensor`, optional):
	                The attention mask to use. If `None`, no mask is applied.
	            **cross_attention_kwargs:
	                Additional keyword arguments; all of them are passed to the processor unchanged.

	        Returns:
	            `torch.Tensor`: The output of the attention layer.
	        """
	        # The `Attention` class can call different attention processors / attention functions
	        # here we simply pass along all tensors to the selected processor class
	        # For standard processors that are defined here, `**cross_attention_kwargs` is empty

	        return self.processor(
	            self,
	            hidden_states=hidden_states,
	            encoder_hidden_states=encoder_hidden_states,
	            attention_mask=attention_mask,
	            **cross_attention_kwargs,
	        )


	class AttnProcessor2_0(AttnProcessor2_0_Base):
	    """Wrapper around the parent processor that tolerates flow arguments and an optional scale multiplier.

	    `flow_feature` and `flow_scale` are accepted so callers can pass the same keyword set to every
	    processor; they are not used here and are not forwarded to the parent implementation.
	    """

	    def __call__(
	        self,
	        attn: Attention,
	        hidden_states: torch.Tensor,
	        encoder_hidden_states: Optional[torch.Tensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        temb: Optional[torch.Tensor] = None,
	        flow_feature: Optional[torch.Tensor] = None,
	        flow_scale: Optional[float] = None,
	        *args,
	        **kwargs,
	    ) -> torch.Tensor:
	        """Run the parent processor with `attn.scale` temporarily multiplied by `attn_scale`.

	        Args:
	            attn: Attention module whose `scale` attribute is adjusted for the duration of the call.
	            hidden_states: Forwarded to the parent processor.
	            encoder_hidden_states: Forwarded to the parent processor.
	            attention_mask: Forwarded to the parent processor.
	            temb: Forwarded to the parent processor.
	            flow_feature: Accepted and ignored.
	            flow_scale: Accepted and ignored.
	            *args: Extra positional arguments, forwarded after `temb`.
	            **kwargs: Forwarded unchanged. The optional key `attn_scale` (default 1.0) is the
	                multiplier applied to `attn.scale`.

	        Returns:
	            Whatever the parent processor returns.
	        """
	        original_scale = attn.scale
	        # Assign a fresh product instead of `*=` so `original_scale` can never alias a mutated object.
	        attn.scale = original_scale * kwargs.get("attn_scale", 1.0)

	        try:
	            # The five named arguments go positionally: mixing keyword arguments with `*args`
	            # would bind args[0] to `encoder_hidden_states` a second time and raise TypeError.
	            return super().__call__(
	                attn,
	                hidden_states,
	                encoder_hidden_states,
	                attention_mask,
	                temb,
	                *args,
	                **kwargs,
	            )
	        finally:
	            # Restore the module's scale even if the parent call raised.
	            attn.scale = original_scale

	class IPAdapterAttnProcessor2_0(IPAdapterAttnProcessor2_0_Base):
	    """IP-Adapter processor variant whose call signature also accepts flow arguments.

	    `flow_feature`, `flow_scale`, `*args` and `**kwargs` are accepted but not passed to the parent
	    implementation; only the arguments listed in `forwarded` below are.
	    """

	    def __call__(
	        self,
	        attn: Attention,
	        hidden_states: torch.Tensor,
	        encoder_hidden_states: Optional[torch.Tensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        temb: Optional[torch.Tensor] = None,
	        scale: float = 1.0,
	        ip_adapter_masks: Optional[torch.Tensor] = None,
	        flow_feature: Optional[torch.Tensor] = None,
	        flow_scale: Optional[float] = None,
	        *args,
	        **kwargs,
	    ) -> torch.Tensor:
	        """Delegate to the parent processor, dropping every flow-related or extra argument."""
	        # Collect exactly the keyword arguments the parent call receives.
	        forwarded = {
	            "attn": attn,
	            "hidden_states": hidden_states,
	            "encoder_hidden_states": encoder_hidden_states,
	            "attention_mask": attention_mask,
	            "temb": temb,
	            "scale": scale,
	            "ip_adapter_masks": ip_adapter_masks,
	        }
	        result = super().__call__(**forwarded)
	        return result


	class FlowAdaptorAttnProcessor(nn.Module):
	def __init__(self,
	type: str,
	hidden_size, # dimension of hidden state
	flow_feature_dim=None, # dimension of the pose feature
	cross_attention_dim=None, # dimension of the text embedding
	query_condition=False,
	key_value_condition=False,
	flow_scale=1.0
	):
	super().__init__()

	self.type = type
	self.hidden_size = hidden_size
	self.flow_feature_dim = flow_feature_dim
	self.cross_attention_dim = cross_attention_dim
	self.flow_scale = flow_scale
	self.query_condition = query_condition
	self.key_value_condition = key_value_condition
	assert hidden_size == flow_feature_dim
	if self.query_condition and self.key_value_condition:
	self.qkv_merge = nn.Linear(hidden_size, hidden_size)
	init.zeros_(self.qkv_merge.weight)
	init.zeros_(self.qkv_merge.bias)
	elif self.query_condition:
	self.q_merge = nn.Linear(hidden_size, hidden_size)
	init.zeros_(self.q_merge.weight)
	init.zeros_(self.q_merge.bias)
	else:
	self.kv_merge = nn.Linear(hidden_size, hidden_size)
	init.zeros_(self.kv_merge.weight)
	init.zeros_(self.kv_merge.bias)

	def forward(self,
	attn: Attention,
	hidden_states,
	flow_feature,
	encoder_hidden_states=None,
	attention_mask=None,
	temb=None,
	flow_scale=None,
	*args,
	**kwargs,
	):
	assert flow_feature is not None
	flow_embedding_scale = (flow_scale if flow_scale is not None else self.flow_scale)

	residual = hidden_states
	if attn.spatial_norm is not None:
	hidden_states = attn.spatial_norm(hidden_states, temb)

	if self.query_condition and self.key_value_condition:
	assert encoder_hidden_states is None

	if encoder_hidden_states is None:
	encoder_hidden_states = hidden_states

	batch_size, ehs_sequence_length, _ = encoder_hidden_states.shape

	if attention_mask is not None:
	attention_mask = attn.prepare_attention_mask(attention_mask, ehs_sequence_length, batch_size)
	# scaled_dot_product_attention expects attention_mask shape to be
	# (batch, heads, source_length, target_length)
	attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

	if attn.group_norm is not None:
	hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

	if attn.norm_cross:
	encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

	if self.query_condition and self.key_value_condition: # only self attention
	query_hidden_state = self.qkv_merge(hidden_states + flow_feature) * flow_embedding_scale + hidden_states
	key_value_hidden_state = query_hidden_state
	elif self.query_condition:
	query_hidden_state = self.q_merge(hidden_states + flow_feature) * flow_embedding_scale + hidden_states
	key_value_hidden_state = encoder_hidden_states
	else:
	key_value_hidden_state = self.kv_merge(
	encoder_hidden_states + flow_feature) * flow_embedding_scale + encoder_hidden_states
	query_hidden_state = hidden_states

	# original attention
	key = attn.to_k(key_value_hidden_state)
	value = attn.to_v(key_value_hidden_state)
	query = attn.to_q(query_hidden_state)

	inner_dim = key.shape[-1]
	head_dim = inner_dim // attn.heads

	query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
	key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
	value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

	if attn.norm_q is not None:
	query = attn.norm_q(query)
	if attn.norm_k is not None:
	key = attn.norm_k(key)

	hidden_states = F.scaled_dot_product_attention(
	query, key, value,
	attn_mask=attention_mask,
	dropout_p=0.0,
	is_causal=False,
	scale=attn.scale * kwargs.get("attn_scale_flow", 1.0),
	)

	hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
	hidden_states = hidden_states.to(query.dtype)

	# linear proj
	hidden_states = attn.to_out[0](hidden_states)

	# dropout
	hidden_states = attn.to_out[1](hidden_states)

	if attn.residual_connection:
	hidden_states = hidden_states + residual

	hidden_states = hidden_states / attn.rescale_output_factor

	return hidden_states