from functools import partial
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

# The attention processors below rely on PyTorch 2.0's fused attention kernel.
assert hasattr(F, "scaled_dot_product_attention")

from diffusers.models.attention import Attention, FeedForward
from diffusers.models.attention_processor import CogVideoXAttnProcessor2_0, JointAttnProcessor2_0


class CogVideoXBlock(nn.Module):
    r"""
    Transformer block used in the [CogVideoX](https://github.com/THUDM/CogVideo) model.

    Parameters:
        dim (`int`):
            The number of channels in the input and output.
        num_heads (`int`):
            The number of heads to use for multi-head attention. The per-head dimension is derived as
            `dim // num_heads`.
        dropout (`float`, defaults to `0.0`):
            The dropout probability to use.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to be used in feed-forward.
        attention_bias (`bool`, defaults to `False`):
            Whether or not to use bias in attention projection layers.
        qk_norm (`bool`, defaults to `True`):
            Whether or not to use normalization after query and key projections in Attention.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        eps (`float`, defaults to `1e-5`):
            Epsilon value for normalization layers.
        final_dropout (`bool`, defaults to `True`):
            Whether to apply a final dropout after the last feed-forward layer.
        ff_inner_dim (`int`, *optional*, defaults to `None`):
            Custom hidden dimension of the feed-forward layer. If not provided, `4 * dim` is used.
        ff_bias (`bool`, defaults to `True`):
            Whether or not to use bias in the feed-forward layer.
        attention_out_bias (`bool`, defaults to `True`):
            Whether or not to use bias in the attention output projection layer.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        dropout: float = 0.0,
        activation_fn: str = "gelu-approximate",
        attention_bias: bool = False,
        qk_norm: bool = True,
        norm_elementwise_affine: bool = True,
        eps: float = 1e-5,
        final_dropout: bool = True,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
    ):
        super().__init__()
        # Derive the per-head width from the model width; `dim` must be divisible by `num_heads`.
        norm_eps = eps
        num_attention_heads = num_heads
        attention_head_dim = dim // num_attention_heads
        assert attention_head_dim * num_attention_heads == dim

        # 1. Self-attention (the processor attends over the concatenated [text, video] sequence).
        self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps, bias=True)
        self.norm1_context = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps, bias=True)

        self.attn1 = Attention(
            query_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            qk_norm="layer_norm" if qk_norm else None,
            eps=1e-6,
            bias=attention_bias,
            out_bias=attention_out_bias,
            processor=CogVideoXAttnProcessor2_0(),
        )

        # 2. Feed-forward, also applied to the concatenated sequence.
        self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps, bias=True)
        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps, bias=True)

        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        temb: Optional[torch.Tensor] = None,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # NOTE: `temb` is accepted for interface compatibility but is not used by this block.
        text_seq_length = encoder_hidden_states.size(1)

        # 1. Self-attention; the processor concatenates text and video tokens internally and
        # returns the two streams separately.
        norm_hidden_states = self.norm1(hidden_states)
        norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)

        attn_hidden_states, attn_encoder_hidden_states = self.attn1(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            image_rotary_emb=image_rotary_emb,
        )

        hidden_states = hidden_states + attn_hidden_states
        encoder_hidden_states = encoder_hidden_states + attn_encoder_hidden_states

        # 2. Feed-forward over the concatenated [text, video] sequence, then split again.
        norm_hidden_states = self.norm2(hidden_states)
        norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)

        norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
        ff_output = self.ff(norm_hidden_states)

        hidden_states = hidden_states + ff_output[:, text_seq_length:]
        encoder_hidden_states = encoder_hidden_states + ff_output[:, :text_seq_length]

        return hidden_states, encoder_hidden_states


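# --- Usage sketch (illustrative, not part of the original module) -------------------------------
# A minimal smoke test for `CogVideoXBlock`, assuming a diffusers build that provides
# `CogVideoXAttnProcessor2_0`. All sizes below (dim=64, num_heads=4, sequence lengths) are
# arbitrary illustration values, not values used by CogVideoX itself.
def _cogvideox_block_smoke_test():
    block = CogVideoXBlock(dim=64, num_heads=4).eval()
    video_tokens = torch.randn(2, 16, 64)  # (batch, video tokens, dim)
    text_tokens = torch.randn(2, 8, 64)    # (batch, text tokens, dim)
    with torch.no_grad():
        video_out, text_out = block(video_tokens, text_tokens)
    # Both streams keep their shapes; only their contents are updated.
    assert video_out.shape == video_tokens.shape
    assert text_out.shape == text_tokens.shape

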
def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int):
    # Run the feed-forward in slices along `chunk_dim` to reduce peak activation memory.
    if hidden_states.shape[chunk_dim] % chunk_size != 0:
        raise ValueError(
            f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
        )

    num_chunks = hidden_states.shape[chunk_dim] // chunk_size
    ff_output = torch.cat(
        [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
        dim=chunk_dim,
    )
    return ff_output


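# --- Usage sketch (illustrative) ----------------------------------------------------------------
# `_chunked_feed_forward` should match a single full-width call up to floating-point tolerance,
# because the feed-forward acts on each token independently. The sizes below are arbitrary.
def _chunked_feed_forward_example():
    ff = FeedForward(dim=32, activation_fn="gelu-approximate").eval()
    x = torch.randn(2, 8, 32)  # (batch, tokens, dim); 8 tokens split into chunks of 4
    with torch.no_grad():
        full = ff(x)
        chunked = _chunked_feed_forward(ff, x, chunk_dim=1, chunk_size=4)
    assert torch.allclose(full, chunked, atol=1e-6)

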
class QKNormJointAttnProcessor2_0:
    """Attention processor for SD3-style joint attention that also applies query/key
    normalization (`attn.norm_q` / `attn.norm_k`) when those layers are present."""

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "QKNormJointAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0."
            )

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        *args,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        residual = hidden_states

        # Flatten optional 4D (B, C, H, W) inputs into (B, H*W, C) token sequences.
        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
        context_input_ndim = encoder_hidden_states.ndim
        if context_input_ndim == 4:
            batch_size, channel, height, width = encoder_hidden_states.shape
            encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size = encoder_hidden_states.shape[0]

        # Sample (latent) projections.
        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        # Context (text) projections.
        encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)

        # Joint attention: concatenate sample and context tokens along the sequence dimension.
        query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
        key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
        value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # Query/key normalization: the part that distinguishes this processor.
        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # Split the joint sequence back into sample and context parts.
        hidden_states, encoder_hidden_states = (
            hidden_states[:, : residual.shape[1]],
            hidden_states[:, residual.shape[1] :],
        )

        # Output projections (linear + dropout for the sample stream).
        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)
        if not attn.context_pre_only:
            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

        # Restore the original 4D layout if the inputs were 4D.
        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
        if context_input_ndim == 4:
            encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        return hidden_states, encoder_hidden_states


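# --- Usage sketch (illustrative, not part of the original module) -------------------------------
# How `QKNormJointAttnProcessor2_0` could be wired up: a joint `Attention` module with added KV
# projections for the context stream and `qk_norm` enabled so that `attn.norm_q` / `attn.norm_k`
# exist. This assumes a diffusers version whose `Attention` accepts `qk_norm="rms_norm"` together
# with `added_kv_proj_dim`; the sizes below are illustrative only.
def _qknorm_joint_attention_example():
    dim, heads = 64, 4
    attn = Attention(
        query_dim=dim,
        cross_attention_dim=None,
        added_kv_proj_dim=dim,
        dim_head=dim // heads,
        heads=heads,
        out_dim=dim,
        context_pre_only=False,
        bias=True,
        qk_norm="rms_norm",
        eps=1e-6,
        processor=QKNormJointAttnProcessor2_0(),
    )
    sample = torch.randn(2, 16, dim)   # latent tokens
    context = torch.randn(2, 8, dim)   # text tokens
    with torch.no_grad():
        sample_out, context_out = attn(hidden_states=sample, encoder_hidden_states=context)
    assert sample_out.shape == sample.shape and context_out.shape == context.shape

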
class SD3JointTransformerBlock(nn.Module):
    r"""
    A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_heads (`int`): The number of heads to use for multi-head attention. The per-head dimension is
            derived as `dim // num_heads`.
        eps (`float`): Epsilon value for the normalization layers.
        context_pre_only (`bool`, defaults to `False`): Whether to skip the blocks that post-process the
            `context` (text) stream; if `True`, the block returns `None` for `encoder_hidden_states`.
        qk_norm (`str`, *optional*, defaults to `None`): Type of query/key normalization to use in attention.
        use_dual_attention (`bool`, defaults to `False`): Whether to add a second, sample-only attention branch.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        eps: float,
        context_pre_only: bool = False,
        qk_norm: Optional[str] = None,
        use_dual_attention: bool = False,
    ):
        super().__init__()
        # Derive the per-head width from the model width; `dim` must be divisible by `num_heads`.
        num_attention_heads = num_heads
        attention_head_dim = dim // num_attention_heads
        assert attention_head_dim * num_attention_heads == dim

        self.use_dual_attention = use_dual_attention
        self.context_pre_only = context_pre_only

        # 1. Joint attention over the sample and context streams.
        self.norm1 = nn.LayerNorm(dim)
        self.norm1_context = nn.LayerNorm(dim)

        processor = JointAttnProcessor2_0()

        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            added_kv_proj_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            context_pre_only=context_pre_only,
            bias=True,
            processor=processor,
            qk_norm=qk_norm,
            eps=eps,
        )

        # Optional second, sample-only attention branch.
        if use_dual_attention:
            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=None,
                dim_head=attention_head_dim,
                heads=num_attention_heads,
                out_dim=dim,
                bias=True,
                processor=processor,
                qk_norm=qk_norm,
                eps=eps,
            )
        else:
            self.attn2 = None

        # 2. Feed-forward for the sample stream.
        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        # 3. Feed-forward for the context stream (skipped when `context_pre_only` is set).
        if not context_pre_only:
            self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
            self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
        else:
            self.norm2_context = None
            self.ff_context = None

        # Feed-forward chunking is disabled by default; see `set_chunk_feed_forward`.
        self._chunk_size = None
        self._chunk_dim = 0

    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
        # Enable (or disable, with `chunk_size=None`) chunked feed-forward along dimension `dim`.
        self._chunk_size = chunk_size
        self._chunk_dim = dim

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
        temb: Optional[torch.FloatTensor] = None,
    ):
        # NOTE: `temb` is accepted for interface compatibility but is not used by this block.

        # 1. Joint attention over both streams.
        norm_hidden_states = self.norm1(hidden_states)
        norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)

        attn_output, context_attn_output = self.attn(
            hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states
        )

        hidden_states = hidden_states + attn_output

        # Optional second, sample-only attention branch.
        if self.use_dual_attention:
            attn_output2 = self.attn2(hidden_states=norm_hidden_states)
            hidden_states = hidden_states + attn_output2

        # 2. Feed-forward for the sample stream (optionally chunked to reduce peak memory).
        norm_hidden_states = self.norm2(hidden_states)
        if self._chunk_size is not None:
            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
        else:
            ff_output = self.ff(norm_hidden_states)

        hidden_states = hidden_states + ff_output

        # 3. Context stream, skipped entirely when `context_pre_only` is set.
        if self.context_pre_only:
            encoder_hidden_states = None
        else:
            encoder_hidden_states = encoder_hidden_states + context_attn_output

            norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
            if self._chunk_size is not None:
                context_ff_output = _chunked_feed_forward(
                    self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size
                )
            else:
                context_ff_output = self.ff_context(norm_encoder_hidden_states)

            encoder_hidden_states = encoder_hidden_states + context_ff_output

        return hidden_states, encoder_hidden_states
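

# --- Usage sketch (illustrative, not part of the original module) -------------------------------
# A minimal smoke test for `SD3JointTransformerBlock`, including chunked feed-forward. It assumes
# a diffusers build that provides `JointAttnProcessor2_0`; all sizes below are illustrative only.
def _sd3_joint_block_smoke_test():
    block = SD3JointTransformerBlock(dim=64, num_heads=4, eps=1e-6, context_pre_only=False).eval()
    block.set_chunk_feed_forward(chunk_size=4, dim=1)  # chunk the token dimension in pieces of 4
    sample = torch.randn(2, 16, 64)    # (batch, latent tokens, dim); 16 is divisible by 4
    context = torch.randn(2, 8, 64)    # (batch, text tokens, dim); 8 is divisible by 4
    with torch.no_grad():
        sample_out, context_out = block(sample, context)
    assert sample_out.shape == sample.shape
    assert context_out.shape == context.shape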