DragNUWA

Runtime error

DragNUWA / dragnuwa /svd /modules /diffusionmodules /video_model_flow.py

yinshengming

init

ab85cf9 over 1 year ago

49 kB

	import torch
	import torch as th
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.utils.checkpoint import checkpoint as pt_checkpoint # gradient checkpointing from pytorch
	from functools import partial
	from typing import List, Optional, Union
	import deepspeed
	from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint as ds_checkpoint # gradient checkpointing from deepspeed

	from einops import rearrange

	from dragnuwa.svd.modules.video_attention import SpatialVideoTransformer
	from dragnuwa.svd.modules.attention import SpatialTransformer
	from dragnuwa.svd.util import default
	from dragnuwa.svd.modules.diffusionmodules.util import (AlphaBlender, avg_pool_nd, conv_nd, linear,
	normalization,
	timestep_embedding, zero_module)

	import logging
	import math
	from abc import abstractmethod
	from typing import Iterable, List, Optional, Tuple, Union

	# Module-level logger, named after this module.
	logpy = logging.getLogger(__name__)

	# Selects the gradient-checkpointing backend used by `checkpoint()`:
	# 'ds' prefers DeepSpeed's implementation, 'pt' uses PyTorch's.
	GRADIENT_CHECKPOINTING = 'ds' # 'ds' or 'pt'

	def is_deepspeed_initialized():
	    """
	    Report whether DeepSpeed's communication backend has been set up.

	    :return: True when `deepspeed.comm.comm.cdb` is not None and its
	        `is_initialized()` call is truthy, otherwise False.
	    """
	    backend = deepspeed.comm.comm.cdb
	    if backend is None:
	        return False
	    return True if backend.is_initialized() else False

	def checkpoint(func, *args, **kwargs):
	    """
	    Run `func(*args, **kwargs)` under gradient checkpointing.

	    The backend is chosen by the module-level `GRADIENT_CHECKPOINTING` flag:
	    'ds' uses DeepSpeed's checkpoint when DeepSpeed is initialized and falls
	    back to PyTorch's otherwise; 'pt' always uses PyTorch's.

	    :param func: the callable to checkpoint.
	    :param args: positional arguments forwarded to the backend (and on to `func`).
	    :param kwargs: keyword arguments forwarded unchanged to the backend.
	        NOTE(review): DeepSpeed's checkpoint may only accept positional
	        arguments - confirm before passing keywords on the 'ds' path.
	    :return: whatever the selected backend returns for `func`.
	    :raises ValueError: if `GRADIENT_CHECKPOINTING` is neither 'ds' nor 'pt'.
	    """
	    if GRADIENT_CHECKPOINTING not in ('ds', 'pt'):
	        raise ValueError(f'Invalid gradient checkpointing method: {GRADIENT_CHECKPOINTING}')

	    # Only the 'ds' setting may use DeepSpeed, and only once it is initialized.
	    if GRADIENT_CHECKPOINTING == 'ds' and is_deepspeed_initialized():
	        return ds_checkpoint(func, *args, **kwargs)
	    return pt_checkpoint(func, *args, **kwargs)


	class TimestepBlock(nn.Module):
	    """
	    Base class for modules whose forward() receives the timestep embeddings
	    as its second argument.
	    """

	    @abstractmethod
	    def forward(self, x: th.Tensor, emb: th.Tensor):
	        """
	        Apply the module to `x`, conditioned on the timestep embeddings `emb`.
	        """


	class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
	    """
	    A sequential container that hands every child only the conditioning
	    inputs (timestep embedding, context, frame count, flow, ...) that the
	    child's type is dispatched on below.
	    """

	    def forward(
	        self,
	        x: th.Tensor,
	        emb: th.Tensor,
	        context: Optional[th.Tensor] = None,
	        image_only_indicator: Optional[th.Tensor] = None,
	        time_context: Optional[int] = None,
	        num_video_frames: Optional[int] = None,
	        flow: Optional[th.Tensor] = None,
	    ):
	        """
	        Feed `x` through the children in order, dispatching on each child's type.
	        :param x: the input tensor; 1D convolutions assume it is laid out as
	            "(b f) c h w" with f = `num_video_frames`.
	        :param emb: timestep embeddings, given to every TimestepBlock child.
	        :param context: passed to the transformer children.
	        :param image_only_indicator: passed to the video res blocks and to
	            SpatialVideoTransformer children.
	        :param time_context: passed to SpatialVideoTransformer children.
	        :param num_video_frames: frames per video, used by the video-aware children.
	        :param flow: passed only to VideoResBlock_Embed children.
	        :return: the output of the last child.
	        """
	        for layer in self:
	            # The two video res blocks are TimestepBlocks as well, so they are
	            # tested before the generic TimestepBlock case.
	            if isinstance(layer, VideoResBlock):
	                x = layer(x, emb, num_video_frames, image_only_indicator)
	            elif isinstance(layer, VideoResBlock_Embed):
	                x = layer(x, emb, num_video_frames, image_only_indicator, flow)
	            elif isinstance(layer, TimestepBlock):
	                x = layer(x, emb)
	            elif isinstance(layer, SpatialVideoTransformer):
	                x = layer(x, context, time_context, num_video_frames, image_only_indicator)
	            elif isinstance(layer, SpatialTransformer):
	                x = layer(x, context)
	            elif isinstance(layer, nn.Conv1d):
	                # A 1D convolution runs along the frame axis: fold the spatial
	                # positions into the batch, convolve, then restore the layout.
	                height, width = x.shape[-2:]
	                x = rearrange(x, "(b f) c h w -> (b h w) c f", f=num_video_frames)
	                x = layer(x)
	                x = rearrange(x, "(b h w) c f -> (b f) c h w", h=height, w=width)
	            else:
	                # Everything else (nn.Conv2d included) only takes the tensor.
	                x = layer(x)
	        return x


	class Upsample(nn.Module):
	    """
	    Nearest-neighbour upsampling, optionally followed by a convolution.
	    :param channels: channels in the inputs (checked in forward()).
	    :param use_conv: a bool determining if a convolution is applied.
	    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
	        upsampling occurs in the inner-two dimensions unless `third_up` is set.
	    :param out_channels: output channels of the convolution; defaults to `channels`.
	    :param padding: padding of the convolution.
	    :param third_up: for 3D signals, also scale the first (third-from-last) axis.
	    :param kernel_size: kernel size of the convolution.
	    :param scale_factor: the upsampling factor.
	    """

	    def __init__(
	        self,
	        channels: int,
	        use_conv: bool,
	        dims: int = 2,
	        out_channels: Optional[int] = None,
	        padding: int = 1,
	        third_up: bool = False,
	        kernel_size: int = 3,
	        scale_factor: int = 2,
	    ):
	        super().__init__()
	        self.channels = channels
	        self.out_channels = out_channels or channels
	        self.use_conv = use_conv
	        self.dims = dims
	        self.third_up = third_up
	        self.scale_factor = scale_factor
	        if not use_conv:
	            return
	        self.conv = conv_nd(
	            dims, self.channels, self.out_channels, kernel_size, padding=padding
	        )

	    def forward(self, x: th.Tensor) -> th.Tensor:
	        """
	        Upsample `x` and apply the optional convolution.
	        :param x: a tensor whose axis 1 has `channels` entries.
	        :return: the upsampled (and possibly convolved) tensor.
	        """
	        assert x.shape[1] == self.channels

	        if self.dims != 3:
	            x = F.interpolate(x, scale_factor=self.scale_factor, mode="nearest")
	        else:
	            # For 3D input the target size is spelled out so that the first of
	            # the three trailing axes is only scaled when `third_up` is set.
	            depth_factor = self.scale_factor if self.third_up else 1
	            target_size = (
	                depth_factor * x.shape[2],
	                x.shape[3] * self.scale_factor,
	                x.shape[4] * self.scale_factor,
	            )
	            x = F.interpolate(x, target_size, mode="nearest")

	        return self.conv(x) if self.use_conv else x


	class Downsample(nn.Module):
	    """
	    A downsampling layer: a strided convolution or an average pooling.
	    :param channels: channels in the inputs (checked in forward()).
	    :param use_conv: a bool determining if a convolution is applied;
	        otherwise average pooling is used.
	    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
	        downsampling occurs in the inner-two dimensions unless `third_down` is set.
	    :param out_channels: output channels; defaults to `channels` and must
	        equal `channels` when pooling is used.
	    :param padding: padding of the convolution.
	    :param third_down: for 3D signals, also stride over the first of the
	        three trailing axes.
	    """

	    def __init__(
	        self,
	        channels: int,
	        use_conv: bool,
	        dims: int = 2,
	        out_channels: Optional[int] = None,
	        padding: int = 1,
	        third_down: bool = False,
	    ):
	        super().__init__()
	        self.channels = channels
	        self.out_channels = out_channels or channels
	        self.use_conv = use_conv
	        self.dims = dims

	        if dims == 3:
	            stride = (2, 2, 2) if third_down else (1, 2, 2)
	        else:
	            stride = 2

	        if not use_conv:
	            # Pooling cannot change the channel count.
	            assert self.channels == self.out_channels
	            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
	            return

	        logpy.info(f"Building a Downsample layer with {dims} dims.")
	        logpy.info(
	            f" --> settings are: \n in-chn: {self.channels}, out-chn: {self.out_channels}, "
	            f"kernel-size: 3, stride: {stride}, padding: {padding}"
	        )
	        if dims == 3:
	            logpy.info(f" --> Downsampling third axis (time): {third_down}")
	        self.op = conv_nd(
	            dims,
	            self.channels,
	            self.out_channels,
	            3,
	            stride=stride,
	            padding=padding,
	        )

	    def forward(self, x: th.Tensor) -> th.Tensor:
	        """
	        Downsample `x`.
	        :param x: a tensor whose axis 1 has `channels` entries.
	        :return: the output of the configured convolution or pooling op.
	        """
	        assert x.shape[1] == self.channels
	        downsampled = self.op(x)
	        return downsampled


	class ResBlock(TimestepBlock):
	    """
	    A residual block, conditioned on a timestep embedding, that can
	    optionally change the number of channels and resample its input.
	    :param channels: the number of input channels.
	    :param emb_channels: the number of timestep embedding channels.
	    :param dropout: the rate of dropout.
	    :param out_channels: if specified, the number of out channels.
	    :param use_conv: if True and out_channels is specified, use a spatial
	        convolution instead of a smaller 1x1 convolution to change the
	        channels in the skip connection.
	    :param use_scale_shift_norm: if True, the embedding is projected to
	        2 * out_channels and split into a scale and a shift applied after the
	        output normalization; otherwise the embedding is added to the features.
	    :param dims: determines if the signal is 1D, 2D, or 3D.
	    :param use_checkpoint: if True, use gradient checkpointing on this module.
	    :param up: if True, use this block for upsampling.
	    :param down: if True, use this block for downsampling.
	    :param kernel_size: convolution kernel size, an int or an iterable of
	        ints; the padding is `k // 2` for each entry.
	    :param exchange_temb_dims: if True (and scale-shift is off), axes 1 and 2
	        of the embedding are swapped before it is added to the features.
	    :param skip_t_emb: if True, no embedding layers are built and zeros are
	        used in place of the embedding; incompatible with use_scale_shift_norm.
	    """

	    def __init__(
	        self,
	        channels: int,
	        emb_channels: int,
	        dropout: float,
	        out_channels: Optional[int] = None,
	        use_conv: bool = False,
	        use_scale_shift_norm: bool = False,
	        dims: int = 2,
	        use_checkpoint: bool = False,
	        up: bool = False,
	        down: bool = False,
	        kernel_size: int = 3,
	        exchange_temb_dims: bool = False,
	        skip_t_emb: bool = False,
	    ):
	        super().__init__()
	        self.channels = channels
	        self.emb_channels = emb_channels
	        self.dropout = dropout
	        self.out_channels = out_channels or channels
	        self.use_conv = use_conv
	        self.use_checkpoint = use_checkpoint
	        self.use_scale_shift_norm = use_scale_shift_norm
	        self.exchange_temb_dims = exchange_temb_dims

	        # "Same" padding, per axis when the kernel size is given per axis.
	        if not isinstance(kernel_size, Iterable):
	            padding = kernel_size // 2
	        else:
	            padding = [k // 2 for k in kernel_size]

	        self.in_layers = nn.Sequential(
	            normalization(channels),
	            nn.SiLU(),
	            conv_nd(dims, channels, self.out_channels, kernel_size, padding=padding),
	        )

	        self.updown = up or down

	        # The features and the skip path get separate resampling modules.
	        if up:
	            self.h_upd = Upsample(channels, False, dims)
	            self.x_upd = Upsample(channels, False, dims)
	        elif down:
	            self.h_upd = Downsample(channels, False, dims)
	            self.x_upd = Downsample(channels, False, dims)
	        else:
	            self.h_upd = self.x_upd = nn.Identity()

	        self.skip_t_emb = skip_t_emb
	        if use_scale_shift_norm:
	            self.emb_out_channels = 2 * self.out_channels
	        else:
	            self.emb_out_channels = self.out_channels

	        if not self.skip_t_emb:
	            self.emb_layers = nn.Sequential(
	                nn.SiLU(),
	                linear(emb_channels, self.emb_out_channels),
	            )
	        else:
	            logpy.info(f"Skipping timestep embedding in {self.__class__.__name__}")
	            assert not self.use_scale_shift_norm
	            self.emb_layers = None
	            self.exchange_temb_dims = False

	        out_conv = conv_nd(
	            dims,
	            self.out_channels,
	            self.out_channels,
	            kernel_size,
	            padding=padding,
	        )
	        self.out_layers = nn.Sequential(
	            normalization(self.out_channels),
	            nn.SiLU(),
	            nn.Dropout(p=dropout),
	            zero_module(out_conv),
	        )

	        if self.out_channels == channels:
	            self.skip_connection = nn.Identity()
	        elif use_conv:
	            self.skip_connection = conv_nd(
	                dims, channels, self.out_channels, kernel_size, padding=padding
	            )
	        else:
	            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

	    def forward(self, x: th.Tensor, emb: th.Tensor) -> th.Tensor:
	        """
	        Apply the block to a Tensor, conditioned on a timestep embedding.
	        :param x: an [N x C x ...] Tensor of features.
	        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
	        :return: an [N x C x ...] Tensor of outputs.
	        """
	        if not self.use_checkpoint:
	            return self._forward(x, emb)
	        return checkpoint(self._forward, x, emb)

	    def _forward(self, x: th.Tensor, emb: th.Tensor) -> th.Tensor:
	        """The actual computation behind forward(); same arguments and result."""
	        if not self.updown:
	            h = self.in_layers(x)
	        else:
	            # Resample between the norm/activation and the input convolution,
	            # and resample the skip path the same way.
	            pre_conv, conv = self.in_layers[:-1], self.in_layers[-1]
	            h = pre_conv(x)
	            h = self.h_upd(h)
	            x = self.x_upd(x)
	            h = conv(h)

	        if self.skip_t_emb:
	            emb_out = th.zeros_like(h)
	        else:
	            emb_out = self.emb_layers(emb).type(h.dtype)

	        # Append trailing singleton axes until the embedding has as many axes
	        # as the features, so the two broadcast against each other.
	        for _ in range(len(h.shape) - len(emb_out.shape)):
	            emb_out = emb_out[..., None]

	        if not self.use_scale_shift_norm:
	            if self.exchange_temb_dims:
	                emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
	            h = self.out_layers(h + emb_out)
	        else:
	            norm, remainder = self.out_layers[0], self.out_layers[1:]
	            scale, shift = th.chunk(emb_out, 2, dim=1)
	            h = remainder(norm(h) * (1 + scale) + shift)

	        return self.skip_connection(x) + h


	class AttentionBlock(nn.Module):
	    """
	    An attention block that allows spatial positions to attend to each other.
	    Originally ported from here, but adapted to the N-d case.
	    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
	    :param channels: the number of input (and output) channels.
	    :param num_heads: the number of attention heads, used when
	        `num_head_channels` is -1.
	    :param num_head_channels: if not -1, the channels per head; the number
	        of heads is then `channels // num_head_channels`.
	    :param use_checkpoint: if True, use gradient checkpointing on this module.
	    :param use_new_attention_order: if True, use QKVAttention (split qkv
	        before heads), otherwise QKVAttentionLegacy (split heads before qkv).
	    """

	    def __init__(
	        self,
	        channels: int,
	        num_heads: int = 1,
	        num_head_channels: int = -1,
	        use_checkpoint: bool = False,
	        use_new_attention_order: bool = False,
	    ):
	        super().__init__()
	        self.channels = channels
	        if num_head_channels == -1:
	            self.num_heads = num_heads
	        else:
	            assert (
	                channels % num_head_channels == 0
	            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
	            self.num_heads = channels // num_head_channels
	        self.use_checkpoint = use_checkpoint
	        self.norm = normalization(channels)
	        self.qkv = conv_nd(1, channels, channels * 3, 1)
	        if use_new_attention_order:
	            # split qkv before split heads
	            self.attention = QKVAttention(self.num_heads)
	        else:
	            # split heads before split qkv
	            self.attention = QKVAttentionLegacy(self.num_heads)

	        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))

	    def forward(self, x: th.Tensor, **kwargs) -> th.Tensor:
	        """
	        Apply self-attention over all non-batch, non-channel positions of `x`.
	        :param x: an [N x C x ...] Tensor of features.
	        :param kwargs: accepted for call compatibility and ignored.
	        :return: a Tensor with the same shape as `x`.
	        """
	        # Honour the constructor flag (as ResBlock does) instead of always
	        # checkpointing regardless of `use_checkpoint`.
	        if self.use_checkpoint:
	            return checkpoint(self._forward, x)
	        return self._forward(x)

	    def _forward(self, x: th.Tensor) -> th.Tensor:
	        """The actual computation behind forward()."""
	        b, c, *spatial = x.shape
	        # Flatten every trailing axis into one sequence axis for the 1D layers.
	        x = x.reshape(b, c, -1)
	        qkv = self.qkv(self.norm(x))
	        h = self.attention(qkv)
	        h = self.proj_out(h)
	        # Residual connection, then restore the original trailing axes.
	        return (x + h).reshape(b, c, *spatial)


	class QKVAttentionLegacy(nn.Module):
	    """
	    QKV attention that splits the heads first and q/k/v second.
	    Matches legacy QKVAttention + input/output heads shaping.
	    """

	    def __init__(self, n_heads: int):
	        super().__init__()
	        self.n_heads = n_heads

	    def forward(self, qkv: th.Tensor) -> th.Tensor:
	        """
	        Apply QKV attention.
	        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
	        :return: an [N x (H * C) x T] tensor after attention.
	        """
	        bs, width, length = qkv.shape
	        heads = self.n_heads
	        assert width % (3 * heads) == 0
	        ch = width // (3 * heads)

	        # Fold the heads into the batch, then cut each head into q, k and v.
	        per_head = qkv.reshape(bs * heads, ch * 3, length)
	        q, k, v = per_head.split(ch, dim=1)

	        # Scaling q and k separately is more stable with f16 than dividing
	        # the product afterwards.
	        scale = 1 / math.sqrt(math.sqrt(ch))
	        logits = th.einsum("bct,bcs->bts", q * scale, k * scale)

	        # Softmax in float32, then back to the dtype of the logits.
	        probs = th.softmax(logits.float(), dim=-1).type(logits.dtype)
	        attended = th.einsum("bts,bcs->bct", probs, v)
	        return attended.reshape(bs, -1, length)


	class QKVAttention(nn.Module):
	    """
	    QKV attention that splits q/k/v first and the heads second.
	    """

	    def __init__(self, n_heads: int):
	        super().__init__()
	        self.n_heads = n_heads

	    def forward(self, qkv: th.Tensor) -> th.Tensor:
	        """
	        Apply QKV attention.
	        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
	        :return: an [N x (H * C) x T] tensor after attention.
	        """
	        bs, width, length = qkv.shape
	        assert width % (3 * self.n_heads) == 0
	        ch = width // (3 * self.n_heads)
	        head_shape = (bs * self.n_heads, ch, length)

	        q, k, v = qkv.chunk(3, dim=1)

	        # Scaling q and k separately is more stable with f16 than dividing
	        # the product afterwards.
	        scale = 1 / math.sqrt(math.sqrt(ch))
	        q_heads = (q * scale).view(*head_shape)
	        k_heads = (k * scale).view(*head_shape)
	        weight = th.einsum("bct,bcs->bts", q_heads, k_heads)

	        # Softmax in float32, then back to the dtype of the logits.
	        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
	        v_heads = v.reshape(*head_shape)
	        attended = th.einsum("bts,bcs->bct", weight, v_heads)
	        return attended.reshape(bs, -1, length)


	class Timestep(nn.Module):
	    """
	    Module wrapper around `timestep_embedding` with a fixed embedding size.
	    :param dim: the size passed as second argument to `timestep_embedding`.
	    """

	    def __init__(self, dim: int):
	        super().__init__()
	        self.dim = dim

	    def forward(self, t: th.Tensor) -> th.Tensor:
	        """Return `timestep_embedding(t, self.dim)`."""
	        embedding = timestep_embedding(t, self.dim)
	        return embedding

	class FloatGroupNorm(nn.GroupNorm):
	    """
	    GroupNorm that computes in the dtype of its affine parameters and
	    returns the result in the dtype of the input.
	    """

	    def forward(self, x):
	        """
	        Normalize `x`.
	        :param x: the input tensor.
	        :return: the normalized tensor, cast back to `x.dtype`.
	        """
	        if self.bias is None:
	            # Built with affine=False: there is no parameter dtype to match,
	            # so normalize in the input's own dtype instead of failing on
	            # `None.dtype`.
	            return super().forward(x)
	        return super().forward(x.to(self.bias.dtype)).type(x.dtype)

	class VideoResBlock(ResBlock):
	    """
	    A ResBlock applied per frame, followed by a 3D ResBlock over the frame
	    axis; the two results are blended by an AlphaBlender.
	    :param video_kernel_size: kernel size of the 3D (temporal) ResBlock.
	    :param merge_strategy: passed to AlphaBlender as `merge_strategy`.
	    :param merge_factor: passed to AlphaBlender as `alpha`.
	    The remaining parameters are forwarded to ResBlock.
	    """

	    def __init__(
	        self,
	        channels: int,
	        emb_channels: int,
	        dropout: float,
	        video_kernel_size: Union[int, List[int]] = 3,
	        merge_strategy: str = "fixed",
	        merge_factor: float = 0.5,
	        out_channels: Optional[int] = None,
	        use_conv: bool = False,
	        use_scale_shift_norm: bool = False,
	        dims: int = 2,
	        use_checkpoint: bool = False,
	        up: bool = False,
	        down: bool = False,
	    ):
	        super().__init__(
	            channels,
	            emb_channels,
	            dropout,
	            out_channels=out_channels,
	            use_conv=use_conv,
	            use_scale_shift_norm=use_scale_shift_norm,
	            dims=dims,
	            use_checkpoint=use_checkpoint,
	            up=up,
	            down=down,
	        )

	        # The temporal block keeps the channel count of the spatial output.
	        temporal_kwargs = dict(
	            dropout=dropout,
	            dims=3,
	            out_channels=default(out_channels, channels),
	            use_scale_shift_norm=False,
	            use_conv=False,
	            up=False,
	            down=False,
	            kernel_size=video_kernel_size,
	            use_checkpoint=use_checkpoint,
	            exchange_temb_dims=True,
	        )
	        self.time_stack = ResBlock(
	            default(out_channels, channels), emb_channels, **temporal_kwargs
	        )
	        self.time_mixer = AlphaBlender(
	            alpha=merge_factor,
	            merge_strategy=merge_strategy,
	            rearrange_pattern="b t -> b 1 t 1 1",
	        )

	    def forward(
	        self,
	        x: th.Tensor,
	        emb: th.Tensor,
	        num_video_frames: int,
	        image_only_indicator: Optional[th.Tensor] = None,
	    ) -> th.Tensor:
	        """
	        :param x: features laid out as "(b t) c h w" with t = `num_video_frames`.
	        :param emb: timestep embeddings whose first axis is "(b t)".
	        :param num_video_frames: the number of frames per video.
	        :param image_only_indicator: passed to the AlphaBlender.
	        :return: the blended features, laid out as "(b t) c h w".
	        """
	        frames = super().forward(x, emb)

	        to_video = "(b t) c h w -> b c t h w"
	        spatial_out = rearrange(frames, to_video, t=num_video_frames)
	        temporal_in = rearrange(frames, to_video, t=num_video_frames)

	        video_emb = rearrange(emb, "(b t) ... -> b t ...", t=num_video_frames)
	        temporal_out = self.time_stack(temporal_in, video_emb)

	        blended = self.time_mixer(
	            x_spatial=spatial_out,
	            x_temporal=temporal_out,
	            image_only_indicator=image_only_indicator,
	        )
	        return rearrange(blended, "b c t h w -> (b t) c h w")

	class ResBlockEmbed(TimestepBlock):
	    """
	    A residual block that can optionally change the number of channels and
	    that additionally modulates its features with a flow tensor.
	    :param channels: the number of input channels.
	    :param emb_channels: the number of timestep embedding channels.
	    :param dropout: the rate of dropout.
	    :param out_channels: if specified, the number of out channels.
	    :param use_conv: if True and out_channels is specified, use a spatial
	        convolution instead of a smaller 1x1 convolution to change the
	        channels in the skip connection.
	    :param use_scale_shift_norm: if True, the embedding is projected to
	        2 * out_channels and applied as scale and shift after the output
	        normalization; otherwise it is added to the features.
	    :param dims: determines if the signal is 1D, 2D, or 3D.
	    :param use_checkpoint: if True, use gradient checkpointing on this module.
	    :param up: if True, use this block for upsampling.
	    :param down: if True, use this block for downsampling.
	    :param kernel_size: convolution kernel size, an int or an iterable of
	        ints; the padding is `k // 2` for each entry.
	    :param exchange_temb_dims: if True (and scale-shift is off), axes 1 and 2
	        of the embedding are swapped before it is added to the features.
	    :param skip_t_emb: if True, no embedding layers are built and zeros are
	        used in place of the embedding; incompatible with use_scale_shift_norm.
	    :param is_same_channel: if True the flow input is expected to have
	        `out_channels // flow_dim_scale` channels, otherwise half of that.
	    :param flow_dim_scale: divisor relating feature channels to flow channels.
	    """

	    def __init__(
	        self,
	        channels: int,
	        emb_channels: int,
	        dropout: float,
	        out_channels: Optional[int] = None,
	        use_conv: bool = False,
	        use_scale_shift_norm: bool = False,
	        dims: int = 2,
	        use_checkpoint: bool = False,
	        up: bool = False,
	        down: bool = False,
	        kernel_size: int = 3,
	        exchange_temb_dims: bool = False,
	        skip_t_emb: bool = False,
	        is_same_channel: bool = True,
	        flow_dim_scale: int = 8,
	    ):
	        super().__init__()
	        self.channels = channels
	        self.emb_channels = emb_channels
	        self.dropout = dropout
	        self.out_channels = out_channels or channels
	        self.use_conv = use_conv
	        self.use_checkpoint = use_checkpoint
	        self.use_scale_shift_norm = use_scale_shift_norm
	        self.exchange_temb_dims = exchange_temb_dims

	        # "Same" padding, per axis when the kernel size is given per axis.
	        if isinstance(kernel_size, Iterable):
	            padding = [k // 2 for k in kernel_size]
	        else:
	            padding = kernel_size // 2

	        self.in_layers = nn.Sequential(
	            normalization(channels),
	            nn.SiLU(),
	            conv_nd(dims, channels, self.out_channels, kernel_size, padding=padding),
	        )

	        self.updown = up or down

	        if up:
	            self.h_upd = Upsample(channels, False, dims)
	            self.x_upd = Upsample(channels, False, dims)
	        elif down:
	            self.h_upd = Downsample(channels, False, dims)
	            self.x_upd = Downsample(channels, False, dims)
	        else:
	            self.h_upd = self.x_upd = nn.Identity()

	        #### add layers to deal with flow
	        self.flow_cond_norm = FloatGroupNorm(32, self.out_channels)

	        if is_same_channel:
	            flow_in_channel = self.out_channels // flow_dim_scale
	        else:
	            flow_in_channel = self.out_channels // flow_dim_scale // 2

	        # gamma and beta each get a spatial 2D conv followed by a temporal 1D
	        # conv that is wrapped in zero_module.
	        self.flow_gamma_spatial = nn.Conv2d(flow_in_channel, self.out_channels // 4, 3, padding=1)
	        self.flow_gamma_temporal = zero_module(
	            nn.Conv1d(
	                self.out_channels // 4,
	                self.out_channels,
	                kernel_size=3,
	                stride=1,
	                padding=1,
	                padding_mode='replicate',
	            )
	        )
	        self.flow_beta_spatial = nn.Conv2d(flow_in_channel, self.out_channels // 4, 3, padding=1)
	        self.flow_beta_temporal = zero_module(
	            nn.Conv1d(
	                self.out_channels // 4,
	                self.out_channels,
	                kernel_size=3,
	                stride=1,
	                padding=1,
	                padding_mode='replicate',
	            )
	        )

	        self.skip_t_emb = skip_t_emb
	        self.emb_out_channels = (
	            2 * self.out_channels if use_scale_shift_norm else self.out_channels
	        )
	        if self.skip_t_emb:
	            logpy.info(f"Skipping timestep embedding in {self.__class__.__name__}")
	            assert not self.use_scale_shift_norm
	            self.emb_layers = None
	            self.exchange_temb_dims = False
	        else:
	            self.emb_layers = nn.Sequential(
	                nn.SiLU(),
	                linear(
	                    emb_channels,
	                    self.emb_out_channels,
	                ),
	            )

	        self.out_layers = nn.Sequential(
	            normalization(self.out_channels),
	            nn.SiLU(),
	            nn.Dropout(p=dropout),
	            zero_module(
	                conv_nd(
	                    dims,
	                    self.out_channels,
	                    self.out_channels,
	                    kernel_size,
	                    padding=padding,
	                )
	            ),
	        )

	        if self.out_channels == channels:
	            self.skip_connection = nn.Identity()
	        elif use_conv:
	            self.skip_connection = conv_nd(
	                dims, channels, self.out_channels, kernel_size, padding=padding
	            )
	        else:
	            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

	    def forward(self, x: th.Tensor, emb: th.Tensor, num_video_frames: int, flow: th.Tensor) -> th.Tensor:
	        """
	        Apply the block to a Tensor, conditioned on a timestep embedding and a flow.
	        :param x: an [N x C x ...] Tensor of features.
	        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
	        :param num_video_frames: the number of frames per video; N is "(b f)".
	        :param flow: the flow features, laid out as "(b f) c h w".
	        :return: an [N x C x ...] Tensor of outputs.
	        """
	        if not self.use_checkpoint:
	            # Without checkpointing the frame count can stay a plain int; no
	            # tensor round trip is needed.
	            return self._forward(x, emb, num_video_frames, flow)

	        # The frame count is handed to the checkpoint backend as a tensor
	        # alongside the other tensor arguments.
	        frames_tensor = torch.tensor(num_video_frames).to(x.device).to(x.dtype)
	        return checkpoint(self._forward, x, emb, frames_tensor, flow)

	    def _forward(
	        self,
	        x: th.Tensor,
	        emb: th.Tensor,
	        num_video_frames: Union[int, th.Tensor],
	        flow: th.Tensor,
	    ) -> th.Tensor:
	        """The actual computation behind forward(); `num_video_frames` may be an int or a scalar tensor."""
	        if isinstance(num_video_frames, th.Tensor):
	            # `.item()` works for every dtype, whereas `.numpy()` raises for
	            # bfloat16 tensors.
	            num_video_frames = int(num_video_frames.item())
	        else:
	            num_video_frames = int(num_video_frames)

	        if self.updown:
	            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
	            h = in_rest(x)
	            h = self.h_upd(h)
	            x = self.x_upd(x)
	            h = in_conv(h)
	        else:
	            h = self.in_layers(x)

	        assert flow is not None, 'You must provide the flow to the ResBlockEmbed'
	        gamma_flow = self.flow_gamma_spatial(flow)
	        beta_flow = self.flow_beta_spatial(flow)
	        _, _, hh, wh = beta_flow.shape
	        # Fold the spatial positions into the batch so that the 1D convs run
	        # along the frame axis, then restore the "(b f) c h w" layout.
	        gamma_flow = rearrange(gamma_flow, "(b f) c h w -> (b h w) c f", f=num_video_frames)
	        beta_flow = rearrange(beta_flow, "(b f) c h w -> (b h w) c f", f=num_video_frames)
	        gamma_flow = self.flow_gamma_temporal(gamma_flow)
	        beta_flow = self.flow_beta_temporal(beta_flow)
	        gamma_flow = rearrange(gamma_flow, "(b h w) c f -> (b f) c h w", h=hh, w=wh)
	        beta_flow = rearrange(beta_flow, "(b h w) c f -> (b f) c h w", h=hh, w=wh)
	        # Residual modulation of the features by the flow.
	        h = h + self.flow_cond_norm(h) * gamma_flow + beta_flow

	        if self.skip_t_emb:
	            emb_out = th.zeros_like(h)
	        else:
	            emb_out = self.emb_layers(emb).type(h.dtype)
	        # Append trailing singleton axes so the embedding broadcasts against h.
	        while len(emb_out.shape) < len(h.shape):
	            emb_out = emb_out[..., None]
	        if self.use_scale_shift_norm:
	            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
	            scale, shift = th.chunk(emb_out, 2, dim=1)
	            h = out_norm(h) * (1 + scale) + shift
	            h = out_rest(h)
	        else:
	            if self.exchange_temb_dims:
	                emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
	            h = h + emb_out
	            h = self.out_layers(h)
	        return self.skip_connection(x) + h

	class VideoResBlock_Embed(ResBlockEmbed):
	    """
	    A flow-conditioned ResBlockEmbed applied per frame, followed by a 3D
	    ResBlock over the frame axis; the two results are blended by an AlphaBlender.
	    :param video_kernel_size: kernel size of the 3D (temporal) ResBlock.
	    :param merge_strategy: passed to AlphaBlender as `merge_strategy`.
	    :param merge_factor: passed to AlphaBlender as `alpha`.
	    The remaining parameters are forwarded to ResBlockEmbed.
	    """

	    def __init__(
	        self,
	        channels: int,
	        emb_channels: int,
	        dropout: float,
	        video_kernel_size: Union[int, List[int]] = 3,
	        merge_strategy: str = "fixed",
	        merge_factor: float = 0.5,
	        out_channels: Optional[int] = None,
	        use_conv: bool = False,
	        use_scale_shift_norm: bool = False,
	        dims: int = 2,
	        use_checkpoint: bool = False,
	        up: bool = False,
	        down: bool = False,
	        is_same_channel: bool = True,
	        flow_dim_scale: int = 8,
	    ):
	        super().__init__(
	            channels,
	            emb_channels,
	            dropout,
	            out_channels=out_channels,
	            use_conv=use_conv,
	            use_scale_shift_norm=use_scale_shift_norm,
	            dims=dims,
	            use_checkpoint=use_checkpoint,
	            up=up,
	            down=down,
	            is_same_channel=is_same_channel,
	            flow_dim_scale=flow_dim_scale,
	        )

	        # The temporal block keeps the channel count of the spatial output.
	        temporal_kwargs = dict(
	            dropout=dropout,
	            dims=3,
	            out_channels=default(out_channels, channels),
	            use_scale_shift_norm=False,
	            use_conv=False,
	            up=False,
	            down=False,
	            kernel_size=video_kernel_size,
	            use_checkpoint=use_checkpoint,
	            exchange_temb_dims=True,
	        )
	        self.time_stack = ResBlock(
	            default(out_channels, channels), emb_channels, **temporal_kwargs
	        )
	        self.time_mixer = AlphaBlender(
	            alpha=merge_factor,
	            merge_strategy=merge_strategy,
	            rearrange_pattern="b t -> b 1 t 1 1",
	        )

	    def forward(
	        self,
	        x: th.Tensor,
	        emb: th.Tensor,
	        num_video_frames: int,
	        image_only_indicator: Optional[th.Tensor] = None,
	        flow: th.Tensor = None,
	    ) -> th.Tensor:
	        """
	        :param x: features laid out as "(b t) c h w" with t = `num_video_frames`.
	        :param emb: timestep embeddings whose first axis is "(b t)".
	        :param num_video_frames: the number of frames per video.
	        :param image_only_indicator: passed to the AlphaBlender.
	        :param flow: flow features handed to ResBlockEmbed.forward.
	        :return: the blended features, laid out as "(b t) c h w".
	        """
	        frames = super().forward(x, emb, num_video_frames, flow)

	        to_video = "(b t) c h w -> b c t h w"
	        spatial_out = rearrange(frames, to_video, t=num_video_frames)
	        temporal_in = rearrange(frames, to_video, t=num_video_frames)

	        video_emb = rearrange(emb, "(b t) ... -> b t ...", t=num_video_frames)
	        temporal_out = self.time_stack(temporal_in, video_emb)

	        blended = self.time_mixer(
	            x_spatial=spatial_out,
	            x_temporal=temporal_out,
	            image_only_indicator=image_only_indicator,
	        )
	        return rearrange(blended, "b c t h w -> (b t) c h w")


	class VideoUNet_flow(nn.Module):
	    """
	    A video UNet whose first residual block on every level (encoder and
	    decoder side) is additionally conditioned on a multi-scale flow / drag
	    feature pyramid computed from the `flow` input.
	    :param in_channels: channels of the input tensor `x`.
	    :param model_channels: base channel count of the network.
	    :param out_channels: channels of the output tensor.
	    :param num_res_blocks: residual blocks per level on the encoder side
	        (the decoder side builds one more per level).
	    :param attention_resolutions: container of downsample factors at which
	        attention layers are added (tested with `ds in attention_resolutions`).
	    :param dropout: the dropout rate.
	    :param channel_mult: per-level channel multipliers of `model_channels`.
	    :param conv_resample: use convolutions in the Downsample/Upsample layers.
	    :param dims: dimensionality passed to the convolution helpers.
	    :param num_classes: None, an int (embedding table), or one of the strings
	        "continuous", "timestep", "sequential" selecting the `y` embedding.
	    :param use_checkpoint: use gradient checkpointing in the sub-blocks.
	    :param num_heads: attention heads, used when `num_head_channels` is -1.
	    :param num_head_channels: channels per head; if not -1 the number of
	        heads is derived as `ch // num_head_channels`.
	    :param num_heads_upsample: stored on the instance; defaults to `num_heads`.
	    :param use_scale_shift_norm: forwarded to the residual blocks.
	    :param resblock_updown: accepted but not used by this constructor.
	    :param transformer_depth: transformer depth per level (an int is
	        repeated for every level).
	    :param transformer_depth_middle: depth of the middle transformer;
	        defaults to the last entry of `transformer_depth`.
	    :param context_dim: context dimension for the transformers; required.
	    :param time_downup: also resample the third axis in Down/Upsample layers.
	    :param adm_in_channels: input size of the "sequential" label embedding.
	    :param flow_dim_scale: divisor relating feature channels to flow channels.
	    The remaining parameters are forwarded unchanged to
	    SpatialVideoTransformer or to the video residual blocks.
	    """

	    def __init__(
	        self,
	        in_channels: int,
	        model_channels: int,
	        out_channels: int,
	        num_res_blocks: int,
	        attention_resolutions: int,
	        dropout: float = 0.0,
	        channel_mult: List[int] = (1, 2, 4, 8),
	        conv_resample: bool = True,
	        dims: int = 2,
	        num_classes: Optional[int] = None,
	        use_checkpoint: bool = False,
	        num_heads: int = -1,
	        num_head_channels: int = -1,
	        num_heads_upsample: int = -1,
	        use_scale_shift_norm: bool = False,
	        resblock_updown: bool = False,
	        transformer_depth: Union[List[int], int] = 1,
	        transformer_depth_middle: Optional[int] = None,
	        context_dim: Optional[int] = None,
	        time_downup: bool = False,
	        time_context_dim: Optional[int] = None,
	        extra_ff_mix_layer: bool = False,
	        use_spatial_context: bool = False,
	        merge_strategy: str = "fixed",
	        merge_factor: float = 0.5,
	        spatial_transformer_attn_type: str = "softmax",
	        video_kernel_size: Union[int, List[int]] = 3,
	        use_linear_in_transformer: bool = False,
	        adm_in_channels: Optional[int] = None,
	        disable_temporal_crossattention: bool = False,
	        max_ddpm_temb_period: int = 10000,
	        flow_dim_scale: int = 8,
	    ):
	        super().__init__()
	        assert context_dim is not None

	        if num_heads_upsample == -1:
	            num_heads_upsample = num_heads

	        # Exactly one way of specifying the heads must be usable.
	        if num_heads == -1:
	            assert num_head_channels != -1

	        if num_head_channels == -1:
	            assert num_heads != -1

	        self.in_channels = in_channels
	        self.model_channels = model_channels
	        self.out_channels = out_channels
	        if isinstance(transformer_depth, int):
	            transformer_depth = len(channel_mult) * [transformer_depth]
	        transformer_depth_middle = default(
	            transformer_depth_middle, transformer_depth[-1]
	        )

	        self.num_res_blocks = num_res_blocks
	        self.attention_resolutions = attention_resolutions
	        self.dropout = dropout
	        self.channel_mult = channel_mult
	        self.conv_resample = conv_resample
	        self.num_classes = num_classes
	        self.use_checkpoint = use_checkpoint
	        self.num_heads = num_heads
	        self.num_head_channels = num_head_channels
	        self.num_heads_upsample = num_heads_upsample

	        time_embed_dim = model_channels * 4
	        self.time_embed = nn.Sequential(
	            linear(model_channels, time_embed_dim),
	            nn.SiLU(),
	            linear(time_embed_dim, time_embed_dim),
	        )

	        if self.num_classes is not None:
	            if isinstance(self.num_classes, int):
	                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
	            elif self.num_classes == "continuous":
	                print("setting up linear c_adm embedding layer")
	                self.label_emb = nn.Linear(1, time_embed_dim)
	            elif self.num_classes == "timestep":
	                self.label_emb = nn.Sequential(
	                    Timestep(model_channels),
	                    nn.Sequential(
	                        linear(model_channels, time_embed_dim),
	                        nn.SiLU(),
	                        linear(time_embed_dim, time_embed_dim),
	                    ),
	                )
	            elif self.num_classes == "sequential":
	                assert adm_in_channels is not None
	                self.label_emb = nn.Sequential(
	                    nn.Sequential(
	                        linear(adm_in_channels, time_embed_dim),
	                        nn.SiLU(),
	                        linear(time_embed_dim, time_embed_dim),
	                    )
	                )
	            else:
	                raise ValueError(
	                    f"Unsupported num_classes: {self.num_classes!r}; expected None, an int, "
	                    "'continuous', 'timestep' or 'sequential'"
	                )

	        ### process the flow / drag
	        self.flow_dim_scale = flow_dim_scale
	        self.flow_blocks = nn.ModuleList([])
	        flow_base = self.model_channels // flow_dim_scale
	        # Stem: three stride-2 2D convs (8x spatial reduction overall), each
	        # followed by a 1D conv that TimestepEmbedSequential runs along the
	        # frame axis.
	        flow_in_block = TimestepEmbedSequential(
	            nn.Conv2d(2, flow_base // 4, 3, stride=2, padding=1), # flow in channel 2
	            nn.Conv1d(flow_base // 4, flow_base // 4, kernel_size=3, stride=1,
	                      padding=1, padding_mode='replicate'),
	            FloatGroupNorm(8, flow_base // 4),
	            nn.SiLU(),
	            nn.Conv2d(flow_base // 4, flow_base // 2, 3, stride=2, padding=1),
	            nn.Conv1d(flow_base // 2, flow_base // 2, kernel_size=3, stride=1,
	                      padding=1, padding_mode='replicate'),
	            FloatGroupNorm(8, flow_base // 2),
	            nn.SiLU(),
	            nn.Conv2d(flow_base // 2, flow_base, 3, stride=2, padding=1),
	            nn.Conv1d(flow_base, flow_base, kernel_size=3, stride=1,
	                      padding=1, padding_mode='replicate'),
	        )
	        self.flow_blocks.append(flow_in_block)

	        # One further flow block per remaining UNet level.
	        flow_in_channel = flow_base
	        for i_f, ch_f in enumerate(channel_mult[1:]):
	            flow_out_channel = ch_f * self.model_channels // flow_dim_scale
	            layers_f = nn.ModuleList([
	                FloatGroupNorm(8, flow_in_channel),
	                nn.SiLU(),
	                nn.Conv2d(flow_in_channel, flow_out_channel, 3, padding=1),
	                nn.Conv1d(flow_out_channel, flow_out_channel, kernel_size=3, stride=1,
	                          padding=1, padding_mode='replicate'),
	            ])
	            flow_in_channel = flow_out_channel
	            # i_f only ranges over 0 .. len(channel_mult) - 2, so this
	            # condition always holds and every flow block ends in a Downsample.
	            if i_f != len(channel_mult) - 1:
	                layers_f.append(
	                    Downsample(
	                        flow_in_channel, True, dims=2, out_channels=flow_in_channel
	                    )
	                )
	            self.flow_blocks.append(TimestepEmbedSequential(*layers_f))

	        self.input_blocks = nn.ModuleList(
	            [
	                TimestepEmbedSequential(
	                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
	                )
	            ]
	        )
	        self._feature_size = model_channels
	        input_block_chans = [model_channels]
	        ch = model_channels
	        ds = 1

	        def get_attention_layer(
	            ch,
	            num_heads,
	            dim_head,
	            depth=1,
	            context_dim=None,
	            use_checkpoint=False,
	            disabled_sa=False,
	        ):
	            return SpatialVideoTransformer(
	                ch,
	                num_heads,
	                dim_head,
	                depth=depth,
	                context_dim=context_dim,
	                time_context_dim=time_context_dim,
	                dropout=dropout,
	                ff_in=extra_ff_mix_layer,
	                use_spatial_context=use_spatial_context,
	                merge_strategy=merge_strategy,
	                merge_factor=merge_factor,
	                checkpoint=use_checkpoint,
	                use_linear=use_linear_in_transformer,
	                attn_mode=spatial_transformer_attn_type,
	                disable_self_attn=disabled_sa,
	                disable_temporal_crossattention=disable_temporal_crossattention,
	                max_time_embed_period=max_ddpm_temb_period,
	            )

	        def get_resblock(
	            merge_factor,
	            merge_strategy,
	            video_kernel_size,
	            ch,
	            time_embed_dim,
	            dropout,
	            out_ch,
	            dims,
	            use_checkpoint,
	            use_scale_shift_norm,
	            down=False,
	            up=False,
	        ):
	            return VideoResBlock(
	                merge_factor=merge_factor,
	                merge_strategy=merge_strategy,
	                video_kernel_size=video_kernel_size,
	                channels=ch,
	                emb_channels=time_embed_dim,
	                dropout=dropout,
	                out_channels=out_ch,
	                dims=dims,
	                use_checkpoint=use_checkpoint,
	                use_scale_shift_norm=use_scale_shift_norm,
	                down=down,
	                up=up,
	            )

	        def get_embed_resblock(
	            merge_factor,
	            merge_strategy,
	            video_kernel_size,
	            ch,
	            time_embed_dim,
	            dropout,
	            out_ch,
	            dims,
	            use_checkpoint,
	            use_scale_shift_norm,
	            down=False,
	            up=False,
	            is_same_channel=True,
	            flow_dim_scale=8,
	        ):
	            return VideoResBlock_Embed(
	                merge_factor=merge_factor,
	                merge_strategy=merge_strategy,
	                video_kernel_size=video_kernel_size,
	                channels=ch,
	                emb_channels=time_embed_dim,
	                dropout=dropout,
	                out_channels=out_ch,
	                dims=dims,
	                use_checkpoint=use_checkpoint,
	                use_scale_shift_norm=use_scale_shift_norm,
	                down=down,
	                up=up,
	                is_same_channel=is_same_channel,
	                flow_dim_scale=flow_dim_scale,
	            )

	        def resolve_heads(width):
	            # Returns (number of heads, channels per head) for a layer of
	            # `width` channels, from whichever of the two settings was given.
	            if num_head_channels == -1:
	                return num_heads, width // num_heads
	            return width // num_head_channels, num_head_channels

	        # Settings shared by every residual block built below.
	        res_kwargs = dict(
	            merge_factor=merge_factor,
	            merge_strategy=merge_strategy,
	            video_kernel_size=video_kernel_size,
	            time_embed_dim=time_embed_dim,
	            dropout=dropout,
	            dims=dims,
	            use_checkpoint=use_checkpoint,
	            use_scale_shift_norm=use_scale_shift_norm,
	        )
	        embed_kwargs = dict(res_kwargs, is_same_channel=True, flow_dim_scale=flow_dim_scale)

	        # ---- encoder ----
	        for level, mult in enumerate(channel_mult):
	            for i in range(num_res_blocks):
	                # Only the first block of a level is flow-conditioned.
	                if i == 0:
	                    layers = [
	                        get_embed_resblock(ch=ch, out_ch=mult * model_channels, **embed_kwargs)
	                    ]
	                else:
	                    layers = [
	                        get_resblock(ch=ch, out_ch=mult * model_channels, **res_kwargs)
	                    ]
	                ch = mult * model_channels
	                if ds in attention_resolutions:
	                    n_heads, dim_head = resolve_heads(ch)
	                    layers.append(
	                        get_attention_layer(
	                            ch,
	                            n_heads,
	                            dim_head,
	                            depth=transformer_depth[level],
	                            context_dim=context_dim,
	                            use_checkpoint=use_checkpoint,
	                            disabled_sa=False,
	                        )
	                    )
	                self.input_blocks.append(TimestepEmbedSequential(*layers))
	                self._feature_size += ch
	                input_block_chans.append(ch)
	            if level != len(channel_mult) - 1:
	                ds *= 2
	                out_ch = ch
	                self.input_blocks.append(
	                    TimestepEmbedSequential(
	                        Downsample(
	                            ch,
	                            conv_resample,
	                            dims=dims,
	                            out_channels=out_ch,
	                            third_down=time_downup,
	                        )
	                    )
	                )
	                ch = out_ch
	                input_block_chans.append(ch)

	                self._feature_size += ch

	        # ---- middle ----
	        n_heads, dim_head = resolve_heads(ch)
	        self.middle_block = TimestepEmbedSequential(
	            get_resblock(ch=ch, out_ch=None, **res_kwargs),
	            get_attention_layer(
	                ch,
	                n_heads,
	                dim_head,
	                depth=transformer_depth_middle,
	                context_dim=context_dim,
	                use_checkpoint=use_checkpoint,
	            ),
	            get_resblock(ch=ch, out_ch=None, **res_kwargs),
	        )
	        self._feature_size += ch

	        # ---- decoder ----
	        self.output_blocks = nn.ModuleList([])
	        for level, mult in list(enumerate(channel_mult))[::-1]:
	            for i in range(num_res_blocks + 1):
	                # Channels of the encoder activation concatenated in forward().
	                ich = input_block_chans.pop()
	                if i == 0:
	                    layers = [
	                        get_embed_resblock(ch=ch + ich, out_ch=model_channels * mult, **embed_kwargs)
	                    ]
	                else:
	                    layers = [
	                        get_resblock(ch=ch + ich, out_ch=model_channels * mult, **res_kwargs)
	                    ]
	                ch = model_channels * mult
	                if ds in attention_resolutions:
	                    n_heads, dim_head = resolve_heads(ch)
	                    layers.append(
	                        get_attention_layer(
	                            ch,
	                            n_heads,
	                            dim_head,
	                            depth=transformer_depth[level],
	                            context_dim=context_dim,
	                            use_checkpoint=use_checkpoint,
	                            disabled_sa=False,
	                        )
	                    )
	                if level and i == num_res_blocks:
	                    out_ch = ch
	                    ds //= 2
	                    layers.append(
	                        Upsample(
	                            ch,
	                            conv_resample,
	                            dims=dims,
	                            out_channels=out_ch,
	                            third_up=time_downup,
	                        )
	                    )

	                self.output_blocks.append(TimestepEmbedSequential(*layers))
	                self._feature_size += ch

	        # The final conv consumes the `ch` channels produced by the norm
	        # above it; `ch` equals `model_channels` whenever channel_mult[0] == 1.
	        self.out = nn.Sequential(
	            normalization(ch),
	            nn.SiLU(),
	            zero_module(conv_nd(dims, ch, out_channels, 3, padding=1)),
	        )

	    def forward(
	        self,
	        x: th.Tensor,
	        timesteps: th.Tensor,
	        context: Optional[th.Tensor] = None,
	        y: Optional[th.Tensor] = None,
	        time_context: Optional[th.Tensor] = None,
	        num_video_frames: Optional[int] = None,
	        image_only_indicator: Optional[th.Tensor] = None,
	        flow: Optional[th.Tensor] = None, # input flow or drag: b l c h w
	    ):
	        """
	        Run the UNet.
	        :param x: the input tensor.
	        :param timesteps: timesteps fed to `timestep_embedding`.
	        :param context: passed to the transformer layers.
	        :param y: label input; must be given if and only if the model was
	            built with `num_classes`.
	        :param time_context: passed to the SpatialVideoTransformer layers.
	        :param num_video_frames: the number of frames per video.
	        :param image_only_indicator: passed to the video blocks.
	        :param flow: flow / drag input laid out as "b l c h w"; required.
	        :return: the output of the final projection.
	        :raises ValueError: if `flow` is None.
	        """
	        assert (y is not None) == (
	            self.num_classes is not None
	        ), "must specify y if and only if the model is class-conditional -> no, relax this TODO"
	        if flow is None:
	            raise ValueError("VideoUNet_flow.forward requires `flow` (shape: b l c h w)")

	        hs = []
	        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
	        emb = self.time_embed(t_emb)

	        if self.num_classes is not None:
	            assert y.shape[0] == x.shape[0]
	            emb = emb + self.label_emb(y)

	        # process the flow: build one feature map per UNet level. The encoder
	        # consumes them first-to-last, the decoder consumes clones last-to-first.
	        hs_z_flow = []
	        hs_z_flow_clone = []
	        flow = rearrange(flow, "b l c h w -> (b l) c h w")
	        for module in self.flow_blocks:
	            flow = module(flow, emb=None, num_video_frames=num_video_frames)
	            hs_z_flow.append(flow)
	            hs_z_flow_clone.append(flow.clone())

	        # Conditioning shared by every block call below.
	        cond = dict(
	            context=context,
	            image_only_indicator=image_only_indicator,
	            time_context=time_context,
	            num_video_frames=num_video_frames,
	        )

	        h = x
	        for module in self.input_blocks:
	            if isinstance(module[0], VideoResBlock_Embed):
	                h = module(h, emb, flow=hs_z_flow.pop(0), **cond)
	            else:
	                h = module(h, emb, **cond)
	            hs.append(h)

	        h = self.middle_block(h, emb, **cond)

	        for module in self.output_blocks:
	            # Skip connection from the matching encoder activation.
	            h = th.cat([h, hs.pop()], dim=1)
	            if isinstance(module[0], VideoResBlock_Embed):
	                h = module(h, emb, flow=hs_z_flow_clone.pop(), **cond)
	            else:
	                h = module(h, emb, **cond)

	        h = h.type(x.dtype)
	        return self.out(h)