from typing import Any, Optional, Type

import numpy as np
import torch
import torch.utils.checkpoint
from diffusers.models.transformers import LTXVideoTransformer3DModel
from diffusers.utils import (
    USE_PEFT_BACKEND,
    is_torch_version,
    scale_lora_layers,
    unscale_lora_layers,
)


class TeaCacheConfig:
    """Configuration and runtime state for TeaCache optimization."""

    def __init__(
        self,
        enabled: bool = True,
        rel_l1_thresh: float = 0.05,  # 0.03 for ~1.6x speedup, 0.05 for ~2.1x speedup
        num_inference_steps: int = 50,
    ):
        self.enabled = enabled
        self.rel_l1_thresh = rel_l1_thresh
        self.num_inference_steps = num_inference_steps

        # Internal state updated by the patched forward pass
        self.cnt = 0
        self.accumulated_rel_l1_distance = 0.0
        self.previous_modulated_input = None
        self.previous_residual = None
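
# Illustrative threshold choices (a sketch based on the comment above; actual
# speedups depend on the model, prompt, and number of inference steps):
#   TeaCacheConfig(rel_l1_thresh=0.03)  # fewer skipped steps, ~1.6x speedup
#   TeaCacheConfig(rel_l1_thresh=0.05)  # more skipped steps, ~2.1x speedup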


def create_teacache_forward(original_forward: Any):
    """Factory returning a TeaCache-enabled replacement for the model's forward pass.

    Note: ``original_forward`` is not invoked here; it is accepted so the caller can
    keep a reference to the method being replaced.
    """

    def teacache_forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        timestep: torch.LongTensor,
        encoder_attention_mask: torch.Tensor,
        num_frames: int,
        height: int,
        width: int,
        rope_interpolation_scale: Optional[tuple[float, float, float]] = None,
        attention_kwargs: Optional[dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> torch.Tensor:
        # Handle LoRA scaling
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0
        if USE_PEFT_BACKEND:
            scale_lora_layers(self, lora_scale)

        # Rotary positional embeddings
        image_rotary_emb = self.rope(hidden_states, num_frames, height, width, rope_interpolation_scale)

        # Convert a 2D padding mask into an additive attention bias
        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

        batch_size = hidden_states.size(0)
        hidden_states = self.proj_in(hidden_states)

        # Time embedding
        temb, embedded_timestep = self.time_embed(
            timestep.flatten(),
            batch_size=batch_size,
            hidden_dtype=hidden_states.dtype,
        )
        temb = temb.view(batch_size, -1, temb.size(-1))
        embedded_timestep = embedded_timestep.view(batch_size, -1, embedded_timestep.size(-1))

        # Caption projection
        encoder_hidden_states = self.caption_projection(encoder_hidden_states)
        encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.size(-1))

        # TeaCache: measure how much the modulated input drifted since the previous
        # step and decide whether the transformer blocks can be skipped by reusing
        # the cached residual.
        should_calc = True
        if hasattr(self, 'teacache_config') and self.teacache_config.enabled:
            inp = hidden_states.clone()
            temb_ = temb.clone()
            inp = self.transformer_blocks[0].norm1(inp)
            num_ada_params = self.transformer_blocks[0].scale_shift_table.shape[0]
            ada_values = (
                self.transformer_blocks[0].scale_shift_table[None, None]
                + temb_.reshape(batch_size, temb_.size(1), num_ada_params, -1)
            )
            shift_msa, scale_msa, *_ = ada_values.unbind(dim=2)
            modulated_inp = inp * (1 + scale_msa) + shift_msa

            # Always compute on the first and last step; otherwise accumulate the
            # rescaled relative L1 distance and skip while it stays below threshold.
            if self.teacache_config.cnt == 0 or self.teacache_config.cnt == self.teacache_config.num_inference_steps - 1:
                should_calc = True
                self.teacache_config.accumulated_rel_l1_distance = 0
            else:
                # Polynomial coefficients for rescaling the raw relative L1 distance
                coefficients = [2.14700694e+01, -1.28016453e+01, 2.31279151e+00, 7.92487521e-01, 9.69274326e-03]
                rescale_func = np.poly1d(coefficients)
                rel_diff = (
                    (modulated_inp - self.teacache_config.previous_modulated_input).abs().mean()
                    / self.teacache_config.previous_modulated_input.abs().mean()
                ).cpu().item()
                self.teacache_config.accumulated_rel_l1_distance += rescale_func(rel_diff)
                if self.teacache_config.accumulated_rel_l1_distance < self.teacache_config.rel_l1_thresh:
                    should_calc = False
                else:
                    should_calc = True
                    self.teacache_config.accumulated_rel_l1_distance = 0

            self.teacache_config.previous_modulated_input = modulated_inp
            self.teacache_config.cnt += 1
            if self.teacache_config.cnt == self.teacache_config.num_inference_steps:
                self.teacache_config.cnt = 0

        # Either reuse the cached residual (skip) or run the transformer blocks
        if hasattr(self, 'teacache_config') and self.teacache_config.enabled and not should_calc:
            hidden_states += self.teacache_config.previous_residual
        else:
            ori_hidden_states = hidden_states.clone() if hasattr(self, 'teacache_config') and self.teacache_config.enabled else None

            for block in self.transformer_blocks:
                if torch.is_grad_enabled() and self.gradient_checkpointing:

                    def create_custom_forward(module, return_dict=None):
                        def custom_forward(*inputs):
                            if return_dict is not None:
                                return module(*inputs, return_dict=return_dict)
                            else:
                                return module(*inputs)

                        return custom_forward

                    ckpt_kwargs: dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        hidden_states,
                        encoder_hidden_states,
                        temb,
                        image_rotary_emb,
                        encoder_attention_mask,
                        **ckpt_kwargs,
                    )
                else:
                    hidden_states = block(
                        hidden_states=hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        temb=temb,
                        image_rotary_emb=image_rotary_emb,
                        encoder_attention_mask=encoder_attention_mask,
                    )

            # Output modulation
            scale_shift_values = self.scale_shift_table[None, None] + embedded_timestep[:, :, None]
            shift, scale = scale_shift_values[:, :, 0], scale_shift_values[:, :, 1]
            hidden_states = self.norm_out(hidden_states)
            hidden_states = hidden_states * (1 + scale) + shift

            # Cache the residual so skipped steps can reuse it
            if hasattr(self, 'teacache_config') and self.teacache_config.enabled:
                self.teacache_config.previous_residual = hidden_states - ori_hidden_states

        output = self.proj_out(hidden_states)

        if USE_PEFT_BACKEND:
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)
        return {"sample": output}

    return teacache_forward


def enable_teacache(model_class: Type[LTXVideoTransformer3DModel], config: TeaCacheConfig) -> None:
    """Enable TeaCache optimization for a model class.

    Args:
        model_class: The model class to patch.
        config: TeaCache configuration.
    """
    # Keep a reference to the original forward so it can be restored later
    if not hasattr(model_class, '_original_forward'):
        model_class._original_forward = model_class.forward

    # Replace the class-level forward with the TeaCache-enabled version
    model_class.forward = create_teacache_forward(model_class._original_forward)

    # Attach the configuration (and its runtime state) to the class
    model_class.teacache_config = config


def disable_teacache(model_class: Type[LTXVideoTransformer3DModel]) -> None:
    """Disable TeaCache optimization for a model class.

    Args:
        model_class: The model class to unpatch.
    """
    if hasattr(model_class, '_original_forward'):
        model_class.forward = model_class._original_forward
        delattr(model_class, '_original_forward')

    if hasattr(model_class, 'teacache_config'):
        delattr(model_class, 'teacache_config')
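

if __name__ == "__main__":
    # Usage sketch (not part of the original module): shows how the patch functions
    # above would typically be wired into a diffusers LTX-Video pipeline. The
    # checkpoint id "Lightricks/LTX-Video" and the LTXPipeline call are assumptions;
    # adjust them to a diffusers version whose LTX transformer forward signature
    # matches the one patched above.
    from diffusers import LTXPipeline

    config = TeaCacheConfig(rel_l1_thresh=0.05, num_inference_steps=50)
    enable_teacache(LTXVideoTransformer3DModel, config)

    pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
    pipe.to("cuda")

    frames = pipe(
        prompt="A waterfall cascading through a misty forest",
        num_inference_steps=config.num_inference_steps,
    ).frames[0]

    # Restore the stock forward pass once TeaCache is no longer needed.
    disable_teacache(LTXVideoTransformer3DModel)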