Update enhance.py
enhance.py (+16 −28) CHANGED
@@ -15,6 +15,7 @@ class LTXEnhanceAttnProcessor2_0:
     def _get_enhance_scores(self, query, key, inner_dim, num_heads, num_frames, text_seq_length=None):
         """Calculate enhancement scores for the attention mechanism"""
         head_dim = inner_dim // num_heads
+        orig_dtype = query.dtype  # Store original dtype

         if text_seq_length is not None:
             img_q = query[:, :, :-text_seq_length] if text_seq_length > 0 else query
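The one functional change in this hunk is capturing the caller's dtype before any casts: LTX pipelines commonly run the transformer in bfloat16/float16, while the score math below wants float32. A minimal standalone sketch of the pattern (illustrative only, not the repo's code):

```python
import torch

def compute_in_fp32(x: torch.Tensor) -> torch.Tensor:
    """Do the numerically fragile work in float32, hand back the caller's dtype."""
    orig_dtype = x.dtype                 # remember what the pipeline gave us
    y = x.float().softmax(dim=-1)        # softmax is the risky op in half precision
    return y.to(orig_dtype)              # restore dtype at the boundary

x = torch.randn(2, 4, dtype=torch.bfloat16)
assert compute_in_fp32(x).dtype == torch.bfloat16
```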
@@ -23,16 +24,11 @@
             img_q, img_k = query, key

         batch_size, num_heads, ST, head_dim = img_q.shape
-        # Calculate spatial dimension by dividing total tokens by number of frames
         spatial_dim = ST // num_frames
-        # Ensure spatial_dim is calculated correctly
         if spatial_dim * num_frames != ST:
-            # If we can't divide evenly, we'll need to pad or reshape
             spatial_dim = max(1, ST // num_frames)
-            # Adjust ST to be evenly divisible
             ST = spatial_dim * num_frames

-        # Ensure tensors have the right shape before rearranging
         img_q = img_q[:, :, :ST, :]
         img_k = img_k[:, :, :ST, :]

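The logic kept here just floors the spatial dimension and trims trailing tokens when the sequence length is not an exact multiple of the frame count. A worked example with made-up numbers (the 8 heads and 64-dim heads are assumptions, not model constants):

```python
import torch

num_frames = 9
ST = 4681                                  # hypothetical token count, not divisible by 9
spatial_dim = max(1, ST // num_frames)     # 520
ST = spatial_dim * num_frames              # 4680: the one straggler token is dropped

img_q = torch.randn(1, 8, 4681, 64)
img_q = img_q[:, :, :ST, :]                # same trim as in the diff
print(img_q.shape)                         # torch.Size([1, 8, 4680, 64])
```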
@@ -46,33 +42,31 @@
                 T=num_frames, S=spatial_dim, N=num_heads, C=head_dim
             )
         except Exception as e:
-
-            return torch.ones(img_q.shape[0], 1, 1, 1, device=img_q.device)
+            return torch.ones(img_q.shape[0], 1, 1, 1, device=img_q.device, dtype=orig_dtype)

         scale = head_dim**-0.5
         query_image = query_image * scale
-        attn_temp = query_image @ key_image.transpose(-2, -1)  # translate attn to float32
-        attn_temp = attn_temp.to(torch.float32)
-        attn_temp = attn_temp.softmax(dim=-1)

-        #
+        # Compute attention in float32 for stability
+        with torch.cuda.amp.autocast(enabled=False):
+            query_image = query_image.float()
+            key_image = key_image.float()
+            attn_temp = query_image @ key_image.transpose(-2, -1)
+            attn_temp = attn_temp.softmax(dim=-1)
+
         attn_temp = attn_temp.reshape(-1, num_frames, num_frames)
-
-        # Create a mask for diagonal elements
         diag_mask = torch.eye(num_frames, device=attn_temp.device).bool()
         diag_mask = diag_mask.unsqueeze(0).expand(attn_temp.shape[0], -1, -1)
-
-        # Zero out diagonal elements
         attn_wo_diag = attn_temp.masked_fill(diag_mask, 0)
-
-        # Calculate mean for each token's attention matrix
-        # Number of off-diagonal elements per matrix is n*n - n
+
         num_off_diag = num_frames * num_frames - num_frames
         mean_scores = attn_wo_diag.sum(dim=(1, 2)) / num_off_diag

         enhance_scores = mean_scores.mean() * (num_frames + 4.0)
         enhance_scores = enhance_scores.clamp(min=1)
-
+
+        # Convert back to original dtype
+        return enhance_scores.to(orig_dtype)

     def __call__(
         self,
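Net effect of this hunk: the frame-to-frame attention map is now computed in float32 with autocast disabled, its diagonal (each frame attending to itself) is zeroed, and the mean of the remaining off-diagonal mass is scaled by num_frames + 4 and clamped at 1. A self-contained sketch of the same arithmetic on dummy tensors (every shape below is an assumption):

```python
import torch

B, N, T, S, C = 1, 2, 8, 16, 32             # batch, heads, frames, spatial, channels
q = torch.randn(B * N * S, T, C)             # frame-to-frame queries per spatial location
k = torch.randn(B * N * S, T, C)

attn = (q * C**-0.5 @ k.transpose(-2, -1)).float().softmax(dim=-1)   # [B*N*S, T, T]

diag = torch.eye(T).bool().unsqueeze(0).expand(attn.shape[0], -1, -1)
off_diag_mean = attn.masked_fill(diag, 0).sum(dim=(1, 2)) / (T * T - T)

enhance = (off_diag_mean.mean() * (T + 4.0)).clamp(min=1)
print(enhance)                               # scalar >= 1; larger when frames attend across time
```

Excluding the diagonal matters: it measures only temporal mixing, not each frame's attention to itself.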
@@ -82,6 +76,7 @@
         attention_mask = None,
         **kwargs
     ) -> torch.Tensor:
+        orig_dtype = hidden_states.dtype  # Store original dtype
         batch_size, sequence_length, _ = hidden_states.shape
         text_seq_length = encoder_hidden_states.shape[1] if encoder_hidden_states is not None else 0

@@ -92,12 +87,10 @@
         num_heads = attn.heads
         head_dim = inner_dim // num_heads

-        # Get query, key, value projections
         query = attn.to_q(hidden_states)
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)

-        # Reshape projections
         query = query.view(batch_size, sequence_length, num_heads, head_dim).transpose(1, 2)
         key = key.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
         value = value.view(batch_size, -1, num_heads, head_dim).transpose(1, 2)
@@ -106,7 +99,6 @@
         query = query.float()
         key = key.float()

-        # Process attention
         enhance_scores = None
         if is_enhance_enabled():
             try:
@@ -120,12 +112,10 @@
         except ValueError as e:
             print(f"Warning: Could not calculate enhance scores: {e}")

-        # Make sure attention_mask has correct shape
         if attention_mask is not None:
             attention_mask = attention_mask.view(batch_size, 1, 1, attention_mask.shape[-1])
             attention_mask = attention_mask.expand(-1, num_heads, -1, -1)

-        # Compute attention with correct shapes
         hidden_states = torch.nn.functional.scaled_dot_product_attention(
             query, key, value,
             attn_mask=attention_mask,
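For context on the mask plumbing above: a per-key padding mask is broadcast over heads and query positions before being handed to scaled_dot_product_attention. A minimal runnable illustration (PyTorch's boolean convention, True = attend; shapes are made up):

```python
import torch
import torch.nn.functional as F

B, H, Lq, Lk, D = 2, 4, 6, 10, 16
q = torch.randn(B, H, Lq, D)
k = torch.randn(B, H, Lk, D)
v = torch.randn(B, H, Lk, D)

# Per-token keep/pad mask, e.g. from a tokenizer: True = attend to this key
mask = torch.ones(B, Lk, dtype=torch.bool)
mask[:, -2:] = False                                  # pretend the last two keys are padding

mask = mask.view(B, 1, 1, Lk).expand(-1, H, -1, -1)   # same reshape as the diff
out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
print(out.shape)  # torch.Size([2, 4, 6, 16])
```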
@@ -133,15 +123,13 @@
             is_causal=False
         )

-        # Reshape output
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, inner_dim)
-        hidden_states = hidden_states.to(
+        hidden_states = hidden_states.to(orig_dtype)  # Ensure we're back to original dtype

-        # Apply enhancement if enabled
         if is_enhance_enabled() and enhance_scores is not None:
             hidden_states = hidden_states * enhance_scores

-        #
+        # Apply output projections while maintaining dtype
         hidden_states = attn.to_out[0](hidden_states)
         hidden_states = attn.to_out[1](hidden_states)

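Two footnotes on the dtype round-trip this commit completes: torch.cuda.amp.autocast(enabled=False) is the legacy spelling (newer PyTorch prefers torch.amp.autocast), and scaled_dot_product_attention requires query, key, and value to share a dtype, so the sketch below casts all three before restoring the original dtype at the end (dummy shapes, CPU-friendly):

```python
import torch
import torch.nn.functional as F

B, H, L, D = 1, 2, 5, 8
hidden = torch.randn(B, L, H * D, dtype=torch.bfloat16)
orig_dtype = hidden.dtype                        # as in the diff's __call__

q = k = v = hidden.view(B, L, H, D).transpose(1, 2).float()  # matching dtypes for SDPA
out = F.scaled_dot_product_attention(q, k, v)
out = out.transpose(1, 2).reshape(B, -1, H * D).to(orig_dtype)
assert out.dtype == torch.bfloat16               # back in the pipeline's dtype
```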