multimodalart (HF Staff) committed
Commit 9762ac2 · verified · 1 Parent(s): 1bb48a3

Update app.py

Files changed (1):
  1. app.py +213 -251

app.py CHANGED
@@ -22,277 +22,241 @@ logger = logging.getLogger(__name__)
 MANUAL_PATCHES_STORE = {}
 
 def _custom_convert_non_diffusers_wan_lora_to_diffusers(state_dict):
-    """
-    Custom converter for Wan 2.1 T2V LoRA.
-    Separates LoRA A/B weights for PEFT and diff_b/diff for manual patching.
-    Stores diff_b/diff in the global MANUAL_PATCHES_STORE.
-    """
     global MANUAL_PATCHES_STORE
-    MANUAL_PATCHES_STORE.clear()  # Clear previous patches if any

-    converted_state_dict_for_peft = {}
-    manual_diff_patches = {}

-    # Strip "diffusion_model." prefix
-    original_state_dict = {
-        k[len("diffusion_model.") :]: v
-        for k, v in state_dict.items()
-        if k.startswith("diffusion_model.")
-    }

-    # --- Determine number of blocks ---
     block_indices = set()
-    for k_orig in original_state_dict:
-        if "blocks." in k_orig:
             try:
-                block_idx_str = k_orig.split("blocks.")[1].split(".")[0]
                 if block_idx_str.isdigit():
                     block_indices.add(int(block_idx_str))
-            except (IndexError, ValueError) as e:
-                logger.warning(f"Could not parse block index from key: {k_orig} due to {e}")
-
-    num_transformer_blocks = max(block_indices) + 1 if block_indices else 0
-    if not block_indices and any("blocks." in k for k in original_state_dict):
-        logger.warning("Found 'blocks.' in keys but could not determine num_transformer_blocks reliably.")
-
-
-    # --- Convert Transformer Blocks (blocks.0 to blocks.N-1) ---
-    for i in range(num_transformer_blocks):
-        # Self-attention (attn1 in Diffusers DiT)
-        for lora_key_part, diffusers_layer_name in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
-            orig_lora_down_key = f"blocks.{i}.self_attn.{lora_key_part}.lora_down.weight"
-            orig_lora_up_key = f"blocks.{i}.self_attn.{lora_key_part}.lora_up.weight"
-            target_base_key_peft = f"blocks.{i}.attn1.{diffusers_layer_name}"
-            target_base_key_manual = f"transformer.blocks.{i}.attn1.{diffusers_layer_name}"
-
-            if orig_lora_down_key in original_state_dict and orig_lora_up_key in original_state_dict:
-                converted_state_dict_for_peft[f"{target_base_key_peft}.lora_A.weight"] = original_state_dict.pop(orig_lora_down_key)
-                converted_state_dict_for_peft[f"{target_base_key_peft}.lora_B.weight"] = original_state_dict.pop(orig_lora_up_key)
-
-            orig_diff_b_key = f"blocks.{i}.self_attn.{lora_key_part}.diff_b"
-            if orig_diff_b_key in original_state_dict:
-                manual_diff_patches[f"{target_base_key_manual}.bias"] = original_state_dict.pop(orig_diff_b_key)
-
-        # Cross-attention (attn2 in Diffusers DiT)
-        for lora_key_part, diffusers_layer_name in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
-            orig_lora_down_key = f"blocks.{i}.cross_attn.{lora_key_part}.lora_down.weight"
-            orig_lora_up_key = f"blocks.{i}.cross_attn.{lora_key_part}.lora_up.weight"
-            target_base_key_peft = f"blocks.{i}.attn2.{diffusers_layer_name}"
-            target_base_key_manual = f"transformer.blocks.{i}.attn2.{diffusers_layer_name}"
-
-            if orig_lora_down_key in original_state_dict and orig_lora_up_key in original_state_dict:
-                converted_state_dict_for_peft[f"{target_base_key_peft}.lora_A.weight"] = original_state_dict.pop(orig_lora_down_key)
-                converted_state_dict_for_peft[f"{target_base_key_peft}.lora_B.weight"] = original_state_dict.pop(orig_lora_up_key)
-
-            orig_diff_b_key = f"blocks.{i}.cross_attn.{lora_key_part}.diff_b"
-            if orig_diff_b_key in original_state_dict:
-                manual_diff_patches[f"{target_base_key_manual}.bias"] = original_state_dict.pop(orig_diff_b_key)

         # FFN
-        for original_ffn_idx, diffusers_ffn_path_part in zip(["0", "2"], ["net.0.proj", "net.2"]):
-            orig_lora_down_key = f"blocks.{i}.ffn.{original_ffn_idx}.lora_down.weight"
-            orig_lora_up_key = f"blocks.{i}.ffn.{original_ffn_idx}.lora_up.weight"
-            target_base_key_peft = f"blocks.{i}.ffn.{diffusers_ffn_path_part}"
-            target_base_key_manual = f"transformer.blocks.{i}.ffn.{diffusers_ffn_path_part}"
-
-            if orig_lora_down_key in original_state_dict and orig_lora_up_key in original_state_dict:
-                converted_state_dict_for_peft[f"{target_base_key_peft}.lora_A.weight"] = original_state_dict.pop(orig_lora_down_key)
-                converted_state_dict_for_peft[f"{target_base_key_peft}.lora_B.weight"] = original_state_dict.pop(orig_lora_up_key)
-
-            orig_diff_b_key = f"blocks.{i}.ffn.{original_ffn_idx}.diff_b"
-            if orig_diff_b_key in original_state_dict:
-                manual_diff_patches[f"{target_base_key_manual}.bias"] = original_state_dict.pop(orig_diff_b_key)
-
-        # Norm layers within blocks
-        # LoRA has `norm3.diff` and `norm3.diff_b`. Wan2.1 base DiTBlock has `norm2`.
-        norm3_diff_key = f"blocks.{i}.norm3.diff"
-        norm3_diff_b_key = f"blocks.{i}.norm3.diff_b"
-        target_norm_key_base_manual = f"transformer.blocks.{i}.norm2"  # Diffusers DiTBlock's second norm
-        if norm3_diff_key in original_state_dict:
-            manual_diff_patches[f"{target_norm_key_base_manual}.weight"] = original_state_dict.pop(norm3_diff_key)
-        if norm3_diff_b_key in original_state_dict:
-            manual_diff_patches[f"{target_norm_key_base_manual}.bias"] = original_state_dict.pop(norm3_diff_b_key)
-
-        # Attention QK norms
-        for attn_type, diffusers_attn_block in zip(["self_attn", "cross_attn"], ["attn1", "attn2"]):
-            for norm_target_suffix in ["norm_q", "norm_k"]:
-                orig_norm_diff_key = f"blocks.{i}.{attn_type}.{norm_target_suffix}.diff"
-                target_norm_key_manual = f"transformer.blocks.{i}.{diffusers_attn_block}.{norm_target_suffix}.weight"
-                if orig_norm_diff_key in original_state_dict:
-                    manual_diff_patches[target_norm_key_manual] = original_state_dict.pop(orig_norm_diff_key)
-
-    # --- Convert Non-Block Components ---
     # Patch Embedding
     patch_emb_diff_b_key = "patch_embedding.diff_b"
-    if patch_emb_diff_b_key in original_state_dict:
-        manual_diff_patches["transformer.patch_embedding.bias"] = original_state_dict.pop(patch_emb_diff_b_key)
-
-    # Text Embedding
-    for orig_idx, diffusers_linear_idx in zip(["0", "2"], ["linear_1", "linear_2"]):
-        orig_lora_down_key = f"text_embedding.{orig_idx}.lora_down.weight"
-        orig_lora_up_key = f"text_embedding.{orig_idx}.lora_up.weight"
-        target_base_key_peft = f"condition_embedder.text_embedder.{diffusers_linear_idx}"
-        target_base_key_manual = f"transformer.condition_embedder.text_embedder.{diffusers_linear_idx}"
-        if orig_lora_down_key in original_state_dict and orig_lora_up_key in original_state_dict:
-            converted_state_dict_for_peft[f"{target_base_key_peft}.lora_A.weight"] = original_state_dict.pop(orig_lora_down_key)
-            converted_state_dict_for_peft[f"{target_base_key_peft}.lora_B.weight"] = original_state_dict.pop(orig_lora_up_key)
-        orig_diff_b_key = f"text_embedding.{orig_idx}.diff_b"
-        if orig_diff_b_key in original_state_dict:
-            manual_diff_patches[f"{target_base_key_manual}.bias"] = original_state_dict.pop(orig_diff_b_key)
-
-    # Time Embedding
-    for orig_idx, diffusers_linear_idx in zip(["0", "2"], ["linear_1", "linear_2"]):
-        orig_lora_down_key = f"time_embedding.{orig_idx}.lora_down.weight"
-        orig_lora_up_key = f"time_embedding.{orig_idx}.lora_up.weight"
-        target_base_key_peft = f"condition_embedder.time_embedder.{diffusers_linear_idx}"
-        target_base_key_manual = f"transformer.condition_embedder.time_embedder.{diffusers_linear_idx}"
-        if orig_lora_down_key in original_state_dict and orig_lora_up_key in original_state_dict:
-            converted_state_dict_for_peft[f"{target_base_key_peft}.lora_A.weight"] = original_state_dict.pop(orig_lora_down_key)
-            converted_state_dict_for_peft[f"{target_base_key_peft}.lora_B.weight"] = original_state_dict.pop(orig_lora_up_key)
-        orig_diff_b_key = f"time_embedding.{orig_idx}.diff_b"
-        if orig_diff_b_key in original_state_dict:
-            manual_diff_patches[f"{target_base_key_manual}.bias"] = original_state_dict.pop(orig_diff_b_key)
-
-    # Time Projection
-    orig_lora_down_key = "time_projection.1.lora_down.weight"
-    orig_lora_up_key = "time_projection.1.lora_up.weight"
-    target_base_key_peft = "condition_embedder.time_proj"
-    target_base_key_manual = "transformer.condition_embedder.time_proj"
-    if orig_lora_down_key in original_state_dict and orig_lora_up_key in original_state_dict:
-        converted_state_dict_for_peft[f"{target_base_key_peft}.lora_A.weight"] = original_state_dict.pop(orig_lora_down_key)
-        converted_state_dict_for_peft[f"{target_base_key_peft}.lora_B.weight"] = original_state_dict.pop(orig_lora_up_key)
-    orig_diff_b_key = "time_projection.1.diff_b"
-    if orig_diff_b_key in original_state_dict:
-        manual_diff_patches[f"{target_base_key_manual}.bias"] = original_state_dict.pop(orig_diff_b_key)
-
-    # Head
-    orig_lora_down_key = "head.head.lora_down.weight"
-    orig_lora_up_key = "head.head.lora_up.weight"
-    target_base_key_peft = "proj_out"  # Directly under transformer in Diffusers DiT
-    target_base_key_manual = "transformer.proj_out"
-    if orig_lora_down_key in original_state_dict and orig_lora_up_key in original_state_dict:
-        converted_state_dict_for_peft[f"{target_base_key_peft}.lora_A.weight"] = original_state_dict.pop(orig_lora_down_key)
-        converted_state_dict_for_peft[f"{target_base_key_peft}.lora_B.weight"] = original_state_dict.pop(orig_lora_up_key)
-    orig_diff_b_key = "head.head.diff_b"
-    if orig_diff_b_key in original_state_dict:
-        manual_diff_patches[f"{target_base_key_manual}.bias"] = original_state_dict.pop(orig_diff_b_key)
-
-    # Log any remaining keys from the original LoRA after stripping "diffusion_model."
-    if len(original_state_dict) > 0:
         logger.warning(
-            f"Following keys from LoRA (after stripping 'diffusion_model.') were not converted or explicitly handled for PEFT/manual patching: {original_state_dict.keys()}"
         )

-    # Add "transformer." prefix for Diffusers LoraLoaderMixin to the PEFT keys
-    final_peft_state_dict = {}
-    for k_peft, v_peft in converted_state_dict_for_peft.items():
-        final_peft_state_dict[f"transformer.{k_peft}"] = v_peft
-
-    MANUAL_PATCHES_STORE = manual_diff_patches  # Store for later use
-    return final_peft_state_dict


-def apply_manual_diff_patches(pipe_model, patches):
-    """
-    Manually applies diff_b/diff patches to the model.
-    Assumes PEFT LoRA layers have already been loaded.
-    """
-    if not patches:
         logger.info("No manual diff patches to apply.")
         return

-    logger.info(f"Applying {len(patches)} manual diff patches...")
-    patched_keys_count = 0
-    unpatched_keys_count = 0
-    skipped_keys_details = []
-
-    for key, diff_tensor in patches.items():
         try:
-            # key is like "transformer.blocks.0.attn1.to_q.bias"
-            current_module = pipe_model  # Starts from pipe.transformer
-            path_parts = key.split('.')[1:]  # Remove "transformer." prefix for getattr navigation
-            # e.g., ["blocks", "0", "attn1", "to_q", "bias"]
-
-            # Navigate to the parent module of the parameter
-            # Example: for "blocks.0.attn1.to_q.bias", parent_module_path is "blocks.0.attn1.to_q"
-            parent_module_path = path_parts[:-1]
-            param_name_to_patch = path_parts[-1]  # "bias" or "weight"
-
-            for part in parent_module_path:
-                if hasattr(current_module, part):
-                    current_module = getattr(current_module, part)
-                elif hasattr(current_module, 'base_layer') and hasattr(current_module.base_layer, part):
-                    # This case is unlikely here as we are navigating *to* the layer,
-                    # not trying to access a sub-component of a base_layer.
-                    # PEFT wrapping affects the layer itself, not its parent structure.
-                    current_module = getattr(current_module.base_layer, part)
-                else:
-                    raise AttributeError(f"Submodule '{part}' not found in path '{'.'.join(parent_module_path)}' within {key}")
-
-            # Now, current_module is the layer whose parameter we want to patch
-            # e.g., if key was transformer.blocks.0.attn1.to_q.bias,
-            # current_module is the to_q Linear layer (or LoraLayer wrapping it)
-
-            layer_to_modify = current_module
-            # If PEFT wrapped the Linear layer (common for attention q,k,v,o and ffn projections)
-            if hasattr(layer_to_modify, "base_layer") and isinstance(layer_to_modify.base_layer, (torch.nn.Linear, torch.nn.LayerNorm)):
-                actual_param_owner = layer_to_modify.base_layer
-            else:  # For non-wrapped layers like LayerNorm, or if it's already the base_layer
-                actual_param_owner = layer_to_modify
-
-            if not hasattr(actual_param_owner, param_name_to_patch):
-                skipped_keys_details.append(f"Key: {key}, Reason: Parameter '{param_name_to_patch}' not found in layer '{actual_param_owner}'. Layer type: {type(actual_param_owner)}")
-                unpatched_keys_count += 1
-                continue

-            original_param = getattr(actual_param_owner, param_name_to_patch)
-
-            if original_param is None and param_name_to_patch == "bias":
-                logger.info(f"Key '{key}': Original bias is None. Attempting to initialize.")
-                if isinstance(actual_param_owner, torch.nn.Linear) or isinstance(actual_param_owner, torch.nn.LayerNorm):
-                    # For LayerNorm, bias exists if elementwise_affine=True (default).
-                    # If it was False, we are making it affine by adding a bias.
-                    # For Linear, if bias was False, we are adding one.
-                    actual_param_owner.bias = torch.nn.Parameter(torch.zeros_like(diff_tensor, device=diff_tensor.device, dtype=diff_tensor.dtype))
-                    original_param = actual_param_owner.bias
-                    logger.info(f"Key '{key}': Initialized bias for {type(actual_param_owner)}.")
-                else:
-                    skipped_keys_details.append(f"Key: {key}, Reason: Original bias is None and layer '{actual_param_owner}' is not Linear or LayerNorm. Cannot initialize.")
-                    unpatched_keys_count += 1
-                    continue
-
-            # Special handling for RMSNorm which typically has no bias
-            if isinstance(actual_param_owner, torch.nn.RMSNorm) and param_name_to_patch == "bias":
-                skipped_keys_details.append(f"Key: {key}, Reason: Layer '{actual_param_owner}' is RMSNorm which has no bias parameter. Skipping bias diff.")
-                unpatched_keys_count += 1
                 continue

-
-            if original_param is not None:
-                if original_param.shape != diff_tensor.shape:
-                    skipped_keys_details.append(f"Key: {key}, Reason: Shape mismatch. Model param: {original_param.shape}, LoRA diff: {diff_tensor.shape}. Layer: {actual_param_owner}")
-                    unpatched_keys_count += 1
-                    continue
-                with torch.no_grad():
-                    original_param.add_(diff_tensor.to(original_param.device, original_param.dtype))
-                # logger.info(f"Successfully applied diff to '{key}'")  # Too verbose, will log summary
-                patched_keys_count += 1
-            else:
-                skipped_keys_details.append(f"Key: {key}, Reason: Original parameter '{param_name_to_patch}' is None and was not initialized. Layer: {actual_param_owner}")
-                unpatched_keys_count += 1
-
-        except AttributeError as e:
-            skipped_keys_details.append(f"Key: {key}, Reason: AttributeError - {e}")
-            unpatched_keys_count += 1
         except Exception as e:
-            skipped_keys_details.append(f"Key: {key}, Reason: General Exception - {e}")
-            unpatched_keys_count += 1
-
-    logger.info(f"Manual patching summary: {patched_keys_count} keys patched, {unpatched_keys_count} keys failed or skipped.")
-    if unpatched_keys_count > 0:
-        logger.warning("Details of unpatched/skipped keys:")
-        for detail in skipped_keys_details:
-            logger.warning(f" - {detail}")

 # --- Model Loading ---
 logger.info(f"Loading VAE for {MODEL_ID}...")
@@ -326,21 +290,19 @@ logger.info("Loading LoRA weights with custom converter...")
 from safetensors.torch import load_file as load_safetensors
 raw_lora_state_dict = load_safetensors(causvid_path)

-# Now call our custom converter which will populate MANUAL_PATCHES_STORE
 peft_state_dict = _custom_convert_non_diffusers_wan_lora_to_diffusers(raw_lora_state_dict)

-# Load the LoRA A/B matrices using PEFT
 if peft_state_dict:
     pipe.load_lora_weights(
-        peft_state_dict,  # Pass the dictionary directly
         adapter_name="causvid_lora"
     )
     logger.info("PEFT LoRA A/B weights loaded.")
 else:
     logger.warning("No PEFT-compatible LoRA weights found after conversion.")

-# Apply manual diff_b and diff patches
-apply_manual_diff_patches(pipe.transformer, MANUAL_PATCHES_STORE)  # Apply to the transformer component
 logger.info("Manual diff_b/diff patches applied.")

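As an aside (an editorial illustration, not part of the committed file): both the removed and the added converter implement the same key-name translation from the Wan 2.1 checkpoint layout into Diffusers/PEFT keys on the transformer. A few representative mappings, written as a hypothetical dict for reference:

    # Wan 2.1 LoRA key (after the "diffusion_model." prefix is stripped) -> Diffusers / PEFT key
    wan_to_diffusers_examples = {
        "blocks.0.self_attn.q.lora_down.weight": "transformer.blocks.0.attn1.to_q.lora_A.weight",
        "blocks.0.cross_attn.o.lora_up.weight": "transformer.blocks.0.attn2.to_out.0.lora_B.weight",
        "text_embedding.0.lora_down.weight": "transformer.condition_embedder.text_embedder.linear_1.lora_A.weight",
        "blocks.0.ffn.0.diff_b": "transformer.blocks.0.ffn.net.0.proj.bias",  # handled as a manual diff_b patch, not via PEFT
    }

The updated version of app.py (the right-hand column of this diff) follows.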
 
 
@@ -22,277 +22,241 @@ logger = logging.getLogger(__name__)
 MANUAL_PATCHES_STORE = {}
 
 def _custom_convert_non_diffusers_wan_lora_to_diffusers(state_dict):
     global MANUAL_PATCHES_STORE
+    MANUAL_PATCHES_STORE = {}  # Clear previous patches
+    peft_state_dict = {}
+    unhandled_keys = []

+    original_keys = list(state_dict.keys())

+    processed_state_dict = {}
+    for k, v in state_dict.items():
+        if k.startswith("diffusion_model."):
+            processed_state_dict[k[len("diffusion_model."):]] = v
+        elif k.startswith("difusion_model."):  # Handle potential typo
+            processed_state_dict[k[len("difusion_model."):]] = v
+        else:
+            unhandled_keys.append(k)  # Will be logged later if not handled by diff/diff_b

     block_indices = set()
+    for k_proc in processed_state_dict:
+        if k_proc.startswith("blocks."):
             try:
+                block_idx_str = k_proc.split("blocks.")[1].split(".")[0]
                 if block_idx_str.isdigit():
                     block_indices.add(int(block_idx_str))
+            except IndexError:
+                pass  # Will be handled as a non-block key or logged
+    num_blocks = 0
+    if block_indices:
+        num_blocks = max(block_indices) + 1
+
+    is_i2v_lora = any("k_img" in k for k in processed_state_dict) and \
+                  any("v_img" in k for k in processed_state_dict)
+
+    handled_original_keys = set()
+
+    # --- Handle Block-level LoRAs & Diffs ---
+    for i in range(num_blocks):
+        # Self-attention (maps to attn1 in WanTransformerBlock)
+        for o_lora, c_diffusers in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
+            lora_down_key_proc = f"blocks.{i}.self_attn.{o_lora}.lora_down.weight"
+            lora_up_key_proc = f"blocks.{i}.self_attn.{o_lora}.lora_up.weight"
+            diff_b_key_proc = f"blocks.{i}.self_attn.{o_lora}.diff_b"
+            diff_w_key_proc = f"blocks.{i}.self_attn.{o_lora}.diff"  # Assuming .diff for weight
+
+            if lora_down_key_proc in processed_state_dict and lora_up_key_proc in processed_state_dict:
+                peft_state_dict[f"transformer.blocks.{i}.attn1.{c_diffusers}.lora_A.weight"] = processed_state_dict[lora_down_key_proc]
+                peft_state_dict[f"transformer.blocks.{i}.attn1.{c_diffusers}.lora_B.weight"] = processed_state_dict[lora_up_key_proc]
+                handled_original_keys.add(f"diffusion_model.{lora_down_key_proc}")
+                handled_original_keys.add(f"diffusion_model.{lora_up_key_proc}")
+            if diff_b_key_proc in processed_state_dict:
+                target_bias_key = f"transformer.blocks.{i}.attn1.{c_diffusers}.bias"
+                MANUAL_PATCHES_STORE[target_bias_key] = ("diff_b", processed_state_dict[diff_b_key_proc])
+                handled_original_keys.add(f"diffusion_model.{diff_b_key_proc}")
+            if diff_w_key_proc in processed_state_dict:
+                target_weight_key = f"transformer.blocks.{i}.attn1.{c_diffusers}.weight"
+                MANUAL_PATCHES_STORE[target_weight_key] = ("diff", processed_state_dict[diff_w_key_proc])
+                handled_original_keys.add(f"diffusion_model.{diff_w_key_proc}")
+
+        # Cross-attention (maps to attn2 in WanTransformerBlock)
+        for o_lora, c_diffusers in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
+            lora_down_key_proc = f"blocks.{i}.cross_attn.{o_lora}.lora_down.weight"
+            lora_up_key_proc = f"blocks.{i}.cross_attn.{o_lora}.lora_up.weight"
+            diff_b_key_proc = f"blocks.{i}.cross_attn.{o_lora}.diff_b"
+            diff_w_key_proc = f"blocks.{i}.cross_attn.{o_lora}.diff"
+            norm_q_diff_key_proc = f"blocks.{i}.cross_attn.norm_q.diff"  # specific norm diff
+            norm_k_diff_key_proc = f"blocks.{i}.cross_attn.norm_k.diff"  # specific norm diff
+
+            if lora_down_key_proc in processed_state_dict and lora_up_key_proc in processed_state_dict:
+                peft_state_dict[f"transformer.blocks.{i}.attn2.{c_diffusers}.lora_A.weight"] = processed_state_dict[lora_down_key_proc]
+                peft_state_dict[f"transformer.blocks.{i}.attn2.{c_diffusers}.lora_B.weight"] = processed_state_dict[lora_up_key_proc]
+                handled_original_keys.add(f"diffusion_model.{lora_down_key_proc}")
+                handled_original_keys.add(f"diffusion_model.{lora_up_key_proc}")
+            if diff_b_key_proc in processed_state_dict:
+                target_bias_key = f"transformer.blocks.{i}.attn2.{c_diffusers}.bias"
+                MANUAL_PATCHES_STORE[target_bias_key] = ("diff_b", processed_state_dict[diff_b_key_proc])
+                handled_original_keys.add(f"diffusion_model.{diff_b_key_proc}")
+            if diff_w_key_proc in processed_state_dict:
+                target_weight_key = f"transformer.blocks.{i}.attn2.{c_diffusers}.weight"
+                MANUAL_PATCHES_STORE[target_weight_key] = ("diff", processed_state_dict[diff_w_key_proc])
+                handled_original_keys.add(f"diffusion_model.{diff_w_key_proc}")
+
+            if norm_q_diff_key_proc in processed_state_dict:  # Assuming norm_q on q_proj
+                MANUAL_PATCHES_STORE[f"transformer.blocks.{i}.attn2.norm_q.weight"] = ("diff", processed_state_dict[norm_q_diff_key_proc])
+                handled_original_keys.add(f"diffusion_model.{norm_q_diff_key_proc}")
+            if norm_k_diff_key_proc in processed_state_dict:  # Assuming norm_k on k_proj
+                MANUAL_PATCHES_STORE[f"transformer.blocks.{i}.attn2.norm_k.weight"] = ("diff", processed_state_dict[norm_k_diff_key_proc])
+                handled_original_keys.add(f"diffusion_model.{norm_k_diff_key_proc}")
+
+
+        if is_i2v_lora:
+            for o_lora, c_diffusers in zip(["k_img", "v_img"], ["add_k_proj", "add_v_proj"]):
+                lora_down_key_proc = f"blocks.{i}.cross_attn.{o_lora}.lora_down.weight"
+                lora_up_key_proc = f"blocks.{i}.cross_attn.{o_lora}.lora_up.weight"
+                diff_b_key_proc = f"blocks.{i}.cross_attn.{o_lora}.diff_b"
+                diff_w_key_proc = f"blocks.{i}.cross_attn.{o_lora}.diff"
+
+                if lora_down_key_proc in processed_state_dict and lora_up_key_proc in processed_state_dict:
+                    peft_state_dict[f"transformer.blocks.{i}.attn2.{c_diffusers}.lora_A.weight"] = processed_state_dict[lora_down_key_proc]
+                    peft_state_dict[f"transformer.blocks.{i}.attn2.{c_diffusers}.lora_B.weight"] = processed_state_dict[lora_up_key_proc]
+                    handled_original_keys.add(f"diffusion_model.{lora_down_key_proc}")
+                    handled_original_keys.add(f"diffusion_model.{lora_up_key_proc}")
+                if diff_b_key_proc in processed_state_dict:
+                    target_bias_key = f"transformer.blocks.{i}.attn2.{c_diffusers}.bias"
+                    MANUAL_PATCHES_STORE[target_bias_key] = ("diff_b", processed_state_dict[diff_b_key_proc])
+                    handled_original_keys.add(f"diffusion_model.{diff_b_key_proc}")
+                if diff_w_key_proc in processed_state_dict:
+                    target_weight_key = f"transformer.blocks.{i}.attn2.{c_diffusers}.weight"
+                    MANUAL_PATCHES_STORE[target_weight_key] = ("diff", processed_state_dict[diff_w_key_proc])
+                    handled_original_keys.add(f"diffusion_model.{diff_w_key_proc}")

         # FFN
+        for o_lora_suffix, c_diffusers_path in zip([".0", ".2"], ["net.0.proj", "net.2"]):
+            lora_down_key_proc = f"blocks.{i}.ffn{o_lora_suffix}.lora_down.weight"
+            lora_up_key_proc = f"blocks.{i}.ffn{o_lora_suffix}.lora_up.weight"
+            diff_b_key_proc = f"blocks.{i}.ffn{o_lora_suffix}.diff_b"
+            diff_w_key_proc = f"blocks.{i}.ffn{o_lora_suffix}.diff"  # Assuming .diff for weight
+
+            if lora_down_key_proc in processed_state_dict and lora_up_key_proc in processed_state_dict:
+                peft_state_dict[f"transformer.blocks.{i}.ffn.{c_diffusers_path}.lora_A.weight"] = processed_state_dict[lora_down_key_proc]
+                peft_state_dict[f"transformer.blocks.{i}.ffn.{c_diffusers_path}.lora_B.weight"] = processed_state_dict[lora_up_key_proc]
+                handled_original_keys.add(f"diffusion_model.{lora_down_key_proc}")
+                handled_original_keys.add(f"diffusion_model.{lora_up_key_proc}")
+            if diff_b_key_proc in processed_state_dict:
+                target_bias_key = f"transformer.blocks.{i}.ffn.{c_diffusers_path}.bias"
+                MANUAL_PATCHES_STORE[target_bias_key] = ("diff_b", processed_state_dict[diff_b_key_proc])
+                handled_original_keys.add(f"diffusion_model.{diff_b_key_proc}")
+            if diff_w_key_proc in processed_state_dict:
+                target_weight_key = f"transformer.blocks.{i}.ffn.{c_diffusers_path}.weight"
+                MANUAL_PATCHES_STORE[target_weight_key] = ("diff", processed_state_dict[diff_w_key_proc])
+                handled_original_keys.add(f"diffusion_model.{diff_w_key_proc}")
+
+        # Block norm3 diffs (assuming norm3 applies to the output of the FFN in the original Wan block structure)
+        norm3_diff_key_proc = f"blocks.{i}.norm3.diff"
+        norm3_diff_b_key_proc = f"blocks.{i}.norm3.diff_b"
+        if norm3_diff_key_proc in processed_state_dict:
+            MANUAL_PATCHES_STORE[f"transformer.blocks.{i}.norm3.weight"] = ("diff", processed_state_dict[norm3_diff_key_proc])  # Norms usually have .weight
+            handled_original_keys.add(f"diffusion_model.{norm3_diff_key_proc}")
+        if norm3_diff_b_key_proc in processed_state_dict:
+            MANUAL_PATCHES_STORE[f"transformer.blocks.{i}.norm3.bias"] = ("diff_b", processed_state_dict[norm3_diff_b_key_proc])  # And .bias
+            handled_original_keys.add(f"diffusion_model.{norm3_diff_b_key_proc}")
+
+
+    # --- Handle Top-level LoRAs & Diffs ---
+    top_level_mappings = [
+        # (lora_base_path_proc, diffusers_base_path, lora_suffixes, diffusers_suffixes)
+        ("text_embedding", "transformer.condition_embedder.text_embedder", ["0", "2"], ["linear_1", "linear_2"]),
+        ("time_embedding", "transformer.condition_embedder.time_embedder", ["0", "2"], ["linear_1", "linear_2"]),
+        ("time_projection", "transformer.condition_embedder.time_proj", ["1"], [""]),  # Wan has .1, Diffusers has no suffix
+        ("head", "transformer.proj_out", ["head"], [""]),  # Wan has .head, Diffusers has no suffix
+    ]
+
+    for lora_base_proc, diffusers_base, lora_suffixes, diffusers_suffixes in top_level_mappings:
+        for l_suffix, d_suffix in zip(lora_suffixes, diffusers_suffixes):
+            actual_lora_path_proc = f"{lora_base_proc}.{l_suffix}" if l_suffix else lora_base_proc
+            actual_diffusers_path = f"{diffusers_base}.{d_suffix}" if d_suffix else diffusers_base
+
+            lora_down_key_proc = f"{actual_lora_path_proc}.lora_down.weight"
+            lora_up_key_proc = f"{actual_lora_path_proc}.lora_up.weight"
+            diff_b_key_proc = f"{actual_lora_path_proc}.diff_b"
+            diff_w_key_proc = f"{actual_lora_path_proc}.diff"
+
+            if lora_down_key_proc in processed_state_dict and lora_up_key_proc in processed_state_dict:
+                peft_state_dict[f"{actual_diffusers_path}.lora_A.weight"] = processed_state_dict[lora_down_key_proc]
+                peft_state_dict[f"{actual_diffusers_path}.lora_B.weight"] = processed_state_dict[lora_up_key_proc]
+                handled_original_keys.add(f"diffusion_model.{lora_down_key_proc}")
+                handled_original_keys.add(f"diffusion_model.{lora_up_key_proc}")
+            if diff_b_key_proc in processed_state_dict:
+                MANUAL_PATCHES_STORE[f"{actual_diffusers_path}.bias"] = ("diff_b", processed_state_dict[diff_b_key_proc])
+                handled_original_keys.add(f"diffusion_model.{diff_b_key_proc}")
+            if diff_w_key_proc in processed_state_dict:
+                MANUAL_PATCHES_STORE[f"{actual_diffusers_path}.weight"] = ("diff", processed_state_dict[diff_w_key_proc])
+                handled_original_keys.add(f"diffusion_model.{diff_w_key_proc}")
+
     # Patch Embedding
     patch_emb_diff_b_key = "patch_embedding.diff_b"
+    if patch_emb_diff_b_key in processed_state_dict:
+        MANUAL_PATCHES_STORE["transformer.patch_embedding.bias"] = ("diff_b", processed_state_dict[patch_emb_diff_b_key])
+        handled_original_keys.add(f"diffusion_model.{patch_emb_diff_b_key}")
+    # Assuming .diff might exist for patch_embedding.weight, though not explicitly in your example list
+    patch_emb_diff_w_key = "patch_embedding.diff"
+    if patch_emb_diff_w_key in processed_state_dict:
+        MANUAL_PATCHES_STORE["transformer.patch_embedding.weight"] = ("diff", processed_state_dict[patch_emb_diff_w_key])
+        handled_original_keys.add(f"diffusion_model.{patch_emb_diff_w_key}")
+
+
+    # Log unhandled keys
+    final_unhandled_keys = []
+    for k_orig in original_keys:
+        # Reconstruct the processed key to check if it was actually handled by diff/diff_b or lora A/B logic
+        k_proc = None
+        if k_orig.startswith("diffusion_model."):
+            k_proc = k_orig[len("diffusion_model."):]
+        elif k_orig.startswith("difusion_model."):
+            k_proc = k_orig[len("difusion_model."):]
+
+        if k_orig not in handled_original_keys and (k_proc is None or not any(k_proc.endswith(s) for s in [".lora_down.weight", ".lora_up.weight", ".diff", ".diff_b", ".alpha"])):
+            final_unhandled_keys.append(k_orig)
+
+    if final_unhandled_keys:
         logger.warning(
+            f"The following keys from the Wan 2.1 LoRA checkpoint were not converted to PEFT LoRA A/B format "
+            f"nor identified as manual diff patches: {final_unhandled_keys}."
         )

+    if not peft_state_dict and not MANUAL_PATCHES_STORE:
+        logger.warning("No valid LoRA A/B weights or manual diff patches found after conversion.")

+    return peft_state_dict

+def apply_manual_diff_patches(pipe_model_component, patches_store, strength_model=1.0):
+    if not patches_store:
         logger.info("No manual diff patches to apply.")
         return

+    logger.info(f"Applying {len(patches_store)} manual diff patches...")
+    for target_key, (patch_type, diff_tensor) in patches_store.items():
         try:
+            module_path, param_name = target_key.rsplit('.', 1)
+            module = pipe_model_component.get_submodule(module_path)
+            original_param = getattr(module, param_name)

+            if original_param.shape != diff_tensor.shape:
+                logger.warning(f"Shape mismatch for {target_key}: model {original_param.shape}, LoRA {diff_tensor.shape}. Skipping patch.")
                 continue

+            with torch.no_grad():
+                # Ensure diff_tensor is on the same device and dtype as the original parameter
+                diff_tensor_casted = diff_tensor.to(device=original_param.device, dtype=original_param.dtype)
+                scaled_diff = diff_tensor_casted * strength_model
+                original_param.add_(scaled_diff)
+            # logger.info(f"Applied {patch_type} to {target_key} with strength {strength_model}")
+        except AttributeError:
+            logger.warning(f"Could not find parameter {target_key} in the model component. Skipping patch.")
         except Exception as e:
+            logger.error(f"Error applying patch to {target_key}: {e}")
+    logger.info("Finished applying manual diff patches.")

 # --- Model Loading ---
 logger.info(f"Loading VAE for {MODEL_ID}...")

@@ -326,21 +290,19 @@ logger.info("Loading LoRA weights with custom converter...")
 from safetensors.torch import load_file as load_safetensors
 raw_lora_state_dict = load_safetensors(causvid_path)

 peft_state_dict = _custom_convert_non_diffusers_wan_lora_to_diffusers(raw_lora_state_dict)

 if peft_state_dict:
     pipe.load_lora_weights(
+        peft_state_dict,
         adapter_name="causvid_lora"
     )
     logger.info("PEFT LoRA A/B weights loaded.")
 else:
     logger.warning("No PEFT-compatible LoRA weights found after conversion.")

+lora_strength = 1.0
+apply_manual_diff_patches(pipe.transformer, MANUAL_PATCHES_STORE, strength_model=lora_strength)
 logger.info("Manual diff_b/diff patches applied.")

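For reference, a minimal sketch of how the two pieces changed in this commit fit together at load time. It assumes, as in the rest of app.py, that the Wan 2.1 Diffusers pipeline has already been constructed as pipe and that causvid_path points to the LoRA .safetensors file being loaded; the helper names are the ones defined in the diff above, and nothing here is additional committed code.

    from safetensors.torch import load_file as load_safetensors

    raw_lora_state_dict = load_safetensors(causvid_path)

    # 1) Split the checkpoint: lora_down/lora_up pairs become PEFT lora_A/lora_B keys,
    #    while diff/diff_b tensors are stashed in MANUAL_PATCHES_STORE as (type, tensor) tuples.
    peft_state_dict = _custom_convert_non_diffusers_wan_lora_to_diffusers(raw_lora_state_dict)

    # 2) Load the A/B matrices through the standard Diffusers LoRA path.
    if peft_state_dict:
        pipe.load_lora_weights(peft_state_dict, adapter_name="causvid_lora")

    # 3) Add the stored weight/bias offsets in place onto the transformer's parameters.
    apply_manual_diff_patches(pipe.transformer, MANUAL_PATCHES_STORE, strength_model=1.0)

This mirrors the call sequence in the second hunk; the one knob the new code introduces is strength_model, which scales the diff tensors before they are added to the base parameters.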