Spaces:

roll-ai
/

EPiC

Paused

App Files Files Community

Muhammad Taqi Raza committed 2 days ago

Commit

16c40e7

1 Parent(s): cdb41ad

add files

Browse files

Files changed (3) hide show

cogvideo_controlnet_pcd.py +8 -23
inference/cli_demo_camera_i2v_pcd.py +4 -3
requirements.txt +3 -0

cogvideo_controlnet_pcd.py CHANGED Viewed

@@ -54,12 +54,12 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
         super().__init__()
         inner_dim = num_attention_heads * attention_head_dim
-        if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
-            raise ValueError(
-                "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
-                "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
-                "issue at https://github.com/huggingface/diffusers/issues."
-            )
         self.vae_channels = vae_channels
         start_channels = in_channels * (downscale_coef ** 2)
@@ -84,21 +84,6 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
         else:
             patch_embed_in_channels = vae_channels*2
-        # 1. Patch embedding
-        # self.patch_embed = CogVideoXPatchEmbed(
-        #     patch_size=patch_size,
-        #     in_channels=patch_embed_in_channels,
-        #     embed_dim=inner_dim,
-        #     bias=True,
-        #     sample_width=sample_width,
-        #     sample_height=sample_height,
-        #     sample_frames=sample_frames,
-        #     temporal_compression_ratio=temporal_compression_ratio,
-        #     spatial_interpolation_scale=spatial_interpolation_scale,
-        #     temporal_interpolation_scale=temporal_interpolation_scale,
-        #     use_positional_embeddings=not use_rotary_positional_embeddings,
-        #     use_learned_positional_embeddings=use_learned_positional_embeddings,
-        # )
         self.patch_embed = CogVideoXPatchEmbed(
             patch_size=patch_size,
             in_channels=patch_embed_in_channels,
@@ -110,8 +95,8 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
             temporal_compression_ratio=temporal_compression_ratio,
             spatial_interpolation_scale=spatial_interpolation_scale,
             temporal_interpolation_scale=temporal_interpolation_scale,
-            use_positional_embeddings=not use_rotary_positional_embeddings,
-            use_learned_positional_embeddings=use_learned_positional_embeddings,
         )
         self.embedding_dropout = nn.Dropout(dropout)

         super().__init__()
         inner_dim = num_attention_heads * attention_head_dim
+        # if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
+        #     raise ValueError(
+        #         "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
+        #         "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
+        #         "issue at https://github.com/huggingface/diffusers/issues."
+        #     )
         self.vae_channels = vae_channels
         start_channels = in_channels * (downscale_coef ** 2)
         else:
             patch_embed_in_channels = vae_channels*2
         self.patch_embed = CogVideoXPatchEmbed(
             patch_size=patch_size,
             in_channels=patch_embed_in_channels,
             temporal_compression_ratio=temporal_compression_ratio,
             spatial_interpolation_scale=spatial_interpolation_scale,
             temporal_interpolation_scale=temporal_interpolation_scale,
+            use_positional_embeddings=False,
+            use_learned_positional_embeddings=False,
         )
         self.embedding_dropout = nn.Dropout(dropout)

inference/cli_demo_camera_i2v_pcd.py CHANGED Viewed

@@ -157,8 +157,8 @@ def generate_video(
     dtype: torch.dtype = torch.bfloat16,
     seed: int = 42,
     num_frames: int = 49,
-    height: int = 480,
-    width: int = 720,
     start_camera_idx: int = 0,
     end_camera_idx: int = 1,
     controlnet_transformer_num_attn_heads: int = None,
@@ -212,6 +212,7 @@ def generate_video(
     scheduler = CogVideoXDDIMScheduler.from_pretrained(
         base_model_path, subfolder="scheduler"
     )
     # ControlNet
     num_attention_heads_orig = 48 if "5b" in base_model_path.lower() else 30
     controlnet_kwargs = {}
@@ -252,7 +253,7 @@ def generate_video(
         controlnet=controlnet,
         scheduler=scheduler,
     ).to('cuda')
     # If you're using with lora, add this code
     if lora_path:
         pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1")

     dtype: torch.dtype = torch.bfloat16,
     seed: int = 42,
     num_frames: int = 49,
+    height: int = 480,   # 768
+    width: int = 720,    # 1360
     start_camera_idx: int = 0,
     end_camera_idx: int = 1,
     controlnet_transformer_num_attn_heads: int = None,
     scheduler = CogVideoXDDIMScheduler.from_pretrained(
         base_model_path, subfolder="scheduler"
     )
     # ControlNet
     num_attention_heads_orig = 48 if "5b" in base_model_path.lower() else 30
     controlnet_kwargs = {}
         controlnet=controlnet,
         scheduler=scheduler,
     ).to('cuda')
     # If you're using with lora, add this code
     if lora_path:
         pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1")

requirements.txt CHANGED Viewed

@@ -8,6 +8,9 @@ opencv-python>=4.10.0.84
 imageio>=2.35.1
 imageio-ffmpeg>=0.5.1
 sentencepiece>=0.2.0
 einops
 decord
 protobuf

 imageio>=2.35.1
 imageio-ffmpeg>=0.5.1
 sentencepiece>=0.2.0
+SwissArmyTransformer>=0.4.12
+openai>=1.54.0
+pydantic>=2.10.3
 einops
 decord
 protobuf