Spaces:

roll-ai
/

EPiC

Paused

Muhammad Taqi Raza committed 2 days ago

Commit

34bf23a

1 Parent(s): 4ada65a

adding some prints

Files changed (3) hide show

cogvideo_controlnet_pcd.py CHANGED Viewed

@@ -172,6 +172,7 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
     ):
         print("hidden_states.shape =", hidden_states.shape)
         print("controlnet_states.shape =", controlnet_states.shape)
         hidden_states = torch.cat([hidden_states, controlnet_states], dim=2)

     ):
         print("hidden_states.shape =", hidden_states.shape)
         print("controlnet_states.shape =", controlnet_states.shape)
+        print("image_rotary_emb.shape =", image_rotary_emb.shape)
         hidden_states = torch.cat([hidden_states, controlnet_states], dim=2)

controlnet_pipeline.py CHANGED Viewed

@@ -659,7 +659,8 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
             device, dtype=prompt_embeds.dtype
         )
-        latent_channels = self.transformer.config.in_channels // 2  # 8
         latents, image_latents = self.prepare_latents(
             image,
             batch_size * num_videos_per_prompt,

             device, dtype=prompt_embeds.dtype
         )
+        latent_channels = self.transformer.config.in_channels // 2  # 32//2 = 16
         latents, image_latents = self.prepare_latents(
             image,
             batch_size * num_videos_per_prompt,

inference/cli_demo_camera_i2v_pcd.py CHANGED Viewed

@@ -81,7 +81,6 @@ def maxpool_mask_tensor(mask_tensor):
     # assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45"
     assert H % 8 == 0 and W % 8 == 0, "H and W must be divisible by 8 for spatial pooling"
     downsampling_factor_h = (H // 8) // 2
     downsampling_factor_w = (W // 8) // 2
@@ -343,7 +342,7 @@ def generate_video(
             if pool_style == 'max':
                 controlnet_output_mask = maxpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
             elif pool_style == 'avg':
-               controlnet_output_mask = avgpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
         else:
             controlnet_output_mask = None
         # if os.path.isfile(output_path_file):

     # assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45"
     assert H % 8 == 0 and W % 8 == 0, "H and W must be divisible by 8 for spatial pooling"
     downsampling_factor_h = (H // 8) // 2
     downsampling_factor_w = (W // 8) // 2
             if pool_style == 'max':
                 controlnet_output_mask = maxpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
             elif pool_style == 'avg':
+                controlnet_output_mask = avgpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
         else:
             controlnet_output_mask = None
         # if os.path.isfile(output_path_file):