Muhammad Taqi Raza committed on
Commit
34bf23a
·
1 Parent(s): 4ada65a

adding some prints

Browse files
cogvideo_controlnet_pcd.py CHANGED
@@ -172,6 +172,7 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
172
  ):
173
  print("hidden_states.shape =", hidden_states.shape)
174
  print("controlnet_states.shape =", controlnet_states.shape)
 
175
 
176
  hidden_states = torch.cat([hidden_states, controlnet_states], dim=2)
177
 
 
172
  ):
173
  print("hidden_states.shape =", hidden_states.shape)
174
  print("controlnet_states.shape =", controlnet_states.shape)
175
+ print("image_rotary_emb.shape =", image_rotary_emb.shape)
176
 
177
  hidden_states = torch.cat([hidden_states, controlnet_states], dim=2)
178
 
controlnet_pipeline.py CHANGED
@@ -659,7 +659,8 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
659
  device, dtype=prompt_embeds.dtype
660
  )
661
 
662
- latent_channels = self.transformer.config.in_channels // 2 # 8
 
663
  latents, image_latents = self.prepare_latents(
664
  image,
665
  batch_size * num_videos_per_prompt,
 
659
  device, dtype=prompt_embeds.dtype
660
  )
661
 
662
+ latent_channels = self.transformer.config.in_channels // 2 # 32//2 = 16
663
+
664
  latents, image_latents = self.prepare_latents(
665
  image,
666
  batch_size * num_videos_per_prompt,
inference/cli_demo_camera_i2v_pcd.py CHANGED
@@ -81,7 +81,6 @@ def maxpool_mask_tensor(mask_tensor):
81
  # assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45"
82
  assert H % 8 == 0 and W % 8 == 0, "H and W must be divisible by 8 for spatial pooling"
83
 
84
-
85
  downsampling_factor_h = (H // 8) // 2
86
  downsampling_factor_w = (W // 8) // 2
87
 
@@ -343,7 +342,7 @@ def generate_video(
343
  if pool_style == 'max':
344
  controlnet_output_mask = maxpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
345
  elif pool_style == 'avg':
346
- controlnet_output_mask = avgpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
347
  else:
348
  controlnet_output_mask = None
349
  # if os.path.isfile(output_path_file):
 
81
  # assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45"
82
  assert H % 8 == 0 and W % 8 == 0, "H and W must be divisible by 8 for spatial pooling"
83
 
 
84
  downsampling_factor_h = (H // 8) // 2
85
  downsampling_factor_w = (W // 8) // 2
86
 
 
342
  if pool_style == 'max':
343
  controlnet_output_mask = maxpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
344
  elif pool_style == 'avg':
345
+ controlnet_output_mask = avgpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
346
  else:
347
  controlnet_output_mask = None
348
  # if os.path.isfile(output_path_file):