Muhammad Taqi Raza
committed on
Commit
·
34bf23a
1
Parent(s):
4ada65a
adding some prints
Browse files
cogvideo_controlnet_pcd.py
CHANGED
@@ -172,6 +172,7 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
172 |
):
|
173 |
print("hidden_states.shape =", hidden_states.shape)
|
174 |
print("controlnet_states.shape =", controlnet_states.shape)
|
|
|
175 |
|
176 |
hidden_states = torch.cat([hidden_states, controlnet_states], dim=2)
|
177 |
|
|
|
172 |
):
|
173 |
print("hidden_states.shape =", hidden_states.shape)
|
174 |
print("controlnet_states.shape =", controlnet_states.shape)
|
175 |
+
print("image_rotary_emb.shape =", image_rotary_emb.shape)
|
176 |
|
177 |
hidden_states = torch.cat([hidden_states, controlnet_states], dim=2)
|
178 |
|
controlnet_pipeline.py
CHANGED
@@ -659,7 +659,8 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
|
|
659 |
device, dtype=prompt_embeds.dtype
|
660 |
)
|
661 |
|
662 |
-
latent_channels = self.transformer.config.in_channels // 2 #
|
|
|
663 |
latents, image_latents = self.prepare_latents(
|
664 |
image,
|
665 |
batch_size * num_videos_per_prompt,
|
|
|
659 |
device, dtype=prompt_embeds.dtype
|
660 |
)
|
661 |
|
662 |
+
latent_channels = self.transformer.config.in_channels // 2 # 32//2 = 16
|
663 |
+
|
664 |
latents, image_latents = self.prepare_latents(
|
665 |
image,
|
666 |
batch_size * num_videos_per_prompt,
|
inference/cli_demo_camera_i2v_pcd.py
CHANGED
@@ -81,7 +81,6 @@ def maxpool_mask_tensor(mask_tensor):
|
|
81 |
# assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45"
|
82 |
assert H % 8 == 0 and W % 8 == 0, "H and W must be divisible by 8 for spatial pooling"
|
83 |
|
84 |
-
|
85 |
downsampling_factor_h = (H // 8) // 2
|
86 |
downsampling_factor_w = (W // 8) // 2
|
87 |
|
@@ -343,7 +342,7 @@ def generate_video(
|
|
343 |
if pool_style == 'max':
|
344 |
controlnet_output_mask = maxpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
|
345 |
elif pool_style == 'avg':
|
346 |
-
|
347 |
else:
|
348 |
controlnet_output_mask = None
|
349 |
# if os.path.isfile(output_path_file):
|
|
|
81 |
# assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45"
|
82 |
assert H % 8 == 0 and W % 8 == 0, "H and W must be divisible by 8 for spatial pooling"
|
83 |
|
|
|
84 |
downsampling_factor_h = (H // 8) // 2
|
85 |
downsampling_factor_w = (W // 8) // 2
|
86 |
|
|
|
342 |
if pool_style == 'max':
|
343 |
controlnet_output_mask = maxpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
|
344 |
elif pool_style == 'avg':
|
345 |
+
controlnet_output_mask = avgpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda')
|
346 |
else:
|
347 |
controlnet_output_mask = None
|
348 |
# if os.path.isfile(output_path_file):
|