Muhammad Taqi Raza
committed on
Commit
·
0d2f841
1
Parent(s):
509e4d7
print shapes
Browse files
- cogvideo_controlnet_pcd.py +19 -1
- controlnet_pipeline.py +1 -0
- inference/cli_demo_camera_i2v_pcd.py +2 -0
cogvideo_controlnet_pcd.py
CHANGED
@@ -85,6 +85,20 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
85 |
patch_embed_in_channels = vae_channels*2
|
86 |
|
87 |
# 1. Patch embedding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
self.patch_embed = CogVideoXPatchEmbed(
|
89 |
patch_size=patch_size,
|
90 |
in_channels=patch_embed_in_channels,
|
@@ -99,7 +113,6 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
99 |
use_positional_embeddings=not use_rotary_positional_embeddings,
|
100 |
use_learned_positional_embeddings=use_learned_positional_embeddings,
|
101 |
)
|
102 |
-
|
103 |
self.embedding_dropout = nn.Dropout(dropout)
|
104 |
|
105 |
# 2. Time embeddings
|
@@ -225,6 +238,11 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
225 |
image_rotary_emb=image_rotary_emb,
|
226 |
)
|
227 |
|
|
|
|
|
|
|
|
|
|
|
228 |
if self.out_projectors is not None:
|
229 |
if controlnet_output_mask is not None:
|
230 |
controlnet_hidden_states += (self.out_projectors[i](hidden_states) * controlnet_output_mask,)
|
|
|
85 |
patch_embed_in_channels = vae_channels*2
|
86 |
|
87 |
# 1. Patch embedding
|
88 |
+
# self.patch_embed = CogVideoXPatchEmbed(
|
89 |
+
# patch_size=patch_size,
|
90 |
+
# in_channels=patch_embed_in_channels,
|
91 |
+
# embed_dim=inner_dim,
|
92 |
+
# bias=True,
|
93 |
+
# sample_width=sample_width,
|
94 |
+
# sample_height=sample_height,
|
95 |
+
# sample_frames=sample_frames,
|
96 |
+
# temporal_compression_ratio=temporal_compression_ratio,
|
97 |
+
# spatial_interpolation_scale=spatial_interpolation_scale,
|
98 |
+
# temporal_interpolation_scale=temporal_interpolation_scale,
|
99 |
+
# use_positional_embeddings=not use_rotary_positional_embeddings,
|
100 |
+
# use_learned_positional_embeddings=use_learned_positional_embeddings,
|
101 |
+
# )
|
102 |
self.patch_embed = CogVideoXPatchEmbed(
|
103 |
patch_size=patch_size,
|
104 |
in_channels=patch_embed_in_channels,
|
|
|
113 |
use_positional_embeddings=not use_rotary_positional_embeddings,
|
114 |
use_learned_positional_embeddings=use_learned_positional_embeddings,
|
115 |
)
|
|
|
116 |
self.embedding_dropout = nn.Dropout(dropout)
|
117 |
|
118 |
# 2. Time embeddings
|
|
|
238 |
image_rotary_emb=image_rotary_emb,
|
239 |
)
|
240 |
|
241 |
+
print("hidden_states shape:", hidden_states.shape)
|
242 |
+
print("out_projectors[i](hidden_states) shape:", self.out_projectors[i](hidden_states).shape)
|
243 |
+
print("controlnet_output_mask shape:", controlnet_output_mask.shape)
|
244 |
+
|
245 |
+
|
246 |
if self.out_projectors is not None:
|
247 |
if controlnet_output_mask is not None:
|
248 |
controlnet_hidden_states += (self.out_projectors[i](hidden_states) * controlnet_output_mask,)
|
controlnet_pipeline.py
CHANGED
@@ -733,6 +733,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
|
|
733 |
controlnet_output_mask = controlnet_output_mask,
|
734 |
timestep=timestep,
|
735 |
return_dict=False,
|
|
|
736 |
)[0]
|
737 |
if isinstance(controlnet_states, (tuple, list)):
|
738 |
controlnet_states = [x.to(dtype=self.transformer.dtype) for x in controlnet_states]
|
|
|
733 |
controlnet_output_mask = controlnet_output_mask,
|
734 |
timestep=timestep,
|
735 |
return_dict=False,
|
736 |
+
|
737 |
)[0]
|
738 |
if isinstance(controlnet_states, (tuple, list)):
|
739 |
controlnet_states = [x.to(dtype=self.transformer.dtype) for x in controlnet_states]
|
inference/cli_demo_camera_i2v_pcd.py
CHANGED
@@ -218,6 +218,8 @@ def generate_video(
|
|
218 |
in_channels=controlnet_input_channels,
|
219 |
use_zero_conv=use_zero_conv,
|
220 |
sample_frames = num_frames, # 49 frames
|
|
|
|
|
221 |
**controlnet_kwargs,
|
222 |
)
|
223 |
if controlnet_model_path:
|
|
|
218 |
in_channels=controlnet_input_channels,
|
219 |
use_zero_conv=use_zero_conv,
|
220 |
sample_frames = num_frames, # 49 frames
|
221 |
+
sample_height= height, # 480
|
222 |
+
sample_width= width, # 720
|
223 |
**controlnet_kwargs,
|
224 |
)
|
225 |
if controlnet_model_path:
|