Muhammad Taqi Raza committed
Commit 0d2f841 · 1 Parent(s): 509e4d7

print shapes

cogvideo_controlnet_pcd.py CHANGED
@@ -85,6 +85,20 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
         patch_embed_in_channels = vae_channels*2
 
         # 1. Patch embedding
+        # self.patch_embed = CogVideoXPatchEmbed(
+        #     patch_size=patch_size,
+        #     in_channels=patch_embed_in_channels,
+        #     embed_dim=inner_dim,
+        #     bias=True,
+        #     sample_width=sample_width,
+        #     sample_height=sample_height,
+        #     sample_frames=sample_frames,
+        #     temporal_compression_ratio=temporal_compression_ratio,
+        #     spatial_interpolation_scale=spatial_interpolation_scale,
+        #     temporal_interpolation_scale=temporal_interpolation_scale,
+        #     use_positional_embeddings=not use_rotary_positional_embeddings,
+        #     use_learned_positional_embeddings=use_learned_positional_embeddings,
+        # )
         self.patch_embed = CogVideoXPatchEmbed(
             patch_size=patch_size,
             in_channels=patch_embed_in_channels,
@@ -99,7 +113,6 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
             use_positional_embeddings=not use_rotary_positional_embeddings,
             use_learned_positional_embeddings=use_learned_positional_embeddings,
         )
-
         self.embedding_dropout = nn.Dropout(dropout)
 
         # 2. Time embeddings
@@ -225,6 +238,11 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
                 image_rotary_emb=image_rotary_emb,
             )
 
+            print("hidden_states shape:", hidden_states.shape)
+            print("out_projectors[i](hidden_states) shape:", self.out_projectors[i](hidden_states).shape)
+            print("controlnet_output_mask shape:", controlnet_output_mask.shape)
+
+
            if self.out_projectors is not None:
                if controlnet_output_mask is not None:
                    controlnet_hidden_states += (self.out_projectors[i](hidden_states) * controlnet_output_mask,)
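The prints added in the last hunk verify that the masked residual on the line below them broadcasts as intended: the projected hidden states and controlnet_output_mask must have compatible shapes before the elementwise multiply. A minimal runnable sketch of that operation follows; the (batch, seq_len, dim) token layout, the Linear stand-in for out_projectors[i], and the mask's trailing singleton dimension are all illustrative assumptions, not values confirmed by this commit.

# Minimal sketch of the masked projection inspected by the prints above.
# All shapes are illustrative assumptions; out_projector stands in for
# one entry of self.out_projectors.
import torch
import torch.nn as nn

batch, seq_len, dim = 2, 1350, 128                      # assumed token layout
hidden_states = torch.randn(batch, seq_len, dim)
out_projector = nn.Linear(dim, dim)                     # stand-in for out_projectors[i]
controlnet_output_mask = torch.ones(batch, seq_len, 1)  # assumed broadcastable mask

projected = out_projector(hidden_states)                # (batch, seq_len, dim)
masked = projected * controlnet_output_mask             # broadcasts over the last dim
print("hidden_states shape:", hidden_states.shape)
print("out_projector(hidden_states) shape:", projected.shape)
print("controlnet_output_mask shape:", controlnet_output_mask.shape)
print("masked output shape:", masked.shape)             # torch.Size([2, 1350, 128])

If the mask arrived as (batch, seq_len) instead of (batch, seq_len, 1), the multiply would raise a broadcasting error against the channel dimension, which is exactly the kind of mismatch these prints would surface.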
controlnet_pipeline.py CHANGED
@@ -733,6 +733,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
                     controlnet_output_mask = controlnet_output_mask,
                     timestep=timestep,
                     return_dict=False,
+
                 )[0]
                 if isinstance(controlnet_states, (tuple, list)):
                     controlnet_states = [x.to(dtype=self.transformer.dtype) for x in controlnet_states]
inference/cli_demo_camera_i2v_pcd.py CHANGED
@@ -218,6 +218,8 @@ def generate_video(
         in_channels=controlnet_input_channels,
         use_zero_conv=use_zero_conv,
         sample_frames = num_frames, # 49 frames
+        sample_height= height, # 480
+        sample_width= width, # 720
         **controlnet_kwargs,
     )
     if controlnet_model_path:
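Passing sample_height and sample_width alongside sample_frames matters when the patch embedding uses learned positional embeddings, since the embedding table is sized from these sample dimensions at construction time. Below is a rough sketch of the token-count arithmetic for the 480x720, 49-frame case; every scale factor in it (VAE spatial downsample of 8, patch size of 2, temporal compression of 4) is an assumed CogVideoX-style default, not a value read from this repository, and whether CogVideoXPatchEmbed expects pixel-space or latent-space sample dimensions depends on its implementation.

# Rough sketch: how sample dimensions translate into a patch-token count for
# positional embeddings. All scale factors below are assumed defaults.
height, width, num_frames = 480, 720, 49  # matches the comments in the diff

vae_scale_factor_spatial = 8      # assumed VAE spatial downsampling factor
patch_size = 2                    # assumed transformer patch size
temporal_compression_ratio = 4    # assumed VAE temporal compression

latent_frames = (num_frames - 1) // temporal_compression_ratio + 1      # 13
post_patch_height = height // (vae_scale_factor_spatial * patch_size)   # 30
post_patch_width = width // (vae_scale_factor_spatial * patch_size)     # 45

num_patch_tokens = latent_frames * post_patch_height * post_patch_width
print("patch tokens:", num_patch_tokens)  # 17550

If the constructed table does not match the token count produced at inference resolution, the positional-embedding lookup fails, which is consistent with this commit adding shape prints to debug such a mismatch.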