Muhammad Taqi Raza committed on
Commit
16c40e7
·
1 Parent(s): cdb41ad
cogvideo_controlnet_pcd.py CHANGED
@@ -54,12 +54,12 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
54
  super().__init__()
55
  inner_dim = num_attention_heads * attention_head_dim
56
 
57
- if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
58
- raise ValueError(
59
- "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
60
- "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
61
- "issue at https://github.com/huggingface/diffusers/issues."
62
- )
63
 
64
  self.vae_channels = vae_channels
65
  start_channels = in_channels * (downscale_coef ** 2)
@@ -84,21 +84,6 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
84
  else:
85
  patch_embed_in_channels = vae_channels*2
86
 
87
- # 1. Patch embedding
88
- # self.patch_embed = CogVideoXPatchEmbed(
89
- # patch_size=patch_size,
90
- # in_channels=patch_embed_in_channels,
91
- # embed_dim=inner_dim,
92
- # bias=True,
93
- # sample_width=sample_width,
94
- # sample_height=sample_height,
95
- # sample_frames=sample_frames,
96
- # temporal_compression_ratio=temporal_compression_ratio,
97
- # spatial_interpolation_scale=spatial_interpolation_scale,
98
- # temporal_interpolation_scale=temporal_interpolation_scale,
99
- # use_positional_embeddings=not use_rotary_positional_embeddings,
100
- # use_learned_positional_embeddings=use_learned_positional_embeddings,
101
- # )
102
  self.patch_embed = CogVideoXPatchEmbed(
103
  patch_size=patch_size,
104
  in_channels=patch_embed_in_channels,
@@ -110,8 +95,8 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
110
  temporal_compression_ratio=temporal_compression_ratio,
111
  spatial_interpolation_scale=spatial_interpolation_scale,
112
  temporal_interpolation_scale=temporal_interpolation_scale,
113
- use_positional_embeddings=not use_rotary_positional_embeddings,
114
- use_learned_positional_embeddings=use_learned_positional_embeddings,
115
  )
116
  self.embedding_dropout = nn.Dropout(dropout)
117
 
 
54
  super().__init__()
55
  inner_dim = num_attention_heads * attention_head_dim
56
 
57
+ # if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
58
+ # raise ValueError(
59
+ # "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
60
+ # "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
61
+ # "issue at https://github.com/huggingface/diffusers/issues."
62
+ # )
63
 
64
  self.vae_channels = vae_channels
65
  start_channels = in_channels * (downscale_coef ** 2)
 
84
  else:
85
  patch_embed_in_channels = vae_channels*2
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  self.patch_embed = CogVideoXPatchEmbed(
88
  patch_size=patch_size,
89
  in_channels=patch_embed_in_channels,
 
95
  temporal_compression_ratio=temporal_compression_ratio,
96
  spatial_interpolation_scale=spatial_interpolation_scale,
97
  temporal_interpolation_scale=temporal_interpolation_scale,
98
+ use_positional_embeddings=False,
99
+ use_learned_positional_embeddings=False,
100
  )
101
  self.embedding_dropout = nn.Dropout(dropout)
102
 
inference/cli_demo_camera_i2v_pcd.py CHANGED
@@ -157,8 +157,8 @@ def generate_video(
157
  dtype: torch.dtype = torch.bfloat16,
158
  seed: int = 42,
159
  num_frames: int = 49,
160
- height: int = 480,
161
- width: int = 720,
162
  start_camera_idx: int = 0,
163
  end_camera_idx: int = 1,
164
  controlnet_transformer_num_attn_heads: int = None,
@@ -212,6 +212,7 @@ def generate_video(
212
  scheduler = CogVideoXDDIMScheduler.from_pretrained(
213
  base_model_path, subfolder="scheduler"
214
  )
 
215
  # ControlNet
216
  num_attention_heads_orig = 48 if "5b" in base_model_path.lower() else 30
217
  controlnet_kwargs = {}
@@ -252,7 +253,7 @@ def generate_video(
252
  controlnet=controlnet,
253
  scheduler=scheduler,
254
  ).to('cuda')
255
-
256
  # If you're using with lora, add this code
257
  if lora_path:
258
  pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1")
 
157
  dtype: torch.dtype = torch.bfloat16,
158
  seed: int = 42,
159
  num_frames: int = 49,
160
+ height: int = 480, # 768
161
+ width: int = 720, # 1360
162
  start_camera_idx: int = 0,
163
  end_camera_idx: int = 1,
164
  controlnet_transformer_num_attn_heads: int = None,
 
212
  scheduler = CogVideoXDDIMScheduler.from_pretrained(
213
  base_model_path, subfolder="scheduler"
214
  )
215
+
216
  # ControlNet
217
  num_attention_heads_orig = 48 if "5b" in base_model_path.lower() else 30
218
  controlnet_kwargs = {}
 
253
  controlnet=controlnet,
254
  scheduler=scheduler,
255
  ).to('cuda')
256
+
257
  # If you're using with lora, add this code
258
  if lora_path:
259
  pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1")
requirements.txt CHANGED
@@ -8,6 +8,9 @@ opencv-python>=4.10.0.84
8
  imageio>=2.35.1
9
  imageio-ffmpeg>=0.5.1
10
  sentencepiece>=0.2.0
 
 
 
11
  einops
12
  decord
13
  protobuf
 
8
  imageio>=2.35.1
9
  imageio-ffmpeg>=0.5.1
10
  sentencepiece>=0.2.0
11
+ SwissArmyTransformer>=0.4.12
12
+ openai>=1.54.0
13
+ pydantic>=2.10.3
14
  einops
15
  decord
16
  protobuf