Muhammad Taqi Raza
committed on
Commit
·
16c40e7
1
Parent(s):
cdb41ad
add files
Browse files
- cogvideo_controlnet_pcd.py +8 -23
- inference/cli_demo_camera_i2v_pcd.py +4 -3
- requirements.txt +3 -0
cogvideo_controlnet_pcd.py
CHANGED
@@ -54,12 +54,12 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
54 |
super().__init__()
|
55 |
inner_dim = num_attention_heads * attention_head_dim
|
56 |
|
57 |
-
if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
|
64 |
self.vae_channels = vae_channels
|
65 |
start_channels = in_channels * (downscale_coef ** 2)
|
@@ -84,21 +84,6 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
84 |
else:
|
85 |
patch_embed_in_channels = vae_channels*2
|
86 |
|
87 |
-
# 1. Patch embedding
|
88 |
-
# self.patch_embed = CogVideoXPatchEmbed(
|
89 |
-
# patch_size=patch_size,
|
90 |
-
# in_channels=patch_embed_in_channels,
|
91 |
-
# embed_dim=inner_dim,
|
92 |
-
# bias=True,
|
93 |
-
# sample_width=sample_width,
|
94 |
-
# sample_height=sample_height,
|
95 |
-
# sample_frames=sample_frames,
|
96 |
-
# temporal_compression_ratio=temporal_compression_ratio,
|
97 |
-
# spatial_interpolation_scale=spatial_interpolation_scale,
|
98 |
-
# temporal_interpolation_scale=temporal_interpolation_scale,
|
99 |
-
# use_positional_embeddings=not use_rotary_positional_embeddings,
|
100 |
-
# use_learned_positional_embeddings=use_learned_positional_embeddings,
|
101 |
-
# )
|
102 |
self.patch_embed = CogVideoXPatchEmbed(
|
103 |
patch_size=patch_size,
|
104 |
in_channels=patch_embed_in_channels,
|
@@ -110,8 +95,8 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
110 |
temporal_compression_ratio=temporal_compression_ratio,
|
111 |
spatial_interpolation_scale=spatial_interpolation_scale,
|
112 |
temporal_interpolation_scale=temporal_interpolation_scale,
|
113 |
-
use_positional_embeddings=
|
114 |
-
use_learned_positional_embeddings=
|
115 |
)
|
116 |
self.embedding_dropout = nn.Dropout(dropout)
|
117 |
|
|
|
54 |
super().__init__()
|
55 |
inner_dim = num_attention_heads * attention_head_dim
|
56 |
|
57 |
+
# if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
|
58 |
+
# raise ValueError(
|
59 |
+
# "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
|
60 |
+
# "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
|
61 |
+
# "issue at https://github.com/huggingface/diffusers/issues."
|
62 |
+
# )
|
63 |
|
64 |
self.vae_channels = vae_channels
|
65 |
start_channels = in_channels * (downscale_coef ** 2)
|
|
|
84 |
else:
|
85 |
patch_embed_in_channels = vae_channels*2
|
86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
self.patch_embed = CogVideoXPatchEmbed(
|
88 |
patch_size=patch_size,
|
89 |
in_channels=patch_embed_in_channels,
|
|
|
95 |
temporal_compression_ratio=temporal_compression_ratio,
|
96 |
spatial_interpolation_scale=spatial_interpolation_scale,
|
97 |
temporal_interpolation_scale=temporal_interpolation_scale,
|
98 |
+
use_positional_embeddings=False,
|
99 |
+
use_learned_positional_embeddings=False,
|
100 |
)
|
101 |
self.embedding_dropout = nn.Dropout(dropout)
|
102 |
|
inference/cli_demo_camera_i2v_pcd.py
CHANGED
@@ -157,8 +157,8 @@ def generate_video(
|
|
157 |
dtype: torch.dtype = torch.bfloat16,
|
158 |
seed: int = 42,
|
159 |
num_frames: int = 49,
|
160 |
-
height: int = 480,
|
161 |
-
width: int = 720,
|
162 |
start_camera_idx: int = 0,
|
163 |
end_camera_idx: int = 1,
|
164 |
controlnet_transformer_num_attn_heads: int = None,
|
@@ -212,6 +212,7 @@ def generate_video(
|
|
212 |
scheduler = CogVideoXDDIMScheduler.from_pretrained(
|
213 |
base_model_path, subfolder="scheduler"
|
214 |
)
|
|
|
215 |
# ControlNet
|
216 |
num_attention_heads_orig = 48 if "5b" in base_model_path.lower() else 30
|
217 |
controlnet_kwargs = {}
|
@@ -252,7 +253,7 @@ def generate_video(
|
|
252 |
controlnet=controlnet,
|
253 |
scheduler=scheduler,
|
254 |
).to('cuda')
|
255 |
-
|
256 |
# If you're using with lora, add this code
|
257 |
if lora_path:
|
258 |
pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1")
|
|
|
157 |
dtype: torch.dtype = torch.bfloat16,
|
158 |
seed: int = 42,
|
159 |
num_frames: int = 49,
|
160 |
+
height: int = 480, # 768
|
161 |
+
width: int = 720, # 1360
|
162 |
start_camera_idx: int = 0,
|
163 |
end_camera_idx: int = 1,
|
164 |
controlnet_transformer_num_attn_heads: int = None,
|
|
|
212 |
scheduler = CogVideoXDDIMScheduler.from_pretrained(
|
213 |
base_model_path, subfolder="scheduler"
|
214 |
)
|
215 |
+
|
216 |
# ControlNet
|
217 |
num_attention_heads_orig = 48 if "5b" in base_model_path.lower() else 30
|
218 |
controlnet_kwargs = {}
|
|
|
253 |
controlnet=controlnet,
|
254 |
scheduler=scheduler,
|
255 |
).to('cuda')
|
256 |
+
|
257 |
# If you're using with lora, add this code
|
258 |
if lora_path:
|
259 |
pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1")
|
requirements.txt
CHANGED
@@ -8,6 +8,9 @@ opencv-python>=4.10.0.84
|
|
8 |
imageio>=2.35.1
|
9 |
imageio-ffmpeg>=0.5.1
|
10 |
sentencepiece>=0.2.0
|
|
|
|
|
|
|
11 |
einops
|
12 |
decord
|
13 |
protobuf
|
|
|
8 |
imageio>=2.35.1
|
9 |
imageio-ffmpeg>=0.5.1
|
10 |
sentencepiece>=0.2.0
|
11 |
+
SwissArmyTransformer>=0.4.12
|
12 |
+
openai>=1.54.0
|
13 |
+
pydantic>=2.10.3
|
14 |
einops
|
15 |
decord
|
16 |
protobuf
|