Muhammad Taqi Raza
committed on
Commit
·
3c780d6
1
Parent(s):
cacc58a
adding CogVideoX1.5-5B-I2V
Browse files
cogvideo_controlnet_pcd.py
CHANGED
@@ -46,7 +46,7 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
46 |
norm_eps: float = 1e-5,
|
47 |
spatial_interpolation_scale: float = 1.875,
|
48 |
temporal_interpolation_scale: float = 1.0,
|
49 |
-
use_rotary_positional_embeddings: bool = False,
|
50 |
use_learned_positional_embeddings: bool = False,
|
51 |
out_proj_dim: int = None,
|
52 |
out_proj_dim_zero_init: bool = False,
|
@@ -95,8 +95,8 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
|
|
95 |
temporal_compression_ratio=temporal_compression_ratio,
|
96 |
spatial_interpolation_scale=spatial_interpolation_scale,
|
97 |
temporal_interpolation_scale=temporal_interpolation_scale,
|
98 |
-
use_positional_embeddings=False,
|
99 |
-
use_learned_positional_embeddings=False,
|
100 |
)
|
101 |
self.embedding_dropout = nn.Dropout(dropout)
|
102 |
|
|
|
46 |
norm_eps: float = 1e-5,
|
47 |
spatial_interpolation_scale: float = 1.875,
|
48 |
temporal_interpolation_scale: float = 1.0,
|
49 |
+
use_rotary_positional_embeddings: bool = True,
|
50 |
use_learned_positional_embeddings: bool = False,
|
51 |
out_proj_dim: int = None,
|
52 |
out_proj_dim_zero_init: bool = False,
|
|
|
95 |
temporal_compression_ratio=temporal_compression_ratio,
|
96 |
spatial_interpolation_scale=spatial_interpolation_scale,
|
97 |
temporal_interpolation_scale=temporal_interpolation_scale,
|
98 |
+
use_positional_embeddings=not use_rotary_positional_embeddings,
|
99 |
+
use_learned_positional_embeddings=use_learned_positional_embeddings,
|
100 |
)
|
101 |
self.embedding_dropout = nn.Dropout(dropout)
|
102 |
|
inference/cli_demo_camera_i2v_pcd.py
CHANGED
@@ -227,6 +227,7 @@ def generate_video(
|
|
227 |
controlnet_kwargs["out_proj_dim"] = num_attention_heads_orig * controlnet_transformer_out_proj_dim_factor
|
228 |
controlnet_kwargs["out_proj_dim_zero_init"] = controlnet_transformer_out_proj_dim_zero_init
|
229 |
|
|
|
230 |
controlnet = CogVideoXControlnetPCD(
|
231 |
num_layers=controlnet_transformer_num_layers,
|
232 |
downscale_coef=downscale_coef,
|
@@ -367,7 +368,6 @@ def generate_video(
|
|
367 |
height=height, # Height of the generated video
|
368 |
width=width, # Width of the generated video
|
369 |
).frames
|
370 |
-
|
371 |
video_generate = video_generate_all[0]
|
372 |
|
373 |
# 6. Export the generated frames to a video file. fps must be 8 for original video.
|
|
|
227 |
controlnet_kwargs["out_proj_dim"] = num_attention_heads_orig * controlnet_transformer_out_proj_dim_factor
|
228 |
controlnet_kwargs["out_proj_dim_zero_init"] = controlnet_transformer_out_proj_dim_zero_init
|
229 |
|
230 |
+
|
231 |
controlnet = CogVideoXControlnetPCD(
|
232 |
num_layers=controlnet_transformer_num_layers,
|
233 |
downscale_coef=downscale_coef,
|
|
|
368 |
height=height, # Height of the generated video
|
369 |
width=width, # Width of the generated video
|
370 |
).frames
|
|
|
371 |
video_generate = video_generate_all[0]
|
372 |
|
373 |
# 6. Export the generated frames to a video file. fps must be 8 for original video.
|
training/controlnet_datasets_camera_pcd_mask.py
CHANGED
@@ -164,6 +164,7 @@ class RealEstate10KPCDRenderCapEmbDataset(RealEstate10KPCDRenderDataset):
|
|
164 |
|
165 |
anchor_pixels = torch.from_numpy(mask_video_reader.get_batch(indices).asnumpy()).permute(0, 3, 1, 2).contiguous()
|
166 |
anchor_pixels = anchor_pixels / 255.
|
|
|
167 |
try:
|
168 |
masks = np.load(os.path.join(self.mask_root, clip_name + '.npz'))['mask']*1.0
|
169 |
masks = torch.from_numpy(masks).unsqueeze(1)
|
|
|
164 |
|
165 |
anchor_pixels = torch.from_numpy(mask_video_reader.get_batch(indices).asnumpy()).permute(0, 3, 1, 2).contiguous()
|
166 |
anchor_pixels = anchor_pixels / 255.
|
167 |
+
|
168 |
try:
|
169 |
masks = np.load(os.path.join(self.mask_root, clip_name + '.npz'))['mask']*1.0
|
170 |
masks = torch.from_numpy(masks).unsqueeze(1)
|