Muhammad Taqi Raza committed on
Commit 3c780d6 · 1 Parent(s): cacc58a

adding CogVideox1.5-5B-I2V

cogvideo_controlnet_pcd.py CHANGED
@@ -46,7 +46,7 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
         norm_eps: float = 1e-5,
         spatial_interpolation_scale: float = 1.875,
         temporal_interpolation_scale: float = 1.0,
-        use_rotary_positional_embeddings: bool = False,
+        use_rotary_positional_embeddings: bool = True,
         use_learned_positional_embeddings: bool = False,
         out_proj_dim: int = None,
         out_proj_dim_zero_init: bool = False,
@@ -95,8 +95,8 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
             temporal_compression_ratio=temporal_compression_ratio,
             spatial_interpolation_scale=spatial_interpolation_scale,
             temporal_interpolation_scale=temporal_interpolation_scale,
-            use_positional_embeddings=False,
-            use_learned_positional_embeddings=False,
+            use_positional_embeddings=not use_rotary_positional_embeddings,
+            use_learned_positional_embeddings=use_learned_positional_embeddings,
         )
         self.embedding_dropout = nn.Dropout(dropout)
 
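The substantive change here switches the controlnet's default to rotary positional embeddings (RoPE), which CogVideoX1.5 uses in place of the absolute embeddings of earlier checkpoints, and forwards both constructor flags to the patch embedding instead of hardcoding them to False. A minimal sketch of that propagation, assuming a heavily simplified constructor (SimplePatchEmbed is a hypothetical stand-in for the real patch-embed module; the real CogVideoXControlnetPCD takes many more arguments):

import torch.nn as nn

class SimplePatchEmbed(nn.Module):
    # Hypothetical stand-in: only records which positional scheme is active.
    def __init__(self, use_positional_embeddings, use_learned_positional_embeddings):
        super().__init__()
        # Absolute (sinusoidal or learned) embeddings are only meaningful when
        # RoPE is off; with RoPE, rotations are applied inside attention instead.
        self.use_positional_embeddings = use_positional_embeddings
        self.use_learned_positional_embeddings = use_learned_positional_embeddings

class SimpleControlnet(nn.Module):
    def __init__(
        self,
        use_rotary_positional_embeddings: bool = True,   # new default, per the diff
        use_learned_positional_embeddings: bool = False,
        dropout: float = 0.0,
    ):
        super().__init__()
        # Before this commit both flags below were hardcoded to False; now they
        # track the constructor arguments, so RoPE (CogVideoX1.5) and absolute
        # embeddings (older CogVideoX) share one code path.
        self.patch_embed = SimplePatchEmbed(
            use_positional_embeddings=not use_rotary_positional_embeddings,
            use_learned_positional_embeddings=use_learned_positional_embeddings,
        )
        self.embedding_dropout = nn.Dropout(dropout)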
inference/cli_demo_camera_i2v_pcd.py CHANGED
@@ -227,6 +227,7 @@ def generate_video(
         controlnet_kwargs["out_proj_dim"] = num_attention_heads_orig * controlnet_transformer_out_proj_dim_factor
         controlnet_kwargs["out_proj_dim_zero_init"] = controlnet_transformer_out_proj_dim_zero_init

+
     controlnet = CogVideoXControlnetPCD(
         num_layers=controlnet_transformer_num_layers,
         downscale_coef=downscale_coef,
@@ -367,7 +368,6 @@ def generate_video(
         height=height,  # Height of the generated video
         width=width,  # Width of the generated video
     ).frames
-
     video_generate = video_generate_all[0]

     # 6. Export the generated frames to a video file. fps must be 8 for original video.
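These two hunks are whitespace-only, but they bracket two steps worth noting: the out_proj_dim bookkeeping before the controlnet is built, and the export of the first generated clip. A condensed sketch of those steps, assuming diffusers' export_to_video and random frames standing in for the pipeline output (the argument values are hypothetical; the real script reads them from the CLI):

import numpy as np
from diffusers.utils import export_to_video

# Hypothetical stand-ins for the CLI arguments parsed by the real script.
num_attention_heads_orig = 48
controlnet_transformer_out_proj_dim_factor = 64

controlnet_kwargs = {
    # The controlnet's output projection is sized from the backbone's
    # attention heads so its hidden states map back into the transformer.
    "out_proj_dim": num_attention_heads_orig * controlnet_transformer_out_proj_dim_factor,
    "out_proj_dim_zero_init": True,  # zero-init keeps the controlnet a no-op initially
}

# Fake 9-frame clip in place of video_generate_all[0]; the real frames come
# from the pipeline call shown in the diff (`...).frames`).
video_generate = [np.random.rand(480, 720, 3).astype(np.float32) for _ in range(9)]
export_to_video(video_generate, "output.mp4", fps=8)  # fps must be 8 for the original video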
training/controlnet_datasets_camera_pcd_mask.py CHANGED
@@ -164,6 +164,7 @@ class RealEstate10KPCDRenderCapEmbDataset(RealEstate10KPCDRenderDataset):

         anchor_pixels = torch.from_numpy(mask_video_reader.get_batch(indices).asnumpy()).permute(0, 3, 1, 2).contiguous()
         anchor_pixels = anchor_pixels / 255.
+
         try:
             masks = np.load(os.path.join(self.mask_root, clip_name + '.npz'))['mask']*1.0
             masks = torch.from_numpy(masks).unsqueeze(1)
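The dataset hunk likewise only adds a blank line, but the code around it shows the clip-loading pattern: a decord batch read, an NHWC-to-NCHW permute, scaling to [0, 1], and a per-clip mask lookup from an .npz archive. A standalone sketch of that pattern, assuming decord is installed and masks were saved with np.savez(path, mask=...); the paths, frame indices, and fallback are hypothetical:

import os
import numpy as np
import torch
from decord import VideoReader

video_path, mask_root, clip_name = "clip.mp4", "masks", "clip"  # hypothetical paths
indices = list(range(0, 49, 4))  # hypothetical frame sampling

# decord returns (T, H, W, C) uint8 frames; permute to (T, C, H, W) and
# scale to [0, 1] to match the tensors the training loop expects.
mask_video_reader = VideoReader(video_path)
anchor_pixels = torch.from_numpy(
    mask_video_reader.get_batch(indices).asnumpy()
).permute(0, 3, 1, 2).contiguous()
anchor_pixels = anchor_pixels / 255.

try:
    # Per-clip masks stored under key 'mask'; *1.0 casts bools to float
    # before the tensor conversion, and unsqueeze adds a channel dim.
    masks = np.load(os.path.join(mask_root, clip_name + ".npz"))["mask"] * 1.0
    masks = torch.from_numpy(masks).unsqueeze(1)
except FileNotFoundError:
    masks = torch.ones_like(anchor_pixels[:, :1])  # hypothetical all-ones fallback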