Commit 2d59f81 by Muhammad Taqi Raza
Parent: 21e49a3

passing correct arguments

cogvideo_controlnet_pcd.py CHANGED
@@ -36,7 +36,7 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
         attention_bias: bool = True,
         sample_width: int = 90,
         sample_height: int = 60,
-        sample_frames: int = 97,
+        sample_frames: int = 49,
         patch_size: int = 2,
         temporal_compression_ratio: int = 4,
         max_text_seq_length: int = 226,
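Note: the 97 to 49 change is consistent with the temporal_compression_ratio of 4 shown above. A minimal sketch of the usual CogVideoX-style frame accounting, assuming latent_frames = (num_frames - 1) // ratio + 1 (this formula is an assumption, not part of the commit):

# Sketch only: assumes the standard CogVideoX mapping from RGB frames to VAE latent frames.
def latent_frame_count(num_frames: int, temporal_compression_ratio: int = 4) -> int:
    return (num_frames - 1) // temporal_compression_ratio + 1

print(latent_frame_count(49))  # 13 latent frames
print(latent_frame_count(97))  # 25 latent frames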
controlnet_pipeline.py CHANGED
@@ -562,7 +562,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
         negative_prompt: Optional[Union[str, List[str]]] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
-        num_frames: int = 97,
+        num_frames: int = 49,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         guidance_scale: float = 6,
gradio_app.py CHANGED
@@ -193,7 +193,7 @@ with demo:
         seed_input2 = gr.Number(value=42, label="Seed")
         height_input = gr.Number(value=480, label="Height")
         width_input = gr.Number(value=720, label="Width")
-        num_frames_input2 = gr.Number(value=97, label="Num Frames")
+        num_frames_input2 = gr.Number(value=49, label="Num Frames")
         fps_input2 = gr.Number(value=8, label="FPS")
         downscale_coef_input = gr.Number(value=8, label="Downscale Coef")
         vae_channels_input = gr.Number(value=16, label="VAE Channels")
inference/cli_demo_camera_i2v_pcd.py CHANGED
@@ -145,7 +145,7 @@ def generate_video(
     num_videos_per_prompt: int = 1,
     dtype: torch.dtype = torch.bfloat16,
     seed: int = 42,
-    num_frames: int = 97,
+    num_frames: int = 49,
     height: int = 480,
     width: int = 720,
     start_camera_idx: int = 0,
@@ -382,7 +382,7 @@ if __name__ == "__main__":
     )
     parser.add_argument("--controlnet_weights", type=float, default=0.5, help="Strenght of controlnet")
     parser.add_argument("--use_zero_conv", action="store_true", default=False, help="Use zero conv")
-    parser.add_argument("--infer_with_mask", action="store_true", default=False, help="add mask to controlnet")
+    parser.add_argument("--infer_with_mask", action="store_true", default=True, help="add mask to controlnet")
     parser.add_argument("--pool_style", default='max', help="max pool or avg pool")
     parser.add_argument("--controlnet_guidance_start", type=float, default=0.0, help="The stage when the controlnet starts to be applied")
     parser.add_argument("--controlnet_guidance_end", type=float, default=0.5, help="The stage when the controlnet end to be applied")
@@ -403,17 +403,17 @@ if __name__ == "__main__":
     parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility")
     parser.add_argument("--height", type=int, default=480)
     parser.add_argument("--width", type=int, default=720)
-    parser.add_argument("--num_frames", type=int, default=97)
+    parser.add_argument("--num_frames", type=int, default=49)
     parser.add_argument("--start_camera_idx", type=int, default=0)
     parser.add_argument("--end_camera_idx", type=int, default=1)
-    parser.add_argument("--controlnet_transformer_num_attn_heads", type=int, default=None)
-    parser.add_argument("--controlnet_transformer_attention_head_dim", type=int, default=None)
-    parser.add_argument("--controlnet_transformer_out_proj_dim_factor", type=int, default=None)
-    parser.add_argument("--controlnet_transformer_out_proj_dim_zero_init", action="store_true", default=False, help=("Init project zero."),
+    parser.add_argument("--controlnet_transformer_num_attn_heads", type=int, default=4)
+    parser.add_argument("--controlnet_transformer_attention_head_dim", type=int, default=64)
+    parser.add_argument("--controlnet_transformer_out_proj_dim_factor", type=int, default=64)
+    parser.add_argument("--controlnet_transformer_out_proj_dim_zero_init", action="store_true", default=True, help=("Init project zero."),
     )
     parser.add_argument("--downscale_coef", type=int, default=8)
     parser.add_argument("--vae_channels", type=int, default=16)
-    parser.add_argument("--controlnet_input_channels", type=int, default=6)
+    parser.add_argument("--controlnet_input_channels", type=int, default=3)
     parser.add_argument("--controlnet_transformer_num_layers", type=int, default=8)
     parser.add_argument("--enable_model_cpu_offload", action="store_true", default=False, help="Enable model CPU offload")
     parser.add_argument("--fps", type=int, default=8, help="Frames per second for the output video")
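Note on the new CLI defaults: because --infer_with_mask and --controlnet_transformer_out_proj_dim_zero_init keep action="store_true" while their defaults become True, these flags can no longer be turned off from the command line. A standalone sketch (hypothetical flag name, not this repository's parser) showing the behaviour:

# Standalone illustration with a hypothetical flag: a store_true argument whose default
# is already True stays True whether or not the flag is passed.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--zero_init", action="store_true", default=True)

print(parser.parse_args([]).zero_init)                # True (flag omitted)
print(parser.parse_args(["--zero_init"]).zero_init)   # True (flag passed)
# Exposing a False option would need something like argparse.BooleanOptionalAction.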
preprocess/get_vae_latent.py CHANGED
@@ -88,12 +88,12 @@ def main(args):
 
     try:
         vr = VideoReader(source_video_file)
-        video = torch.from_numpy(vr.get_batch(np.arange(97)).asnumpy()).permute(0, 3, 1, 2).contiguous()
+        video = torch.from_numpy(vr.get_batch(np.arange(49)).asnumpy()).permute(0, 3, 1, 2).contiguous()
         video = (video / 255.0) * 2 - 1
         source_latent = encode_video(video, vae)
 
         vr = VideoReader(masked_video_file)
-        video = torch.from_numpy(vr.get_batch(np.arange(97)).asnumpy()).permute(0, 3, 1, 2).contiguous()
+        video = torch.from_numpy(vr.get_batch(np.arange(49)).asnumpy()).permute(0, 3, 1, 2).contiguous()
         video = (video / 255.0) * 2 - 1
         video = add_dashed_rays_to_video(video)
         masked_latent = encode_video(video, vae)
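The latent script now hard-codes np.arange(49), and decord's get_batch fails on clips with fewer than 49 frames. A defensive variant of the same read (a sketch with a hypothetical helper name, not the repository's implementation) that clamps to the clip length:

# Sketch: guarded version of the frame read above; read_fixed_frames is a hypothetical helper.
import numpy as np
import torch
from decord import VideoReader

def read_fixed_frames(path, target_frames=49):
    vr = VideoReader(path)
    count = min(len(vr), target_frames)  # avoid indexing past the end of short clips
    video = torch.from_numpy(vr.get_batch(np.arange(count)).asnumpy()).permute(0, 3, 1, 2).contiguous()
    return (video / 255.0) * 2 - 1       # scale to [-1, 1], matching the script above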
scripts/inference.sh CHANGED
@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=0 python inference/cli_demo_camera_i2v_pcd.py \
     --controlnet_transformer_out_proj_dim_factor 64 \
     --controlnet_transformer_out_proj_dim_zero_init \
     --vae_channels 16 \
-    --num_frames 97 \
+    --num_frames 49 \
     --controlnet_transformer_num_layers 8 \
     --infer_with_mask \
     --pool_style 'max' \
scripts/train.sh CHANGED
@@ -23,7 +23,7 @@ CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" accelerate launch --config_file accelerat
     --height 480 \
     --width 720 \
     --fps 8 \
-    --max_num_frames 97 \
+    --max_num_frames 49 \
     --video_root_dir $video_root_dir \
     --hflip_p 0.0 \
     --controlnet_transformer_num_layers 8 \
scripts/train_with_latent.sh CHANGED
@@ -23,7 +23,7 @@ CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" accelerate launch --config_file accelerat
     --height 480 \
     --width 720 \
     --fps 8 \
-    --max_num_frames 97 \
+    --max_num_frames 49 \
     --video_root_dir $video_root_dir \
     --hflip_p 0.0 \
     --controlnet_transformer_num_layers 8 \
training/controlnet_datasets_camera_pcd_mask.py CHANGED
@@ -39,7 +39,7 @@ class RealEstate10KPCDRenderDataset(Dataset):
     def __init__(
         self,
         video_root_dir,
-        sample_n_frames=97,
+        sample_n_frames=49,
         image_size=[480, 720],
         shuffle_frames=False,
         hflip_p=0.0,
@@ -135,7 +135,7 @@ class RealEstate10KPCDRenderCapEmbDataset(RealEstate10KPCDRenderDataset):
         self,
         video_root_dir,
         text_embedding_path,
-        sample_n_frames=97,
+        sample_n_frames=49,
         image_size=[480, 720],
         shuffle_frames=False,
         hflip_p=0.0,
training/train_controlnet_i2v_pcd_render_mask_aware_add_dash.py CHANGED
@@ -255,7 +255,7 @@ def get_args():
     )
     parser.add_argument("--fps", type=int, default=8, help="All input videos will be used at this FPS.")
     parser.add_argument(
-        "--max_num_frames", type=int, default=97, help="All input videos will be truncated to these many frames."
+        "--max_num_frames", type=int, default=49, help="All input videos will be truncated to these many frames."
     )
     parser.add_argument(
         "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
@@ -479,7 +479,7 @@ def get_args():
     return parser.parse_args()
 
 
-def read_video(video_path, start_index=0, frames_count=97, stride=1):
+def read_video(video_path, start_index=0, frames_count=49, stride=1):
     video_reader = VideoReader(video_path)
     end_index = min(start_index + frames_count * stride, len(video_reader)) - 1
     batch_index = np.linspace(start_index, end_index, frames_count, dtype=int)
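For reference, with the new default frames_count=49 the read_video index math above reduces to the first 49 frames when stride=1 and the clip is long enough; a quick standalone check (the clip length of 200 is an assumed example value):

# Reproduces the batch_index computation from read_video above with assumed inputs.
import numpy as np

start_index, frames_count, stride, video_len = 0, 49, 1, 200
end_index = min(start_index + frames_count * stride, video_len) - 1   # 48
batch_index = np.linspace(start_index, end_index, frames_count, dtype=int)
print(batch_index[0], batch_index[-1], len(batch_index))              # 0 48 49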
training/train_controlnet_i2v_pcd_render_mask_aware_add_dash_use_latent.py CHANGED
@@ -255,7 +255,7 @@ def get_args():
     )
     parser.add_argument("--fps", type=int, default=8, help="All input videos will be used at this FPS.")
     parser.add_argument(
-        "--max_num_frames", type=int, default=97, help="All input videos will be truncated to these many frames."
+        "--max_num_frames", type=int, default=49, help="All input videos will be truncated to these many frames."
     )
     parser.add_argument(
         "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
@@ -479,7 +479,7 @@ def get_args():
     return parser.parse_args()
 
 
-def read_video(video_path, start_index=0, frames_count=97, stride=1):
+def read_video(video_path, start_index=0, frames_count=49, stride=1):
     video_reader = VideoReader(video_path)
     end_index = min(start_index + frames_count * stride, len(video_reader)) - 1
     batch_index = np.linspace(start_index, end_index, frames_count, dtype=int)