Muhammad Taqi Raza
committed on
Commit 2d59f81 · 1 Parent(s): 21e49a3
passing correct arguments
Browse files
- cogvideo_controlnet_pcd.py +1 -1
- controlnet_pipeline.py +1 -1
- gradio_app.py +1 -1
- inference/cli_demo_camera_i2v_pcd.py +8 -8
- preprocess/get_vae_latent.py +2 -2
- scripts/inference.sh +1 -1
- scripts/train.sh +1 -1
- scripts/train_with_latent.sh +1 -1
- training/controlnet_datasets_camera_pcd_mask.py +2 -2
- training/train_controlnet_i2v_pcd_render_mask_aware_add_dash.py +2 -2
- training/train_controlnet_i2v_pcd_render_mask_aware_add_dash_use_latent.py +2 -2
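Every hunk in this commit fills in the same missing default: 49 frames. For context, 49 is the standard clip length for CogVideoX-style models because it fits the causal VAE's temporal_compression_ratio of 4 (visible in the first hunk below). A minimal sketch of that arithmetic, assuming the usual `(frames - 1) / ratio + 1` latent-frame formula:

```python
# Sketch: why 49 is a consistent frame default here. Assumes the standard
# CogVideoX causal-VAE formula: latent_frames = (frames - 1) // ratio + 1.
def latent_frames(num_frames: int, temporal_compression_ratio: int = 4) -> int:
    return (num_frames - 1) // temporal_compression_ratio + 1

assert (49 - 1) % 4 == 0        # valid: num_frames must be of the form 4k + 1
assert latent_frames(49) == 13  # 49 pixel frames -> 13 latent frames
```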
cogvideo_controlnet_pcd.py
CHANGED
@@ -36,7 +36,7 @@ class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin):
         attention_bias: bool = True,
         sample_width: int = 90,
         sample_height: int = 60,
-        sample_frames: int =
+        sample_frames: int = 49,
         patch_size: int = 2,
         temporal_compression_ratio: int = 4,
         max_text_seq_length: int = 226,
controlnet_pipeline.py
CHANGED
@@ -562,7 +562,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
         negative_prompt: Optional[Union[str, List[str]]] = None,
         height: Optional[int] = None,
         width: Optional[int] = None,
-        num_frames: int =
+        num_frames: int = 49,
         num_inference_steps: int = 50,
         timesteps: Optional[List[int]] = None,
         guidance_scale: float = 6,
gradio_app.py
CHANGED
@@ -193,7 +193,7 @@ with demo:
         seed_input2 = gr.Number(value=42, label="Seed")
         height_input = gr.Number(value=480, label="Height")
         width_input = gr.Number(value=720, label="Width")
-        num_frames_input2 = gr.Number(value=
+        num_frames_input2 = gr.Number(value=49, label="Num Frames")
         fps_input2 = gr.Number(value=8, label="FPS")
         downscale_coef_input = gr.Number(value=8, label="Downscale Coef")
         vae_channels_input = gr.Number(value=16, label="VAE Channels")
inference/cli_demo_camera_i2v_pcd.py
CHANGED
@@ -145,7 +145,7 @@ def generate_video(
     num_videos_per_prompt: int = 1,
     dtype: torch.dtype = torch.bfloat16,
     seed: int = 42,
-    num_frames: int =
+    num_frames: int = 49,
     height: int = 480,
     width: int = 720,
     start_camera_idx: int = 0,
@@ -382,7 +382,7 @@ if __name__ == "__main__":
     )
     parser.add_argument("--controlnet_weights", type=float, default=0.5, help="Strength of controlnet")
     parser.add_argument("--use_zero_conv", action="store_true", default=False, help="Use zero conv")
-    parser.add_argument("--infer_with_mask", action="store_true", default=
+    parser.add_argument("--infer_with_mask", action="store_true", default=True, help="add mask to controlnet")
     parser.add_argument("--pool_style", default='max', help="max pool or avg pool")
     parser.add_argument("--controlnet_guidance_start", type=float, default=0.0, help="The stage when the controlnet starts to be applied")
     parser.add_argument("--controlnet_guidance_end", type=float, default=0.5, help="The stage when the controlnet stops being applied")
@@ -403,17 +403,17 @@ if __name__ == "__main__":
     parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility")
     parser.add_argument("--height", type=int, default=480)
     parser.add_argument("--width", type=int, default=720)
-    parser.add_argument("--num_frames", type=int, default=
+    parser.add_argument("--num_frames", type=int, default=49)
     parser.add_argument("--start_camera_idx", type=int, default=0)
     parser.add_argument("--end_camera_idx", type=int, default=1)
-    parser.add_argument("--controlnet_transformer_num_attn_heads", type=int, default=
-    parser.add_argument("--controlnet_transformer_attention_head_dim", type=int, default=
-    parser.add_argument("--controlnet_transformer_out_proj_dim_factor", type=int, default=
-    parser.add_argument("--controlnet_transformer_out_proj_dim_zero_init", action="store_true", default=
+    parser.add_argument("--controlnet_transformer_num_attn_heads", type=int, default=4)
+    parser.add_argument("--controlnet_transformer_attention_head_dim", type=int, default=64)
+    parser.add_argument("--controlnet_transformer_out_proj_dim_factor", type=int, default=64)
+    parser.add_argument("--controlnet_transformer_out_proj_dim_zero_init", action="store_true", default=True, help=("Init project zero."),
     )
     parser.add_argument("--downscale_coef", type=int, default=8)
     parser.add_argument("--vae_channels", type=int, default=16)
-    parser.add_argument("--controlnet_input_channels", type=int, default=
+    parser.add_argument("--controlnet_input_channels", type=int, default=3)
     parser.add_argument("--controlnet_transformer_num_layers", type=int, default=8)
     parser.add_argument("--enable_model_cpu_offload", action="store_true", default=False, help="Enable model CPU offload")
    parser.add_argument("--fps", type=int, default=8, help="Frames per second for the output video")
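One caveat worth flagging (an editorial note, not part of the diff): pairing `action="store_true"` with `default=True`, as `--infer_with_mask` and `--controlnet_transformer_out_proj_dim_zero_init` now do, makes the flag effectively always True, since passing the flag can only set it to True. A minimal demonstration:

```python
import argparse

# A store_true flag with default=True cannot be disabled from the command
# line; the flag name below mirrors the one introduced in the diff above.
p = argparse.ArgumentParser()
p.add_argument("--infer_with_mask", action="store_true", default=True)
assert p.parse_args([]).infer_with_mask is True                     # omitted: True
assert p.parse_args(["--infer_with_mask"]).infer_with_mask is True  # passed: still True
```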
preprocess/get_vae_latent.py
CHANGED
@@ -88,12 +88,12 @@ def main(args):
 
     try:
         vr = VideoReader(source_video_file)
-        video = torch.from_numpy(vr.get_batch(np.arange(
+        video = torch.from_numpy(vr.get_batch(np.arange(49)).asnumpy()).permute(0, 3, 1, 2).contiguous()
         video = (video / 255.0) * 2 - 1
         source_latent = encode_video(video, vae)
 
         vr = VideoReader(masked_video_file)
-        video = torch.from_numpy(vr.get_batch(np.arange(
+        video = torch.from_numpy(vr.get_batch(np.arange(49)).asnumpy()).permute(0, 3, 1, 2).contiguous()
         video = (video / 255.0) * 2 - 1
         video = add_dashed_rays_to_video(video)
         masked_latent = encode_video(video, vae)
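Note that `np.arange(49)` assumes every input video has at least 49 frames; decord raises on out-of-range indices otherwise, which the surrounding `try` block will swallow. A hypothetical guard that fails loudly instead (`read_first_n_frames` is illustrative, not part of the repo):

```python
import numpy as np
from decord import VideoReader

# Hypothetical helper: reject clips shorter than the required frame count
# up front, rather than letting decord raise inside the batch fetch.
def read_first_n_frames(path: str, n: int = 49) -> np.ndarray:
    vr = VideoReader(path)
    if len(vr) < n:
        raise ValueError(f"{path}: has {len(vr)} frames, need {n}")
    return vr.get_batch(np.arange(n)).asnumpy()  # (n, H, W, C), uint8
```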
scripts/inference.sh
CHANGED
@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES=0 python inference/cli_demo_camera_i2v_pcd.py \
     --controlnet_transformer_out_proj_dim_factor 64 \
     --controlnet_transformer_out_proj_dim_zero_init \
     --vae_channels 16 \
-    --num_frames
+    --num_frames 49 \
     --controlnet_transformer_num_layers 8 \
     --infer_with_mask \
     --pool_style 'max' \
scripts/train.sh
CHANGED
@@ -23,7 +23,7 @@ CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" accelerate launch --config_file accelerat
     --height 480 \
     --width 720 \
     --fps 8 \
-    --max_num_frames
+    --max_num_frames 49 \
     --video_root_dir $video_root_dir \
     --hflip_p 0.0 \
     --controlnet_transformer_num_layers 8 \
scripts/train_with_latent.sh
CHANGED
@@ -23,7 +23,7 @@ CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" accelerate launch --config_file accelerat
     --height 480 \
     --width 720 \
     --fps 8 \
-    --max_num_frames
+    --max_num_frames 49 \
     --video_root_dir $video_root_dir \
     --hflip_p 0.0 \
     --controlnet_transformer_num_layers 8 \
training/controlnet_datasets_camera_pcd_mask.py
CHANGED
@@ -39,7 +39,7 @@ class RealEstate10KPCDRenderDataset(Dataset):
     def __init__(
         self,
         video_root_dir,
-        sample_n_frames=
+        sample_n_frames=49,
         image_size=[480, 720],
         shuffle_frames=False,
         hflip_p=0.0,
@@ -135,7 +135,7 @@ class RealEstate10KPCDRenderCapEmbDataset(RealEstate10KPCDRenderDataset):
         self,
         video_root_dir,
         text_embedding_path,
-        sample_n_frames=
+        sample_n_frames=49,
         image_size=[480, 720],
         shuffle_frames=False,
         hflip_p=0.0,
training/train_controlnet_i2v_pcd_render_mask_aware_add_dash.py
CHANGED
@@ -255,7 +255,7 @@ def get_args():
     )
     parser.add_argument("--fps", type=int, default=8, help="All input videos will be used at this FPS.")
     parser.add_argument(
-        "--max_num_frames", type=int, default=
+        "--max_num_frames", type=int, default=49, help="All input videos will be truncated to these many frames."
     )
     parser.add_argument(
         "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
@@ -479,7 +479,7 @@ def get_args():
     return parser.parse_args()
 
 
-def read_video(video_path, start_index=0, frames_count=
+def read_video(video_path, start_index=0, frames_count=49, stride=1):
     video_reader = VideoReader(video_path)
     end_index = min(start_index + frames_count * stride, len(video_reader)) - 1
     batch_index = np.linspace(start_index, end_index, frames_count, dtype=int)
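For readers tracing `read_video` above: `np.linspace` spreads `frames_count` indices evenly between `start_index` and `end_index`, so `stride` caps the sampled span rather than forcing a fixed step. A small worked example with assumed values (`video_len = 60`):

```python
import numpy as np

# Worked example of the index computation in read_video (values assumed).
start_index, frames_count, stride, video_len = 0, 49, 1, 60
end_index = min(start_index + frames_count * stride, video_len) - 1  # -> 48
batch_index = np.linspace(start_index, end_index, frames_count, dtype=int)
assert len(batch_index) == 49 and batch_index[0] == 0 and batch_index[-1] == 48
```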
training/train_controlnet_i2v_pcd_render_mask_aware_add_dash_use_latent.py
CHANGED
@@ -255,7 +255,7 @@ def get_args():
     )
     parser.add_argument("--fps", type=int, default=8, help="All input videos will be used at this FPS.")
     parser.add_argument(
-        "--max_num_frames", type=int, default=
+        "--max_num_frames", type=int, default=49, help="All input videos will be truncated to these many frames."
     )
     parser.add_argument(
         "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader."
@@ -479,7 +479,7 @@ def get_args():
     return parser.parse_args()
 
 
-def read_video(video_path, start_index=0, frames_count=
+def read_video(video_path, start_index=0, frames_count=49, stride=1):
     video_reader = VideoReader(video_path)
     end_index = min(start_index + frames_count * stride, len(video_reader)) - 1
     batch_index = np.linspace(start_index, end_index, frames_count, dtype=int)