Muhammad Taqi Raza committed
Commit 509e4d7 · Parent(s): 208621f
add files
controlnet_pipeline.py
CHANGED
@@ -167,7 +167,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
         self.vae_scaling_factor_image = (
-            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7  # 1.15258426
         )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
@@ -364,6 +364,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
         image_latents = torch.cat([first_frame, image_latents], dim=1)
 
         if latents is None:
+            print("Latent is known")
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         else:
             latents = latents.to(device)
@@ -588,8 +589,8 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial  # 720
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial  # 480
         num_frames = num_frames or self.transformer.config.sample_frames
 
         num_videos_per_prompt = 1
@@ -656,7 +657,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
             device, dtype=prompt_embeds.dtype
         )
 
-        latent_channels = self.transformer.config.in_channels // 2
+        latent_channels = self.transformer.config.in_channels // 2  # 8
         latents, image_latents = self.prepare_latents(
             image,
             batch_size * num_videos_per_prompt,
@@ -719,7 +720,9 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
                 prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype)
 
                 controlnet_states = None
-
+                # latent_model_input
+                # input_controlnet_states
+
                 input_controlnet_states = anchor_states
                 if (controlnet_guidance_start <= current_sampling_percent <= controlnet_guidance_end):
                     controlnet_states = self.controlnet(
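The @@ -167 hunk just records the loaded VAE's actual scaling factor (1.15258426) next to the 0.7 fallback. As a minimal, self-contained sketch of the convention that factor serves in diffusers-style pipelines (assumed from the general diffusers pattern, not code taken from this file):

import torch

# Assumed convention: latents produced by vae.encode(...) are multiplied by
# vae.config.scaling_factor before denoising and divided by it again before
# vae.decode(...). 1.15258426 is the value the commit notes for this VAE.
SCALING_FACTOR = 1.15258426

def to_denoising_space(encoded_latents: torch.Tensor) -> torch.Tensor:
    return encoded_latents * SCALING_FACTOR

def to_decoder_space(denoised_latents: torch.Tensor) -> torch.Tensor:
    return denoised_latents / SCALING_FACTOR

latents = torch.randn(1, 13, 16, 60, 90)  # placeholder latent tensor
assert torch.allclose(to_decoder_space(to_denoising_space(latents)), latents)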
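The @@ -364 hunk adds a debug print on the branch where no latents were passed in, i.e. where fresh noise is drawn. A small, runnable sketch of the randn_tensor call used there; the shape and seed are hypothetical, not values read from this pipeline:

import torch
from diffusers.utils.torch_utils import randn_tensor

shape = (1, 13, 16, 60, 90)  # hypothetical (batch, frames, channels, height, width) latent shape
generator = torch.Generator("cpu").manual_seed(42)

# randn_tensor draws the noise on the generator's device and then moves/casts it,
# so a fixed seed yields the same starting latents across runs.
latents = randn_tensor(shape, generator=generator, device=torch.device("cpu"), dtype=torch.float32)
print(latents.shape, latents.dtype)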
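The @@ -656 hunk annotates latent_channels = in_channels // 2. A short sketch of why the transformer's input width is twice the VAE latent width, assuming this pipeline follows the upstream CogVideoX image-to-video design of concatenating image latents with the noisy video latents along the channel axis (shapes below are placeholders):

import torch

latent_channels = 16                                          # placeholder VAE latent width
latents = torch.randn(1, 13, latent_channels, 60, 90)         # noisy video latents (B, F, C, H, W)
image_latents = torch.randn(1, 13, latent_channels, 60, 90)   # conditioning image latents

latent_model_input = torch.cat([latents, image_latents], dim=2)
# transformer.config.in_channels must equal 2 * latent_channels for this concatenation to line up
assert latent_model_input.shape[2] == 2 * latent_channels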
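The @@ -719 hunk only adds placeholder comments, but the surrounding condition is the interesting part: the controlnet is evaluated only while the denoising progress lies inside [controlnet_guidance_start, controlnet_guidance_end]. A stand-alone sketch of that gating, assuming current_sampling_percent is tracked as step index over total steps (the window values are hypothetical):

num_inference_steps = 50
controlnet_guidance_start, controlnet_guidance_end = 0.0, 0.4  # hypothetical window

steps_with_controlnet = []
for i in range(num_inference_steps):
    current_sampling_percent = i / num_inference_steps
    if controlnet_guidance_start <= current_sampling_percent <= controlnet_guidance_end:
        steps_with_controlnet.append(i)  # the real pipeline calls self.controlnet(...) here

print(f"controlnet active on {len(steps_with_controlnet)} of {num_inference_steps} steps")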
inference/cli_demo_camera_i2v_pcd.py
CHANGED
@@ -346,6 +346,8 @@ def generate_video(
         controlnet_weights=controlnet_weights,
         controlnet_guidance_start=controlnet_guidance_start,
         controlnet_guidance_end=controlnet_guidance_end,
+        height=height,  # Height of the generated video
+        width=width,  # Width of the generated video
     ).frames
     video_generate = video_generate_all[0]
 
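A hedged usage sketch of the extended call inside generate_video(): only the keyword arguments visible in the hunk above are taken from the script; the pipeline object name and the remaining arguments are placeholders:

video_generate_all = pipe(            # `pipe` is a placeholder for the pipeline built earlier in the script
    prompt=prompt,                    # placeholder
    image=image,                      # placeholder
    controlnet_weights=controlnet_weights,
    controlnet_guidance_start=controlnet_guidance_start,
    controlnet_guidance_end=controlnet_guidance_end,
    height=height,  # Height of the generated video, now passed through explicitly
    width=width,    # Width of the generated video
).frames
video_generate = video_generate_all[0]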