Muhammad Taqi Raza committed
Commit 509e4d7 · 1 Parent(s): 208621f
controlnet_pipeline.py CHANGED
@@ -167,7 +167,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
         self.vae_scaling_factor_image = (
-            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7  # 1.15258426
         )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
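Note: the new inline comment records what the fallback normally resolves to. A minimal sketch of the fallback pattern, using hypothetical stand-in classes; the 1.15258426 figure comes from the comment added in this commit (the CogVideoX VAE's configured scaling_factor), while 0.7 is only used when no VAE is loaded:

# Hypothetical stand-ins for the loaded VAE and its config.
class FakeVAEConfig:
    scaling_factor = 1.15258426

class FakeVAE:
    config = FakeVAEConfig()

def resolve_scaling_factor(vae):
    # Same shape as the pipeline expression: prefer the VAE's configured
    # value, fall back to 0.7 only when no VAE is attached.
    return vae.config.scaling_factor if vae is not None else 0.7

print(resolve_scaling_factor(FakeVAE()))  # 1.15258426
print(resolve_scaling_factor(None))       # 0.7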
@@ -364,6 +364,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
         image_latents = torch.cat([first_frame, image_latents], dim=1)
 
         if latents is None:
+            print("No latents provided; sampling random initial noise")
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         else:
             latents = latents.to(device)
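For context, this branch chooses between sampling fresh Gaussian noise and reusing caller-supplied latents. A self-contained sketch of the same logic, using torch.randn directly rather than diffusers' randn_tensor, which additionally handles lists of per-sample generators:

import torch

def init_latents(shape, latents=None, generator=None, device="cpu", dtype=torch.float32):
    # Sample fresh noise only when the caller did not supply latents;
    # otherwise just move the provided tensor to the target device.
    if latents is None:
        latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
    else:
        latents = latents.to(device)
    return latents

noise = init_latents((1, 13, 16, 60, 90), generator=torch.Generator().manual_seed(42))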
@@ -588,8 +589,8 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial  # 720
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial  # 480
         num_frames = num_frames or self.transformer.config.sample_frames
 
         num_videos_per_prompt = 1
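With the stock CogVideoX transformer config (sample_height=60, sample_width=90, vae_scale_factor_spatial=8) these fallbacks evaluate to height 480 and width 720, so the # 720 / # 480 annotations above look transposed unless this checkpoint uses a non-stock config. A worked example under those assumed stock values:

# Assumed stock CogVideoX values; substitute the checkpoint's real config.
sample_height, sample_width, vae_scale_factor_spatial = 60, 90, 8

height = None  # caller passed nothing, so the fallback fires
width = None

height = height or sample_height * vae_scale_factor_spatial  # 60 * 8 = 480
width = width or sample_width * vae_scale_factor_spatial     # 90 * 8 = 720
print(height, width)  # 480 720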
@@ -656,7 +657,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
             device, dtype=prompt_embeds.dtype
         )
 
-        latent_channels = self.transformer.config.in_channels // 2
+        latent_channels = self.transformer.config.in_channels // 2  # 8
         latents, image_latents = self.prepare_latents(
             image,
             batch_size * num_videos_per_prompt,
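The halving reflects how an image-to-video transformer is fed: noisy video latents are concatenated channel-wise with the encoded first-frame latents, so the VAE-side channel count is half of the transformer's in_channels. An illustrative sketch; the shapes are assumptions, and the # 8 annotation implies in_channels == 16 for this checkpoint:

import torch

in_channels = 16                    # assumed transformer.config.in_channels
latent_channels = in_channels // 2  # 8, per the comment added in this commit

latents = torch.randn(1, 13, latent_channels, 60, 90)        # noisy video latents [B, F, C, H, W]
image_latents = torch.randn(1, 13, latent_channels, 60, 90)  # first-frame latents
latent_model_input = torch.cat([latents, image_latents], dim=2)
print(latent_model_input.shape[2])  # 16 == in_channels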
@@ -719,7 +720,9 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
                 prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype)
 
                 controlnet_states = None
-
+                # latent_model_input
+                # input_controlnet_states
+
                 input_controlnet_states = anchor_states
                 if (controlnet_guidance_start <= current_sampling_percent <= controlnet_guidance_end):
                     controlnet_states = self.controlnet(
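The surrounding condition gates the ControlNet to a fractional window of the sampling schedule: controlnet_states stays None outside [controlnet_guidance_start, controlnet_guidance_end]. A minimal sketch of that gate, with hypothetical step counts and window bounds:

def controlnet_active(step, num_steps, start=0.0, end=1.0):
    # start/end are fractions of the schedule in [0, 1], mirroring
    # controlnet_guidance_start / controlnet_guidance_end.
    current_sampling_percent = step / num_steps
    return start <= current_sampling_percent <= end

print([controlnet_active(s, 50, start=0.0, end=0.4) for s in (0, 10, 30)])
# [True, True, False]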
 
inference/cli_demo_camera_i2v_pcd.py CHANGED
@@ -346,6 +346,8 @@ def generate_video(
         controlnet_weights=controlnet_weights,
         controlnet_guidance_start=controlnet_guidance_start,
         controlnet_guidance_end=controlnet_guidance_end,
+        height=height,  # Height of the generated video
+        width=width,  # Width of the generated video
     ).frames
     video_generate = video_generate_all[0]
 
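With height and width now forwarded, the config-derived defaults in controlnet_pipeline.py above only apply when the caller omits them. A hypothetical call shape; pipe, prompt, and image stand in for objects the script builds earlier and are not defined in this snippet:

# Hypothetical usage: resolution flows from the CLI arguments into the
# pipeline call instead of silently falling back to the model defaults.
video_generate_all = pipe(
    prompt=prompt,
    image=image,
    height=480,  # explicitly requested output height
    width=720,   # explicitly requested output width
).frames
video_generate = video_generate_all[0]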