Muhammad Taqi Raza committed
Commit 509e4d7 · Parent(s): 208621f
add files
controlnet_pipeline.py
CHANGED
@@ -167,7 +167,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
             self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4
         )
         self.vae_scaling_factor_image = (
-            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7
+            self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7  # 1.15258426
         )
 
         self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
@@ -364,6 +364,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
         image_latents = torch.cat([first_frame, image_latents], dim=1)
 
         if latents is None:
+            print("Latent is known")
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
         else:
             latents = latents.to(device)
@@ -588,8 +589,8 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
 
-        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial
-        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial
+        height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial  # 720
+        width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial  # 480
         num_frames = num_frames or self.transformer.config.sample_frames
 
         num_videos_per_prompt = 1
@@ -656,7 +657,7 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
             device, dtype=prompt_embeds.dtype
         )
 
-        latent_channels = self.transformer.config.in_channels // 2
+        latent_channels = self.transformer.config.in_channels // 2  # 8
         latents, image_latents = self.prepare_latents(
             image,
             batch_size * num_videos_per_prompt,
@@ -719,7 +720,9 @@ class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLor
                 prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype)
 
                 controlnet_states = None
-
+                # latent_model_input
+                # input_controlnet_states
+
                 input_controlnet_states = anchor_states
                 if (controlnet_guidance_start <= current_sampling_percent <= controlnet_guidance_end):
                     controlnet_states = self.controlnet(
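The @@ -167 hunk just records the loaded VAE's actual scaling factor (1.15258426) next to the 0.7 fallback. As a minimal, self-contained sketch of the convention that factor serves in diffusers-style pipelines (assumed from the general diffusers pattern, not code taken from this file):

import torch

# Assumed convention: latents produced by vae.encode(...) are multiplied by
# vae.config.scaling_factor before denoising and divided by it again before
# vae.decode(...). 1.15258426 is the value the commit notes for this VAE.
SCALING_FACTOR = 1.15258426

def to_denoising_space(encoded_latents: torch.Tensor) -> torch.Tensor:
    return encoded_latents * SCALING_FACTOR

def to_decoder_space(denoised_latents: torch.Tensor) -> torch.Tensor:
    return denoised_latents / SCALING_FACTOR

latents = torch.randn(1, 13, 16, 60, 90)  # placeholder latent tensor
assert torch.allclose(to_decoder_space(to_denoising_space(latents)), latents)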
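The @@ -364 hunk adds a debug print on the branch where no latents were passed in, i.e. where fresh noise is drawn. A small, runnable sketch of the randn_tensor call used there; the shape and seed are hypothetical, not values read from this pipeline:

import torch
from diffusers.utils.torch_utils import randn_tensor

shape = (1, 13, 16, 60, 90)  # hypothetical (batch, frames, channels, height, width) latent shape
generator = torch.Generator("cpu").manual_seed(42)

# randn_tensor draws the noise on the generator's device and then moves/casts it,
# so a fixed seed yields the same starting latents across runs.
latents = randn_tensor(shape, generator=generator, device=torch.device("cpu"), dtype=torch.float32)
print(latents.shape, latents.dtype)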
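The @@ -656 hunk annotates latent_channels = in_channels // 2. A short sketch of why the transformer's input width is twice the VAE latent width, assuming this pipeline follows the upstream CogVideoX image-to-video design of concatenating image latents with the noisy video latents along the channel axis (shapes below are placeholders):

import torch

latent_channels = 16                                          # placeholder VAE latent width
latents = torch.randn(1, 13, latent_channels, 60, 90)         # noisy video latents (B, F, C, H, W)
image_latents = torch.randn(1, 13, latent_channels, 60, 90)   # conditioning image latents

latent_model_input = torch.cat([latents, image_latents], dim=2)
# transformer.config.in_channels must equal 2 * latent_channels for this concatenation to line up
assert latent_model_input.shape[2] == 2 * latent_channels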
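The @@ -719 hunk only adds placeholder comments, but the surrounding condition is the interesting part: the controlnet is evaluated only while the denoising progress lies inside [controlnet_guidance_start, controlnet_guidance_end]. A stand-alone sketch of that gating, assuming current_sampling_percent is tracked as step index over total steps (the window values are hypothetical):

num_inference_steps = 50
controlnet_guidance_start, controlnet_guidance_end = 0.0, 0.4  # hypothetical window

steps_with_controlnet = []
for i in range(num_inference_steps):
    current_sampling_percent = i / num_inference_steps
    if controlnet_guidance_start <= current_sampling_percent <= controlnet_guidance_end:
        steps_with_controlnet.append(i)  # the real pipeline calls self.controlnet(...) here

print(f"controlnet active on {len(steps_with_controlnet)} of {num_inference_steps} steps")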
inference/cli_demo_camera_i2v_pcd.py
CHANGED
@@ -346,6 +346,8 @@ def generate_video(
         controlnet_weights=controlnet_weights,
         controlnet_guidance_start=controlnet_guidance_start,
         controlnet_guidance_end=controlnet_guidance_end,
+        height=height,  # Height of the generated video
+        width=width,  # Width of the generated video
     ).frames
     video_generate = video_generate_all[0]
 
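A hedged usage sketch of the extended call inside generate_video(): only the keyword arguments visible in the hunk above are taken from the script; the pipeline object name and the remaining arguments are placeholders:

video_generate_all = pipe(            # `pipe` is a placeholder for the pipeline built earlier in the script
    prompt=prompt,                    # placeholder
    image=image,                      # placeholder
    controlnet_weights=controlnet_weights,
    controlnet_guidance_start=controlnet_guidance_start,
    controlnet_guidance_end=controlnet_guidance_end,
    height=height,  # Height of the generated video, now passed through explicitly
    width=width,    # Width of the generated video
).frames
video_generate = video_generate_all[0]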