Muhammad Taqi Raza committed
Commit · ee6a765
1 Parent(s): 7d2ae5b

upscale, refine, upscale_factor

Browse files
- gradio_app.py +15 -8
- inference/cli_demo_camera_i2v_pcd.py +20 -6
- inference/utils.py +1 -1
- inference/v2v_data/demo.py +1 -0
- inference/v2v_data/models/infer.py +5 -1
gradio_app.py CHANGED

@@ -74,7 +74,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
 "--fps", str(fps),
 "--depth_inference_steps", str(depth_inference_steps),
 "--depth_guidance_scale", str(depth_guidance_scale),
-
+"--near_far_estimated", str(near_far_estimated),
 "--sampler_name", sampler_name,
 "--diffusion_guidance_scale", str(diffusion_guidance_scale),
 "--diffusion_inference_steps", str(diffusion_inference_steps),
@@ -105,7 +105,7 @@ def inference(
 fps, num_frames, controlnet_weights, controlnet_guidance_start,
 controlnet_guidance_end, guidance_scale, num_inference_steps, dtype,
 seed, height, width, downscale_coef, vae_channels,
-controlnet_input_channels, controlnet_transformer_num_layers
+controlnet_input_channels, controlnet_transformer_num_layers, upscale, upscale_factor, refine
 ):
 MODEL_PATH = "/app/pretrained/CogVideoX-5b-I2V"
 ckpt_path = "/app/out/EPiC_pretrained/checkpoint-500.pt"
@@ -132,7 +132,10 @@ def inference(
 "--downscale_coef", str(downscale_coef),
 "--vae_channels", str(vae_channels),
 "--controlnet_input_channels", str(controlnet_input_channels),
-"--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers)
+"--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers),
+"--upscale", str(upscale),
+"--upscale_factor", str(upscale_factor),
+"--refine", str(refine),
 ]
 try:
 result = subprocess.run(command, capture_output=True, text=True, check=True)
@@ -164,7 +167,7 @@ with demo:
 fps_input = gr.Number(value=24, label="FPS")
 num_frames_input = gr.Number(value=49, label="Number of Frames")
 radius_input = gr.Number(value = 1.0, label="Radius Scale")
-mode_input = gr.Dropdown(choices=["gradual"
+mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
 sampler_input = gr.Dropdown(choices=["Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog", "DDIM_Origin"], value="DDIM_Origin", label="Sampler")
 diff_guidance_input = gr.Number(value=6.0, label="Diffusion Guidance")
 diff_steps_input = gr.Number(value=50, label="Diffusion Steps")
@@ -175,8 +178,8 @@ with demo:
 maxres_input = gr.Number(value=1920, label="Max Resolution")
 sample_size = gr.Textbox(label="Sample Size (height, width)", placeholder="e.g., 384, 672", value="384, 672")
 seed_input = gr.Number(value=43, label="Seed")
-height = gr.Number(value=
-width = gr.Number(value=
+height = gr.Number(value=480, label="Height")
+width = gr.Number(value=720, label="Width")
 prompt_input = gr.Textbox(label="Prompt")
 neg_prompt_input = gr.Textbox(label="Negative Prompt", value="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory.")
 refine_prompt_input = gr.Textbox(label="Refine Prompt", value=" The video is of high quality, and the view is very clear. ")
@@ -190,6 +193,10 @@ with demo:
 with gr.Row():
 with gr.Column():
 with gr.Row():
+upscale = gr.Checkbox(label="Upscale", value=True)
+upscale_factor = gr.Number(label="Upscale factor", value=4)
+refine = gr.Checkbox(label="refine", value=True)
+
 controlnet_weights_input = gr.Number(value=0.5, label="ControlNet Weights")
 controlnet_guidance_start_input = gr.Number(value=0.0, label="Guidance Start")
 controlnet_guidance_end_input = gr.Number(value=0.5, label="Guidance End")
@@ -200,7 +207,7 @@ with demo:
 height_input = gr.Number(value=480, label="Height")
 width_input = gr.Number(value=720, label="Width")
 num_frames_input2 = gr.Number(value=49, label="Num Frames")
-fps_input2 = gr.Number(value=
+fps_input2 = gr.Number(value=24, label="FPS")
 downscale_coef_input = gr.Number(value=8, label="Downscale Coef")
 vae_channels_input = gr.Number(value=16, label="VAE Channels")
 controlnet_input_channels_input = gr.Number(value=6, label="ControlNet Input Channels")
@@ -232,7 +239,7 @@ with demo:
 inference_steps_input, dtype_input, seed_input2,
 height_input, width_input, downscale_coef_input,
 vae_channels_input, controlnet_input_channels_input,
-controlnet_layers_input
+controlnet_layers_input, upscale, upscale_factor, refine
 ],
 outputs=[step2_video, step2_logs]
 )
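
The Gradio UI above adds upscale, upscale_factor, and refine controls and forwards them to the CLI script as "--upscale", str(upscale)-style arguments. Since the CLI (next file) declares --upscale and --refine as argparse store_true switches, which take no value, a common pattern is to append such flags only when the checkbox is enabled. Below is a minimal sketch of that pattern; build_demo_command is a hypothetical helper, not the repository's command-building code.

# Hedged sketch: turning Gradio checkbox/number values into a CLI command list.
# build_demo_command is a hypothetical helper for illustration only.
def build_demo_command(upscale: bool, upscale_factor: int, refine: bool) -> list:
    command = [
        "python", "inference/cli_demo_camera_i2v_pcd.py",
        "--upscale_factor", str(upscale_factor),  # value-taking option: always pass its value
    ]
    # store_true flags take no value, so append them only when enabled.
    if upscale:
        command.append("--upscale")
    if refine:
        command.append("--refine")
    return command

# Example with values matching the UI defaults above (Upscale=True, factor=4, refine=True).
print(" ".join(build_demo_command(upscale=True, upscale_factor=4, refine=True)))

Printing the command keeps the sketch side-effect free; in the app the assembled list is handed to subprocess.run as shown in the diff.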
inference/cli_demo_camera_i2v_pcd.py CHANGED

@@ -174,6 +174,9 @@ def generate_video(
 pool_style: str = 'avg',
 pipe_cpu_offload: bool = False,
 fps: int = 8,
+upscale: bool = True,
+upscale_factor: int = 4,
+refine: bool = True,
 ):
 """
 Generates a video based on the given prompt and saves it to the specified path.
@@ -399,12 +402,11 @@ def generate_video(
 else:
 print(f" Value: {item}")

-
-
-
-
-
-latents = rife_inference_with_latents(frame_interpolation_model, latents)
+
+if upscale:
+    latents = utils.upscale_batch_and_concatenate(upscale_model, latents, device, upscale_factor=upscale_factor)
+if refine:
+    latents = rife_inference_with_latents(frame_interpolation_model, latents)  # upscale here is assigned 1.


 # Convert latents back to PIL images after processing
@@ -489,6 +491,15 @@ if __name__ == "__main__":
 parser.add_argument("--enable_model_cpu_offload", action="store_true", default=False, help="Enable model CPU offload")
 parser.add_argument("--fps", type=int, default=8, help="Frames per second for the output video")

+parser.add_argument("--upscale", action="store_true", default=False, help="Enable upscaling of the output video")
+parser.add_argument("--upscale_factor", type=int, default=4, help="Factor by which to upscale the output video")
+parser.add_argument("--refine", action="store_true", default=False, help="Enable refinement of the output video")
+
+# "--upscale", str(upscale),
+# "--upscale_factor", str(upscale_factor),
+# "--refine", str(refine),
+
+
 args = parser.parse_args()
 dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16
 generate_video(
@@ -525,4 +536,7 @@ if __name__ == "__main__":
 pool_style=args.pool_style,
 pipe_cpu_offload=args.enable_model_cpu_offload,
 fps=args.fps,
+upscale=args.upscale,
+upscale_factor=args.upscale_factor,
+refine=args.refine,
 )
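
The three new arguments default to off (store_true with default=False), so the upscale and refine branches only run when the flags are passed explicitly. A small self-contained sketch of how the added argparse options behave; only the three new arguments are reproduced here, the real parser defines many more.

# Minimal sketch of the new CLI options' behaviour (three added arguments only).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--upscale", action="store_true", default=False,
                    help="Enable upscaling of the output video")
parser.add_argument("--upscale_factor", type=int, default=4,
                    help="Factor by which to upscale the output video")
parser.add_argument("--refine", action="store_true", default=False,
                    help="Enable refinement of the output video")

# Flags are boolean switches: present means True, absent means False.
args = parser.parse_args(["--upscale", "--upscale_factor", "4", "--refine"])
assert args.upscale and args.refine and args.upscale_factor == 4

# With no flags, both stay disabled and the generation path is unchanged.
defaults = parser.parse_args([])
assert not defaults.upscale and not defaults.refine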
inference/utils.py CHANGED

@@ -200,7 +200,7 @@ def upscale(upscale_model, tensor: torch.Tensor, inf_device, output_device="cpu"
 return s


-def upscale_batch_and_concatenate(upscale_model, latents, inf_device, output_device="cpu") -> torch.Tensor:
+def upscale_batch_and_concatenate(upscale_model, latents, inf_device, output_device="cpu", upscale_factor = 4) -> torch.Tensor:
 upscaled_latents = []
 for i in range(latents.size(0)):
 latent = latents[i]
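
upscale_batch_and_concatenate now accepts an upscale_factor keyword, which the CLI passes through as upscale_factor=upscale_factor. The body of the function is not shown in this diff, so the sketch below is only an assumption about how such a per-frame loop might thread the factor through; it uses plain bilinear interpolation as a stand-in for the upscale model.

# Hedged sketch: a batch upscaling loop with an upscale_factor parameter.
# NOT the repository's implementation (which wraps an upscale model); the
# interpolation call only illustrates where the factor is applied per frame.
import torch
import torch.nn.functional as F

def upscale_batch_and_concatenate_sketch(latents: torch.Tensor,
                                         inf_device: str = "cpu",
                                         output_device: str = "cpu",
                                         upscale_factor: int = 4) -> torch.Tensor:
    upscaled = []
    for i in range(latents.size(0)):
        frame = latents[i].to(inf_device)                 # (C, H, W)
        big = F.interpolate(frame.unsqueeze(0),           # stand-in for the upscale model
                            scale_factor=upscale_factor,
                            mode="bilinear", align_corners=False)
        upscaled.append(big.squeeze(0).to(output_device))
    return torch.stack(upscaled, dim=0)

# Example: a 49-frame batch of 3x60x90 frames becomes 3x240x360 at factor 4.
frames = torch.rand(49, 3, 60, 90)
out = upscale_batch_and_concatenate_sketch(frames, upscale_factor=4)
assert out.shape == (49, 3, 240, 360)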
inference/v2v_data/demo.py CHANGED

@@ -129,6 +129,7 @@ class GetAnchorVideos:
 opts.depth_guidance_scale,
 window_size=opts.window_size,
 overlap=opts.overlap,
+near_far_estimated = opts.near_far_estimated,
 ).to(opts.device)

 frames = (
inference/v2v_data/models/infer.py CHANGED

@@ -66,6 +66,7 @@ class DepthCrafterDemo:
 overlap: int = 25,
 seed: int = 42,
 track_time: bool = True,
+near_far_estimated: bool = True,
 ):
 set_seed(seed)

@@ -94,7 +95,10 @@ class DepthCrafterDemo:
 depths[depths < 1e-5] = 1e-5
 depths = 10000.0 / depths

-
+if near_far_estimated:
+    print("Estimating near and far values from the depth map...")
+    near, far = self.estimate_near_far(depths)
+
 print(f"Estimated near: {near}, far: {far}")
 depths = depths.clip(near, far)

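
The new near_far_estimated flag gates a call to self.estimate_near_far(depths); that helper is not part of this commit, so the sketch below is only an assumption. A common way to choose near/far clipping planes from a depth map is to take robust percentiles so that a few outlier pixels do not stretch the range.

# Hedged sketch of a percentile-based near/far estimate. estimate_near_far in
# DepthCrafterDemo is not shown in this commit; this illustrates one typical
# approach, not the project's actual logic.
import torch

def estimate_near_far_sketch(depths: torch.Tensor,
                             lo: float = 0.02, hi: float = 0.98):
    flat = depths.flatten().float()
    near = torch.quantile(flat, lo).item()   # ignore the closest ~2% of samples
    far = torch.quantile(flat, hi).item()    # ignore the farthest ~2% of samples
    return near, far

depths = 10000.0 / torch.rand(2, 64, 64).clamp_min(1e-5)  # mimic the inverse-depth scaling above
near, far = estimate_near_far_sketch(depths)
clipped = depths.clip(near, far)             # same clipping step as in the diff
print(f"Estimated near: {near}, far: {far}")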