Spaces:

roll-ai
/

EPiC

Paused

App Files Files Community

Muhammad Taqi Raza committed 13 days ago

Commit

ba201a1

1 Parent(s): 15db18d

adding options

Browse files

Files changed (4) hide show

gradio_app.py +175 -111
inference/v2v_data/demo.py +3 -3
inference/v2v_data/inference.py +19 -4
inference/v2v_data/models/utils.py +5 -5

gradio_app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import subprocess
 from datetime import datetime
@@ -6,165 +5,230 @@ from pathlib import Path
 import gradio as gr
 import numpy as np
-# -----------------------------
-# Setup paths and env
-# -----------------------------
-HF_HOME = "/app/hf_cache"
-os.environ["HF_HOME"] = HF_HOME
-os.environ["TRANSFORMERS_CACHE"] = HF_HOME
-os.makedirs(HF_HOME, exist_ok=True)
-PRETRAINED_DIR = "/app/pretrained"
-os.makedirs(PRETRAINED_DIR, exist_ok=True)
-# -----------------------------
-# Step 1: Optional Model Download
-# -----------------------------
-def download_models():
-    expected_model = os.path.join(PRETRAINED_DIR, "RAFT/raft-things.pth")
-    if not Path(expected_model).exists():
-        print("⚙️ Downloading pretrained models...")
-        try:
-            subprocess.check_call(["bash", "download/download_models.sh"])
-            print("✅ Models downloaded.")
-        except subprocess.CalledProcessError as e:
-            print(f"❌ Model download failed: {e}")
-    else:
-        print("✅ Pretrained models already exist.")
-download_models()
 # -----------------------------
-# Step 2: Inference Logic
 # -----------------------------
-def estimate_near_far(depths, lower_percentile=5, upper_percentile=95):
-    flat = depths.flatten()
-    near = np.percentile(flat, lower_percentile)
-    far = np.percentile(flat, upper_percentile)
-    return near, far
-def run_epic_inference(video_path, fps, num_frames, target_pose, mode):
     temp_input_path = "/app/temp_input.mp4"
     output_dir = "/app/output_anchor"
     video_output_path = f"{output_dir}/masked_videos/output.mp4"
-    # Save uploaded video
     if video_path:
         os.system(f"cp '{video_path}' {temp_input_path}")
     try:
         theta, phi, r, x, y = target_pose.strip().split()
     except ValueError:
-        return f"Invalid target pose format. Use: θ φ r x y", None
     logs =  f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"
     command = [
         "python", "/app/inference/v2v_data/inference.py",
         "--video_path", temp_input_path,
         "--stride", "1",
         "--out_dir", output_dir,
-        "--radius_scale", "1",
         "--camera", "target",
         "--mask",
         "--target_pose", theta, phi, r, x, y,
         "--video_length", str(num_frames),
         "--save_name", "output",
         "--mode", mode,
-        "--fps", str(fps)
-    ]
     try:
         result = subprocess.run(command, capture_output=True, text=True, check=True)
         logs += result.stdout
     except subprocess.CalledProcessError as e:
-        logs += f"❌ Inference failed:\n{e.stderr}{e.stdout}"
-        return logs, None
-    return logs + result.stdout, str(video_output_path) if os.path.exists(video_output_path) else (logs, None)
-def print_output_directory(out_dir):
-    result = ""
-    for root, dirs, files in os.walk(out_dir):
-        level = root.replace(out_dir, '').count(os.sep)
-        indent = ' ' * 4 * level
-        result += f"{indent}{os.path.basename(root)}/\n"
-        sub_indent = ' ' * 4 * (level + 1)
-        for f in files:
-            result += f"{sub_indent}{f}\n"
-    return result
-def inference(video_path, num_frames, fps, target_pose, mode):
-    logs, video_masked = run_epic_inference(video_path, fps, num_frames, target_pose, mode)
-    # return logs, video_masked, video_masked
-    result_dir = print_output_directory("/app/output_anchor")
     MODEL_PATH = "/app/pretrained/CogVideoX-5b-I2V"
-    ckpt_steps = 500
-    ckpt_dir = "/app/out/EPiC_pretrained"
-    ckpt_file = f"checkpoint-{ckpt_steps}.pt"
-    ckpt_path = f"{ckpt_dir}/{ckpt_file}"
     video_root_dir = "/app/output_anchor"
     out_dir = "/app/output"
     command = [
         "python", "/app/inference/cli_demo_camera_i2v_pcd.py",
         "--video_root_dir", video_root_dir,
         "--base_model_path", MODEL_PATH,
         "--controlnet_model_path", ckpt_path,
         "--output_path", out_dir,
-        "--start_camera_idx", "0",
-        "--end_camera_idx", "8",
-        "--controlnet_weights", "1.0",
-        "--controlnet_guidance_start", "0.0",
-        "--controlnet_guidance_end", "0.4",
-        "--controlnet_input_channels", "3",
-        "--controlnet_transformer_num_attn_heads", "4",
-        "--controlnet_transformer_attention_head_dim", "64",
-        "--controlnet_transformer_out_proj_dim_factor", "64",
-        "--controlnet_transformer_out_proj_dim_zero_init",
-        "--vae_channels", "16",
         "--num_frames", str(num_frames),
-        "--controlnet_transformer_num_layers", "8",
-        "--infer_with_mask",
-        "--pool_style", "max",
-        "--seed", "43",
-        "--fps", str(fps)
     ]
     result = subprocess.run(command, capture_output=True, text=True)
-    logs += "\n" + result.stdout
-    result_dir = print_output_directory(out_dir)
-    if result.returncode == 0:
-        logs += "Inference completed successfully."
-    else:
-        logs += f"Error occurred during inference: {result.stderr}"
-    return logs + result_dir + "Hello! it is successful", str(f"{out_dir}/00000_43_out.mp4"), video_masked
 # -----------------------------
-# Step 3: Create Gradio UI
 # -----------------------------
-demo = gr.Interface(
-    fn=inference,
-    inputs=[
-        gr.Video(label="Upload Video (MP4)"),
-        gr.Slider(minimum=1, maximum=120, value=50, step=1, label="Number of Frames"),
-        gr.Slider(minimum=1, maximum=90, value=10, step=1, label="FPS"),
-        gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0"),
-        gr.Dropdown(choices=["gradual", "direct", "bullet"], value="gradual", label="Camera Mode"),
-    ],
-    outputs=[
-        gr.Textbox(label="Inference Logs"),
-        gr.Video(label="Generated Video`"),
-        gr.Video(label="Masked Video")
-    ],
-    title="🎬 EPiC: Efficient Video Camera Control",
-    description="Upload a video, describe the scene, and apply cinematic camera motion using pretrained EPiC models.",
-)
-# -----------------------------
-# Step 4: Launch App
-# -----------------------------
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import os
 import subprocess
 from datetime import datetime
 import gradio as gr
 import numpy as np
+# # -----------------------------
+# # Setup paths and env
+# # -----------------------------
+# HF_HOME = "/app/hf_cache"
+# os.environ["HF_HOME"] = HF_HOME
+# os.environ["TRANSFORMERS_CACHE"] = HF_HOME
+# os.makedirs(HF_HOME, exist_ok=True)
+# PRETRAINED_DIR = "/app/pretrained"
+# os.makedirs(PRETRAINED_DIR, exist_ok=True)
+# # -----------------------------
+# # Step 1: Optional Model Download
+# # -----------------------------
+# def download_models():
+#     expected_model = os.path.join(PRETRAINED_DIR, "RAFT/raft-things.pth")
+#     if not Path(expected_model).exists():
+#         print("⚙️ Downloading pretrained models...")
+#         try:
+#             subprocess.check_call(["bash", "download/download_models.sh"])
+#             print("✅ Models downloaded.")
+#         except subprocess.CalledProcessError as e:
+#             print(f"❌ Model download failed: {e}")
+#     else:
+#         print("✅ Pretrained models already exist.")
 # -----------------------------
+# Step 1: Get Anchor Video
 # -----------------------------
def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
                     radius_scale, near_far_estimated,
                     sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
                     prompt, negative_prompt, refine_prompt,
                     depth_inference_steps, depth_guidance_scale,
                     window_size, overlap, max_res, sample_size, seed_input, height, width):
    """Run step 1: render the masked anchor video for the requested camera pose.

    Copies the uploaded video to a fixed location, then launches
    ``/app/inference/v2v_data/inference.py`` as a subprocess with the options
    collected from the UI.

    Args:
        video_path: Path of the uploaded video; when falsy, no copy is made and
            whatever already sits at the fixed input location is used.
        target_pose: Five whitespace-separated values ``θ φ r x y``.
        near_far_estimated: Truthy to enable near/far estimation.
        sample_size: ``"height, width"`` string; falls back to ``"384, 672"``.
        Remaining arguments are forwarded as the CLI options of the same name
        (``num_frames`` is forwarded as ``--video_length``).

    Returns:
        A ``(video, logs)`` tuple on every path, in the order of the Gradio
        outputs ``[step1_video, step1_logs]``. ``video`` is the output path as
        a string, or ``None`` when anything failed.
    """
    import shutil  # local import: only this function needs it

    def _num(value):
        # UI number fields may deliver integral values as floats (43.0);
        # "43.0" is rejected by argparse options declared with type=int,
        # while "43" is accepted by both int and float options.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    temp_input_path = "/app/temp_input.mp4"
    output_dir = "/app/output_anchor"
    video_output_path = f"{output_dir}/masked_videos/output.mp4"

    # Validate the pose before touching the filesystem.
    try:
        theta, phi, r, x, y = (target_pose or "").strip().split()
    except ValueError:
        return None, "Invalid target pose format. Use: θ φ r x y"

    if video_path:
        # shutil instead of a shell `cp` string: paths containing quotes or
        # shell metacharacters are copied verbatim rather than interpreted.
        try:
            shutil.copyfile(video_path, temp_input_path)
        except OSError as e:
            return None, f"❌ Could not copy input video: {e}"

    logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"

    command = [
        "python", "/app/inference/v2v_data/inference.py",
        "--video_path", temp_input_path,
        "--stride", "1",
        "--out_dir", output_dir,
        "--radius_scale", _num(radius_scale),
        "--camera", "target",
        "--mask",
        "--target_pose", theta, phi, r, x, y,
        "--video_length", _num(num_frames),
        "--save_name", "output",
        "--mode", mode,
        "--fps", _num(fps),
        "--depth_inference_steps", _num(depth_inference_steps),
        "--depth_guidance_scale", _num(depth_guidance_scale),
        # The parser declares this option with type=bool, i.e. bool(<string>):
        # every non-empty string (including "False") is True, so the only way
        # to send False is the empty string.
        "--near_far_estimated", "True" if near_far_estimated else "",
        "--sampler_name", sampler_name,
        "--diffusion_guidance_scale", _num(diffusion_guidance_scale),
        "--diffusion_inference_steps", _num(diffusion_inference_steps),
        "--prompt", prompt or "",
        "--negative_prompt", negative_prompt or "",
        "--refine_prompt", refine_prompt or "",
        "--window_size", _num(window_size),
        "--overlap", _num(overlap),
        "--max_res", _num(max_res),
        "--sample_size", sample_size if sample_size else "384, 672",
        "--seed", _num(seed_input),
        "--height", _num(height),
        "--width", _num(width),
    ]

    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        logs += f"❌ Inference failed:\n{e.stderr}{e.stdout}"
        return None, logs
    logs += result.stdout

    # A zero exit code does not guarantee the video was written.
    if not os.path.exists(video_output_path):
        logs += f"\n❌ Expected output video not found: {video_output_path}"
        return None, logs
    return str(video_output_path), logs
+# -----------------------------
+# Step 2: Run Inference
+# -----------------------------
def inference(
    fps, num_frames, controlnet_weights, controlnet_guidance_start,
    controlnet_guidance_end, guidance_scale, num_inference_steps, dtype,
    seed, height, width, downscale_coef, vae_channels,
    controlnet_input_channels, controlnet_transformer_num_layers
):
    """Run step 2: refine the anchor video with the ControlNet pipeline.

    Launches ``/app/inference/cli_demo_camera_i2v_pcd.py`` as a subprocess on
    the anchor output directory and looks for the resulting video.

    Args:
        seed: Sampling seed; also part of the expected output filename
            ``00000_<seed>_out.mp4``.
        dtype: Compute dtype name, passed through unchanged.
        Remaining arguments are forwarded as the CLI options of the same name.

    Returns:
        A ``(video, logs)`` tuple matching the Gradio outputs
        ``[step2_video, step2_logs]``. ``video`` is ``None`` when the expected
        output file does not exist.
    """
    def _num(value):
        # UI number fields may deliver integral values as floats (42.0);
        # render them as "42" so integer-typed CLI options accept them and the
        # seed matches the output filename.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    MODEL_PATH = "/app/pretrained/CogVideoX-5b-I2V"
    ckpt_path = "/app/out/EPiC_pretrained/checkpoint-500.pt"
    video_root_dir = "/app/output_anchor"
    out_dir = "/app/output"
    seed_tag = _num(seed)

    command = [
        "python", "/app/inference/cli_demo_camera_i2v_pcd.py",
        "--video_root_dir", video_root_dir,
        "--base_model_path", MODEL_PATH,
        "--controlnet_model_path", ckpt_path,
        "--output_path", out_dir,
        "--controlnet_weights", _num(controlnet_weights),
        "--controlnet_guidance_start", _num(controlnet_guidance_start),
        "--controlnet_guidance_end", _num(controlnet_guidance_end),
        "--guidance_scale", _num(guidance_scale),
        "--num_inference_steps", _num(num_inference_steps),
        "--dtype", dtype,
        "--seed", seed_tag,
        "--height", _num(height),
        "--width", _num(width),
        "--num_frames", _num(num_frames),
        "--fps", _num(fps),
        "--downscale_coef", _num(downscale_coef),
        "--vae_channels", _num(vae_channels),
        "--controlnet_input_channels", _num(controlnet_input_channels),
        "--controlnet_transformer_num_layers", _num(controlnet_transformer_num_layers),
    ]

    result = subprocess.run(command, capture_output=True, text=True)
    logs = result.stdout
    if result.returncode != 0:
        # Without stderr a failed run shows empty logs in the UI.
        logs += f"\n❌ Inference failed (exit code {result.returncode}):\n{result.stderr}"

    video_output = f"{out_dir}/00000_{seed_tag}_out.mp4"
    return (video_output if os.path.exists(video_output) else None), logs
 # -----------------------------
+# UI
 # -----------------------------
# Two-tab Gradio layout: tab 1 produces the masked anchor video, tab 2 refines it.
with gr.Blocks() as demo:
    gr.Markdown("## 🎬 EPiC: Cinematic Camera Control")

    with gr.Tabs():
        # ---- Tab 1: option widgets on the left, upload / run / results on the right ----
        with gr.TabItem("Step 1: Camera Anchor"):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        near_far_estimated = gr.Checkbox(value=True, label="Near Far Estimation")
                        pose_input = gr.Textbox(
                            label="Target Pose (θ φ r x y)",
                            placeholder="e.g., 0 30 -0.6 0 0",
                        )
                        fps_input = gr.Number(label="FPS", value=24)
                        num_frames_input = gr.Number(label="Number of Frames", value=49)
                        radius_input = gr.Number(label="Radius Scale", value=1.0)
                        mode_input = gr.Dropdown(
                            label="Camera Mode",
                            choices=["gradual", "direct", "bullet"],
                            value="gradual",
                        )
                        sampler_input = gr.Dropdown(
                            label="Sampler",
                            choices=["Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog", "DDIM_Origin"],
                            value="DDIM_Origin",
                        )
                        diff_guidance_input = gr.Number(label="Diffusion Guidance", value=6.0)
                        diff_steps_input = gr.Number(label="Diffusion Steps", value=50)
                        depth_steps_input = gr.Number(label="Depth Steps", value=5)
                        depth_guidance_input = gr.Number(label="Depth Guidance", value=1.0)
                        window_input = gr.Number(label="Window Size", value=64)
                        overlap_input = gr.Number(label="Overlap", value=25)
                        maxres_input = gr.Number(label="Max Resolution", value=1024)
                        sample_size = gr.Textbox(
                            label="Sample Size (height, width)",
                            placeholder="e.g., 384, 672",
                            value="384, 672",
                        )
                        seed_input = gr.Number(label="Seed", value=43)
                        height = gr.Number(label="Height", value=576)
                        width = gr.Number(label="Width", value=1024)
                        prompt_input = gr.Textbox(label="Prompt")
                        neg_prompt_input = gr.Textbox(
                            label="Negative Prompt",
                            value="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory.",
                        )
                        refine_prompt_input = gr.Textbox(
                            label="Refine Prompt",
                            value=" The video is of high quality, and the view is very clear. ",
                        )
                with gr.Column():
                    video_input = gr.Video(label="Upload Video (MP4)")
                    step1_button = gr.Button("▶️ Run Step 1")
                    step1_video = gr.Video(label="[Step 1] Masked Video")
                    step1_logs = gr.Textbox(label="[Step 1] Logs")

        # ---- Tab 2: refinement options on the left, result / run / logs on the right ----
        with gr.TabItem("Step 2: CogVideoX Refinement"):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        controlnet_weights_input = gr.Number(label="ControlNet Weights", value=0.5)
                        controlnet_guidance_start_input = gr.Number(label="Guidance Start", value=0.0)
                        controlnet_guidance_end_input = gr.Number(label="Guidance End", value=0.5)
                        guidance_scale_input = gr.Number(label="Guidance Scale", value=6.0)
                        inference_steps_input = gr.Number(label="Num Inference Steps", value=50)
                        dtype_input = gr.Dropdown(
                            label="Compute Dtype",
                            choices=["float16", "bfloat16"],
                            value="bfloat16",
                        )
                        seed_input2 = gr.Number(label="Seed", value=42)
                        height_input = gr.Number(label="Height", value=480)
                        width_input = gr.Number(label="Width", value=720)
                        num_frames_input2 = gr.Number(label="Num Frames", value=97)
                        fps_input2 = gr.Number(label="FPS", value=8)
                        downscale_coef_input = gr.Number(label="Downscale Coef", value=8)
                        vae_channels_input = gr.Number(label="VAE Channels", value=16)
                        controlnet_input_channels_input = gr.Number(label="ControlNet Input Channels", value=6)
                        controlnet_layers_input = gr.Number(label="ControlNet Transformer Layers", value=8)
                with gr.Column():
                    step2_video = gr.Video(label="[Step 2] Final Refined Video")
                    step2_button = gr.Button("▶️ Run Step 2")
                    step2_logs = gr.Textbox(label="[Step 2] Logs")

    # Component order must mirror the positional parameters of get_anchor_video.
    step1_inputs = [
        video_input, fps_input, num_frames_input, pose_input, mode_input,
        radius_input, near_far_estimated,
        sampler_input, diff_guidance_input, diff_steps_input,
        prompt_input, neg_prompt_input, refine_prompt_input,
        depth_steps_input, depth_guidance_input,
        window_input, overlap_input, maxres_input, sample_size, seed_input, height, width,
    ]
    step1_button.click(get_anchor_video, inputs=step1_inputs, outputs=[step1_video, step1_logs])

    # Component order must mirror the positional parameters of inference.
    step2_inputs = [
        fps_input2, num_frames_input2,
        controlnet_weights_input, controlnet_guidance_start_input,
        controlnet_guidance_end_input, guidance_scale_input,
        inference_steps_input, dtype_input, seed_input2,
        height_input, width_input, downscale_coef_input,
        vae_channels_input, controlnet_input_channels_input,
        controlnet_layers_input,
    ]
    step2_button.click(inference, inputs=step2_inputs, outputs=[step2_video, step2_logs])

if __name__ == "__main__":
    # The model download step is disabled: download_models() is not called here.
    demo.launch(server_name="0.0.0.0", server_port=7860)

inference/v2v_data/demo.py CHANGED Viewed

@@ -24,8 +24,8 @@ def get_center_crop_resolution(original_resoultion, target_aspect_ratio=(2, 3)):
         crop_w = original_w
         crop_h = int(crop_w / aspect_ratio)
-    resized_h = 576
-    resized_w = 1024
     h_ratio = resized_h / original_h
     w_ratio = resized_w / original_w
@@ -111,7 +111,7 @@ class GetAnchorVideos:
     def infer_gradual(self, opts):
         frames = read_video_frames(
-            opts.video_path, opts.video_length, opts.stride, opts.max_res
         )
         vr = VideoReader(opts.video_path, ctx=cpu(0))
         frame_shape = vr[0].shape  # (H, W, 3)

         crop_w = original_w
         crop_h = int(crop_w / aspect_ratio)
+    resized_h = original_resoultion[0] # previous 576
+    resized_w = original_resoultion[1] # previous 1024
     h_ratio = resized_h / original_h
     w_ratio = resized_w / original_w
     def infer_gradual(self, opts):
         frames = read_video_frames(
+            opts.video_path, opts.video_length, opts.stride, opts.max_res, opts.height, opts.width
         )
         vr = VideoReader(opts.video_path, ctx=cpu(0))
         frame_shape = vr[0].shape  # (H, W, 3)

inference/v2v_data/inference.py CHANGED Viewed

@@ -32,7 +32,7 @@ def get_parser():
         '--seed', type=int, default=43, help='Random seed for reproducibility'
     )
     parser.add_argument(
-        '--video_length', type=int, default=97, help='Length of the video frames'
     )
     parser.add_argument('--fps', type=int, default=10, help='Fps for saved video')
     parser.add_argument(
@@ -48,6 +48,7 @@ def get_parser():
         help='Scale factor for the spherical radius',
     )
     parser.add_argument('--camera', type=str, default='traj', help='traj or target')
     parser.add_argument(
         '--mode', type=str, default='gradual', help='gradual, bullet or direct'
     )
@@ -71,8 +72,21 @@ def get_parser():
     parser.add_argument(
         '--far', type=float, default=10000.0, help='Far clipping plane distance'
     )
     parser.add_argument('--anchor_idx', type=int, default=0, help='One GT frame')
     ## diffusion
     parser.add_argument(
         '--low_gpu_memory_mode',
@@ -80,6 +94,7 @@ def get_parser():
         default=False,
         help='Enable low GPU memory mode',
     )
     # parser.add_argument('--model_name', type=str, default='checkpoints/CogVideoX-Fun-V1.1-5b-InP', help='Path to the model')
     parser.add_argument(
         '--model_name',
@@ -126,13 +141,13 @@ def get_parser():
     parser.add_argument(
         '--negative_prompt',
         type=str,
-        default="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion.",
         help='Negative prompt for video generation',
     )
     parser.add_argument(
         '--refine_prompt',
         type=str,
-        default=". The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
         help='Prompt for video generation',
     )
     parser.add_argument('--qwen_path', type=str, default="/app/pretrained/Qwen2.5-VL-7B-Instruct")

         '--seed', type=int, default=43, help='Random seed for reproducibility'
     )
     parser.add_argument(
+        '--video_length', type=int, default=49, help='Length of the video frames'
     )
     parser.add_argument('--fps', type=int, default=10, help='Fps for saved video')
     parser.add_argument(
         help='Scale factor for the spherical radius',
     )
     parser.add_argument('--camera', type=str, default='traj', help='traj or target')
     parser.add_argument(
         '--mode', type=str, default='gradual', help='gradual, bullet or direct'
     )
     parser.add_argument(
         '--far', type=float, default=10000.0, help='Far clipping plane distance'
     )
+    parser.add_argument(
+        '--height', type=int, default=1080, help='Height'
+    )
+    parser.add_argument(
+        '--width', type=int, default=1920, help='width'
+    )
     parser.add_argument('--anchor_idx', type=int, default=0, help='One GT frame')
+    parser.add_argument(
+        '--near_far_estimated',
+        type=bool,
+        default=True,
+        help='Use estimated near and far values',
+    )
     ## diffusion
     parser.add_argument(
         '--low_gpu_memory_mode',
         default=False,
         help='Enable low GPU memory mode',
     )
     # parser.add_argument('--model_name', type=str, default='checkpoints/CogVideoX-Fun-V1.1-5b-InP', help='Path to the model')
     parser.add_argument(
         '--model_name',
     parser.add_argument(
         '--negative_prompt',
         type=str,
+        default="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid.",
         help='Negative prompt for video generation',
     )
     parser.add_argument(
         '--refine_prompt',
         type=str,
+        default=". The video is of high quality, and the view is very clear. ",
         help='Prompt for video generation',
     )
     parser.add_argument('--qwen_path', type=str, default="/app/pretrained/Qwen2.5-VL-7B-Instruct")

inference/v2v_data/models/utils.py CHANGED Viewed

@@ -28,7 +28,7 @@ from decord import VideoReader, cpu
 from PIL import Image
-def read_video_frames(video_path, process_length, stride, max_res, dataset="open"):
     def is_image(path):
         return any(path.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.bmp'])
@@ -36,8 +36,8 @@ def read_video_frames(video_path, process_length, stride, max_res, dataset="open
         print("==> Detected image. Loading as single-frame video:", video_path)
         img = Image.open(video_path).convert("RGB")
         # FIXME: hard coded
-        width = 1024
-        height = 576
         img = img.resize((width, height), Image.BICUBIC)
         img = np.array(img).astype("float32") / 255.0  # [H, W, 3]
         frames = img[None, ...]  # [1, H, W, 3]
@@ -50,8 +50,8 @@ def read_video_frames(video_path, process_length, stride, max_res, dataset="open
         print("==> original video shape:", (len(vid), *vid.get_batch([0]).shape[1:]))
         # FIXME: hard coded
-        width = 1024
-        height = 576
     vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)

 from PIL import Image
+def read_video_frames(video_path, process_length, stride, max_res, dataset="open", height=576, width=1024):
     def is_image(path):
         return any(path.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.bmp'])
         print("==> Detected image. Loading as single-frame video:", video_path)
         img = Image.open(video_path).convert("RGB")
         # FIXME: hard coded
+        width = width
+        height = height
         img = img.resize((width, height), Image.BICUBIC)
         img = np.array(img).astype("float32") / 255.0  # [H, W, 3]
         frames = img[None, ...]  # [1, H, W, 3]
         print("==> original video shape:", (len(vid), *vid.get_batch([0]).shape[1:]))
         # FIXME: hard coded
+        width = width
+        height = height
     vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)