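"""Gradio demo for EPiC cinematic camera control.

Two-step pipeline:
  1. "Camera Anchor": generate a masked anchor video from an input clip and a
     target camera pose.
  2. "CogVideoX Refinement": refine the anchor video with a CogVideoX-5b-I2V
     ControlNet checkpoint.

Both steps shell out to the repository's inference scripts via subprocess.
"""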
import os
import shutil
import subprocess
from datetime import datetime
from pathlib import Path

import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download, snapshot_download
# -----------------------------
# Setup paths and env
# -----------------------------
HF_HOME = "/app/hf_cache"
os.environ["HF_HOME"] = HF_HOME
os.environ["TRANSFORMERS_CACHE"] = HF_HOME
os.makedirs(HF_HOME, exist_ok=True)

# hf_hub_download(repo_id="ai-forever/Real-ESRGAN", filename="RealESRGAN_x4.pth", local_dir="model_real_esran")
# snapshot_download(repo_id="AlexWortega/RIFE", local_dir="model_rife")

PRETRAINED_DIR = "/app/pretrained"
os.makedirs(PRETRAINED_DIR, exist_ok=True)
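# Pretrained checkpoints (e.g. RAFT and CogVideoX-5b-I2V) are expected under
# PRETRAINED_DIR; download_models() below populates it on first launch.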
# -----------------------------
# Optional Model Download
# -----------------------------
def download_models():
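    """Fetch pretrained weights via the repo's download script if they are missing.

    The RAFT checkpoint is used as a sentinel: if it is present, all models are
    assumed to have been downloaded already.
    """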
    expected_model = os.path.join(PRETRAINED_DIR, "RAFT/raft-things.pth")
    if not Path(expected_model).exists():
        print("⚙️ Downloading pretrained models...")
        try:
            subprocess.check_call(["bash", "download/download_models.sh"])
            print("✅ Models downloaded.")
        except subprocess.CalledProcessError as e:
            print(f"Model download failed: {e}")
    else:
        print("✅ Pretrained models already exist.")
# -----------------------------
# Step 1: Get Anchor Video
# -----------------------------
def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
                     radius_scale, near_far_estimated,
                     sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
                     prompt, negative_prompt, refine_prompt,
                     depth_inference_steps, depth_guidance_scale,
                     window_size, overlap, max_res, sample_size,
                     seed_input, height, width, aspect_ratio_inputs,
                     init_dx, init_dy, init_dz):
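    """Step 1: generate the masked camera-anchor video.

    Copies the uploaded clip to a fixed temp path, parses the target pose
    string ("θ φ r x y"), and runs the repository's v2v inference script as a
    subprocess. Returns a (video_path, logs) tuple for the Gradio UI.
    """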
    temp_input_path = "/app/temp_input.mp4"
    output_dir = "/app/output_anchor"
    video_output_path = f"{output_dir}/masked_videos/output.mp4"

    if video_path:
        shutil.copy(video_path, temp_input_path)

    try:
        theta, phi, r, x, y = target_pose.strip().split()
    except ValueError:
        # Return the error in the log slot so the video output stays empty.
        return None, "Invalid target pose format. Use: θ φ r x y"

    logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"

    w, h = aspect_ratio_inputs.strip().split(",")
    h_s, w_s = sample_size.strip().split(",")
    command = [
        "python", "/app/inference/v2v_data/inference.py",
        "--video_path", temp_input_path,
        "--stride", "1",
        "--out_dir", output_dir,
        "--radius_scale", str(radius_scale),
        "--camera", "target",
        "--mask",
        "--target_pose", theta, phi, r, x, y,
        "--video_length", str(num_frames),
        "--save_name", "output",
        "--mode", mode,
        "--fps", str(fps),
        "--depth_inference_steps", str(depth_inference_steps),
        "--depth_guidance_scale", str(depth_guidance_scale),
        "--near_far_estimated", str(near_far_estimated),
        "--sampler_name", sampler_name,
        "--diffusion_guidance_scale", str(diffusion_guidance_scale),
        "--diffusion_inference_steps", str(diffusion_inference_steps),
        "--prompt", prompt if prompt else "",
        "--negative_prompt", negative_prompt,
        "--refine_prompt", refine_prompt,
        "--window_size", str(window_size),
        "--overlap", str(overlap),
        "--max_res", str(max_res),
        "--sample_size", h_s.strip(), w_s.strip(),
        "--seed", str(seed_input),
        "--height", str(height),
        "--width", str(width),
        "--target_aspect_ratio", w.strip(), h.strip(),
        "--init_dx", str(init_dx),
        "--init_dy", str(init_dy),
        "--init_dz", str(init_dz),
    ]
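    # Capture the script's stdout for the UI log box; on failure surface both
    # stderr and stdout so the error is visible in the interface.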
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        logs += result.stdout
    except subprocess.CalledProcessError as e:
        logs += f"Inference failed:\n{e.stderr}{e.stdout}"
        return None, logs

    return video_output_path, logs
# -----------------------------
# Step 2: Run Inference
# -----------------------------
def inference(
    fps, num_frames, controlnet_weights, controlnet_guidance_start,
    controlnet_guidance_end, guidance_scale, num_inference_steps, dtype,
    seed, height, width, downscale_coef, vae_channels,
    controlnet_input_channels, controlnet_transformer_num_layers,
):
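    """Step 2: refine the Step 1 anchor video with CogVideoX-5b-I2V + ControlNet.

    Invokes the repository's CLI demo as a subprocess on the Step 1 output
    directory and returns a (video_path_or_None, logs) tuple.
    """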
    model_path = "/app/pretrained/CogVideoX-5b-I2V"
    ckpt_path = "/app/out/EPiC_pretrained/checkpoint-500.pt"
    video_root_dir = "/app/output_anchor"
    out_dir = "/app/output"

    command = [
        "python", "/app/inference/cli_demo_camera_i2v_pcd.py",
        "--video_root_dir", video_root_dir,
        "--base_model_path", model_path,
        "--controlnet_model_path", ckpt_path,
        "--output_path", out_dir,
        "--controlnet_weights", str(controlnet_weights),
        "--controlnet_guidance_start", str(controlnet_guidance_start),
        "--controlnet_guidance_end", str(controlnet_guidance_end),
        "--guidance_scale", str(guidance_scale),
        "--num_inference_steps", str(num_inference_steps),
        "--dtype", dtype,
        "--seed", str(seed),
        "--height", str(height),
        "--width", str(width),
        "--num_frames", str(num_frames),
        "--fps", str(fps),
        "--downscale_coef", str(downscale_coef),
        "--vae_channels", str(vae_channels),
        "--controlnet_input_channels", str(controlnet_input_channels),
        "--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers),
    ]
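    # Run the refinement; the output video path is only reported back to the UI
    # if the expected file was actually written.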
    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        logs = result.stdout
    except subprocess.CalledProcessError as e:
        logs = f"❌ Step 2 Inference Failed:\nSTDERR:\n{e.stderr}\nSTDOUT:\n{e.stdout}"
        return None, logs

    video_output = f"{out_dir}/00000_{seed}_out.mp4"
    return video_output if os.path.exists(video_output) else None, logs
# -----------------------------
# UI
# -----------------------------
demo = gr.Blocks()

with demo:
    gr.Markdown("## 🎬 EPiC: Cinematic Camera Control")
    with gr.Tabs():
        with gr.TabItem("Step 1: Camera Anchor"):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        near_far_estimated = gr.Checkbox(label="Near Far Estimation", value=True)
                        pose_input = gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0")
                        fps_input = gr.Number(value=24, label="FPS")
                        aspect_ratio_inputs = gr.Textbox(label="Target Aspect Ratio (e.g., 2,3)")
                        init_dx = gr.Number(value=0.0, label="Start Camera Offset X")
                        init_dy = gr.Number(value=0.0, label="Start Camera Offset Y")
                        init_dz = gr.Number(value=0.0, label="Start Camera Offset Z")
                        num_frames_input = gr.Number(value=49, label="Number of Frames")
                        radius_input = gr.Number(value=1.0, label="Radius Scale")
                        mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
                        sampler_input = gr.Dropdown(choices=["Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog", "DDIM_Origin"], value="DDIM_Origin", label="Sampler")
                        diff_guidance_input = gr.Number(value=6.0, label="Diffusion Guidance")
                        diff_steps_input = gr.Number(value=50, label="Diffusion Steps")
                        depth_steps_input = gr.Number(value=5, label="Depth Steps")
                        depth_guidance_input = gr.Number(value=1.0, label="Depth Guidance")
                        window_input = gr.Number(value=64, label="Window Size")
                        overlap_input = gr.Number(value=25, label="Overlap")
                        maxres_input = gr.Number(value=1920, label="Max Resolution")
                        sample_size = gr.Textbox(label="Sample Size (height, width)", placeholder="e.g., 384, 672", value="384, 672")
                        seed_input = gr.Number(value=43, label="Seed")
                        height = gr.Number(value=480, label="Height")
                        width = gr.Number(value=720, label="Width")
                        prompt_input = gr.Textbox(label="Prompt")
                        neg_prompt_input = gr.Textbox(label="Negative Prompt", value="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory.")
                        refine_prompt_input = gr.Textbox(label="Refine Prompt", value=" The video is of high quality, and the view is very clear. ")
                with gr.Column():
                    video_input = gr.Video(label="Upload Video (MP4)")
                    step1_button = gr.Button("▶️ Run Step 1")
                    step1_video = gr.Video(label="[Step 1] Masked Video")
                    step1_logs = gr.Textbox(label="[Step 1] Logs")
with gr.TabItem("Step 2: CogVideoX Refinement"): | |
with gr.Row(): | |
with gr.Column(): | |
with gr.Row(): | |
controlnet_weights_input = gr.Number(value=0.5, label="ControlNet Weights") | |
controlnet_guidance_start_input = gr.Number(value=0.0, label="Guidance Start") | |
controlnet_guidance_end_input = gr.Number(value=0.5, label="Guidance End") | |
guidance_scale_input = gr.Number(value=6.0, label="Guidance Scale") | |
inference_steps_input = gr.Number(value=50, label="Num Inference Steps") | |
dtype_input = gr.Dropdown(choices=["float16", "bfloat16"], value="bfloat16", label="Compute Dtype") | |
seed_input2 = gr.Number(value=42, label="Seed") | |
height_input = gr.Number(value=480, label="Height") | |
width_input = gr.Number(value=720, label="Width") | |
num_frames_input2 = gr.Number(value=49, label="Num Frames") | |
fps_input2 = gr.Number(value=24, label="FPS") | |
downscale_coef_input = gr.Number(value=8, label="Downscale Coef") | |
vae_channels_input = gr.Number(value=16, label="VAE Channels") | |
controlnet_input_channels_input = gr.Number(value=6, label="ControlNet Input Channels") | |
controlnet_layers_input = gr.Number(value=8, label="ControlNet Transformer Layers") | |
with gr.Column(): | |
step2_video = gr.Video(label="[Step 2] Final Refined Video") | |
step2_button = gr.Button("▶️ Run Step 2") | |
step2_logs = gr.Textbox(label="[Step 2] Logs") | |
    step1_button.click(
        get_anchor_video,
        inputs=[
            video_input, fps_input, num_frames_input, pose_input, mode_input,
            radius_input, near_far_estimated,
            sampler_input, diff_guidance_input, diff_steps_input,
            prompt_input, neg_prompt_input, refine_prompt_input,
            depth_steps_input, depth_guidance_input,
            window_input, overlap_input, maxres_input, sample_size,
            seed_input, height, width, aspect_ratio_inputs,
            init_dx, init_dy, init_dz,
        ],
        outputs=[step1_video, step1_logs],
    )

    step2_button.click(
        inference,
        inputs=[
            fps_input2, num_frames_input2,
            controlnet_weights_input, controlnet_guidance_start_input,
            controlnet_guidance_end_input, guidance_scale_input,
            inference_steps_input, dtype_input, seed_input2,
            height_input, width_input, downscale_coef_input,
            vae_channels_input, controlnet_input_channels_input,
            controlnet_layers_input,
        ],
        outputs=[step2_video, step2_logs],
    )
if __name__ == "__main__":
    download_models()
    demo.launch(server_name="0.0.0.0", server_port=7860)