# EPiC / gradio_app.py
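"""Gradio front-end for EPiC cinematic camera control.

The app exposes a two-step pipeline as two tabs:
  Step 1 renders a masked anchor video for a target camera pose from an uploaded clip;
  Step 2 refines that anchor with CogVideoX-5b-I2V plus the EPiC ControlNet checkpoint.
Pretrained weights are downloaded on startup if they are missing.
"""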
import os
import subprocess
from datetime import datetime
from pathlib import Path
import gradio as gr
import numpy as np
# -----------------------------
# Setup paths and env
# -----------------------------
HF_HOME = "/app/hf_cache"
os.environ["HF_HOME"] = HF_HOME
os.environ["TRANSFORMERS_CACHE"] = HF_HOME
os.makedirs(HF_HOME, exist_ok=True)
PRETRAINED_DIR = "/app/pretrained"
os.makedirs(PRETRAINED_DIR, exist_ok=True)
# -----------------------------
# Step 0: Optional Model Download
# -----------------------------
def download_models():
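    """Download pretrained weights via download/download_models.sh if the RAFT checkpoint is missing."""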
expected_model = os.path.join(PRETRAINED_DIR, "RAFT/raft-things.pth")
if not Path(expected_model).exists():
print("⚙️ Downloading pretrained models...")
try:
subprocess.check_call(["bash", "download/download_models.sh"])
print("✅ Models downloaded.")
except subprocess.CalledProcessError as e:
print(f"❌ Model download failed: {e}")
else:
print("✅ Pretrained models already exist.")
# -----------------------------
# Step 1: Get Anchor Video
# -----------------------------
def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
radius_scale, near_far_estimated,
sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
prompt, negative_prompt, refine_prompt,
depth_inference_steps, depth_guidance_scale,
window_size, overlap, max_res, sample_size, seed_input, height, width):
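    """Step 1: generate the masked anchor video.

    Copies the uploaded clip to a fixed temp path, parses the target camera pose
    (five whitespace-separated values: θ φ r x y, e.g. "0 30 -0.6 0 0"), and calls
    inference/v2v_data/inference.py, which writes masked_videos/output.mp4 under
    /app/output_anchor. Returns (video_path, logs) for the Gradio outputs.
    """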
temp_input_path = "/app/temp_input.mp4"
output_dir = "/app/output_anchor"
video_output_path = f"{output_dir}/masked_videos/output.mp4"
    if video_path:
        os.system(f"cp '{video_path}' {temp_input_path}")
    else:
        return None, "❌ No input video provided. Please upload an MP4 before running Step 1."
    try:
        theta, phi, r, x, y = target_pose.strip().split()
    except ValueError:
        return None, "Invalid target pose format. Use five space-separated values: θ φ r x y"
logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"
    # Build the Step 1 command; height and width are forwarded to the script explicitly
command = [
"python", "/app/inference/v2v_data/inference.py",
"--video_path", temp_input_path,
"--stride", "1",
"--out_dir", output_dir,
"--radius_scale", str(radius_scale),
"--camera", "target",
"--mask",
"--target_pose", theta, phi, r, x, y,
"--video_length", str(num_frames),
"--save_name", "output",
"--mode", mode,
"--fps", str(fps),
"--depth_inference_steps", str(depth_inference_steps),
"--depth_guidance_scale", str(depth_guidance_scale),
# "--near_far_estimated", str(near_far_estimated),
"--sampler_name", sampler_name,
"--diffusion_guidance_scale", str(diffusion_guidance_scale),
"--diffusion_inference_steps", str(diffusion_inference_steps),
"--prompt", prompt if prompt else "",
"--negative_prompt", negative_prompt,
"--refine_prompt", refine_prompt,
"--window_size", str(window_size),
"--overlap", str(overlap),
"--max_res", str(max_res),
# "--sample_size", sample_size if sample_size else "384,672",
"--seed", str(seed_input),
"--height", str(height), # Fixed height
"--width", str(width)
]
try:
result = subprocess.run(command, capture_output=True, text=True, check=True)
logs += result.stdout
except subprocess.CalledProcessError as e:
logs += f"❌ Inference failed:\n{e.stderr}{e.stdout}"
return None, logs
return str(video_output_path), logs
# -----------------------------
# Step 2: Run Inference
# -----------------------------
def inference(
fps, num_frames, controlnet_weights, controlnet_guidance_start,
controlnet_guidance_end, guidance_scale, num_inference_steps, dtype,
seed, height, width, downscale_coef, vae_channels,
controlnet_input_channels, controlnet_transformer_num_layers
):
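    """Step 2: refine the Step 1 anchor video with CogVideoX.

    Runs inference/cli_demo_camera_i2v_pcd.py with the CogVideoX-5b-I2V base model and
    the EPiC ControlNet checkpoint on the videos under /app/output_anchor, writing the
    refined result to /app/output. Returns (video_path, logs) for the Gradio outputs.
    """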
MODEL_PATH = "/app/pretrained/CogVideoX-5b-I2V"
ckpt_path = "/app/out/EPiC_pretrained/checkpoint-500.pt"
video_root_dir = "/app/output_anchor"
out_dir = "/app/output"
command = [
"python", "/app/inference/cli_demo_camera_i2v_pcd.py",
"--video_root_dir", video_root_dir,
"--base_model_path", MODEL_PATH,
"--controlnet_model_path", ckpt_path,
"--output_path", out_dir,
"--controlnet_weights", str(controlnet_weights),
"--controlnet_guidance_start", str(controlnet_guidance_start),
"--controlnet_guidance_end", str(controlnet_guidance_end),
"--guidance_scale", str(guidance_scale),
"--num_inference_steps", str(num_inference_steps),
"--dtype", dtype,
"--seed", str(seed),
"--height", str(height),
"--width", str(width),
"--num_frames", str(num_frames),
"--fps", str(fps),
"--downscale_coef", str(downscale_coef),
"--vae_channels", str(vae_channels),
"--controlnet_input_channels", str(controlnet_input_channels),
"--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers)
]
try:
result = subprocess.run(command, capture_output=True, text=True, check=True)
logs = result.stdout
except subprocess.CalledProcessError as e:
logs = f"❌ Step 2 Inference Failed:\nSTDERR:\n{e.stderr}\nSTDOUT:\n{e.stdout}"
return None, logs
video_output = f"{out_dir}/00000_{seed}_out.mp4"
return video_output if os.path.exists(video_output) else None, logs
# -----------------------------
# UI
# -----------------------------
with gr.Blocks() as demo:
gr.Markdown("## 🎬 EPiC: Cinematic Camera Control")
with gr.Tabs():
with gr.TabItem("Step 1: Camera Anchor"):
with gr.Row():
with gr.Column():
with gr.Row():
                        near_far_estimated = gr.Checkbox(label="Near Far Estimation", value=True)  # not yet passed to the Step 1 command (see the commented-out --near_far_estimated flag)
pose_input = gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0")
fps_input = gr.Number(value=24, label="FPS")
num_frames_input = gr.Number(value=49, label="Number of Frames")
                        radius_input = gr.Number(value=1.0, label="Radius Scale")
mode_input = gr.Dropdown(choices=["gradual", "direct", "bullet"], value="gradual", label="Camera Mode")
sampler_input = gr.Dropdown(choices=["Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog", "DDIM_Origin"], value="DDIM_Origin", label="Sampler")
diff_guidance_input = gr.Number(value=6.0, label="Diffusion Guidance")
diff_steps_input = gr.Number(value=50, label="Diffusion Steps")
depth_steps_input = gr.Number(value=5, label="Depth Steps")
depth_guidance_input = gr.Number(value=1.0, label="Depth Guidance")
window_input = gr.Number(value=64, label="Window Size")
overlap_input = gr.Number(value=25, label="Overlap")
maxres_input = gr.Number(value=1920, label="Max Resolution")
sample_size = gr.Textbox(label="Sample Size (height, width)", placeholder="e.g., 384, 672", value="384, 672")
seed_input = gr.Number(value=43, label="Seed")
height = gr.Number(value=576, label="Height")
width = gr.Number(value=1024, label="Width")
prompt_input = gr.Textbox(label="Prompt")
neg_prompt_input = gr.Textbox(label="Negative Prompt", value="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory.")
refine_prompt_input = gr.Textbox(label="Refine Prompt", value=" The video is of high quality, and the view is very clear. ")
with gr.Column():
video_input = gr.Video(label="Upload Video (MP4)")
step1_button = gr.Button("▶️ Run Step 1")
step1_video = gr.Video(label="[Step 1] Masked Video")
step1_logs = gr.Textbox(label="[Step 1] Logs")
with gr.TabItem("Step 2: CogVideoX Refinement"):
with gr.Row():
with gr.Column():
with gr.Row():
controlnet_weights_input = gr.Number(value=0.5, label="ControlNet Weights")
controlnet_guidance_start_input = gr.Number(value=0.0, label="Guidance Start")
controlnet_guidance_end_input = gr.Number(value=0.5, label="Guidance End")
guidance_scale_input = gr.Number(value=6.0, label="Guidance Scale")
inference_steps_input = gr.Number(value=50, label="Num Inference Steps")
dtype_input = gr.Dropdown(choices=["float16", "bfloat16"], value="bfloat16", label="Compute Dtype")
seed_input2 = gr.Number(value=42, label="Seed")
height_input = gr.Number(value=480, label="Height")
width_input = gr.Number(value=720, label="Width")
num_frames_input2 = gr.Number(value=49, label="Num Frames")
fps_input2 = gr.Number(value=8, label="FPS")
downscale_coef_input = gr.Number(value=8, label="Downscale Coef")
vae_channels_input = gr.Number(value=16, label="VAE Channels")
controlnet_input_channels_input = gr.Number(value=6, label="ControlNet Input Channels")
controlnet_layers_input = gr.Number(value=8, label="ControlNet Transformer Layers")
with gr.Column():
step2_video = gr.Video(label="[Step 2] Final Refined Video")
step2_button = gr.Button("▶️ Run Step 2")
step2_logs = gr.Textbox(label="[Step 2] Logs")
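    # Wire the buttons to their callbacks; the order of each `inputs` list must match
    # the positional parameters of the function it drives.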
step1_button.click(
get_anchor_video,
inputs=[
video_input, fps_input, num_frames_input, pose_input, mode_input,
radius_input, near_far_estimated,
sampler_input, diff_guidance_input, diff_steps_input,
prompt_input, neg_prompt_input, refine_prompt_input,
depth_steps_input, depth_guidance_input,
window_input, overlap_input, maxres_input, sample_size, seed_input, height, width
],
outputs=[step1_video, step1_logs]
)
step2_button.click(
inference,
inputs=[
fps_input2, num_frames_input2,
controlnet_weights_input, controlnet_guidance_start_input,
controlnet_guidance_end_input, guidance_scale_input,
inference_steps_input, dtype_input, seed_input2,
height_input, width_input, downscale_coef_input,
vae_channels_input, controlnet_input_channels_input,
controlnet_layers_input
],
outputs=[step2_video, step2_logs]
)
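# On startup: fetch any missing pretrained weights, then serve the UI. Binding to
# 0.0.0.0 makes the app reachable from outside the container; 7860 is Gradio's default port.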
if __name__ == "__main__":
download_models()
demo.launch(server_name="0.0.0.0", server_port=7860)