# Spaces: Running on Zero
import gc
import os
from typing import Optional

import numpy as np
import torch
from diffusers import AutoencoderKLTemporalDecoder
from diffusers.training_utils import set_seed
from fire import Fire

from normalcrafter.normal_crafter_ppl import NormalCrafterPipeline
from normalcrafter.unet import DiffusersUNetSpatioTemporalConditionModelNormalCrafter
from normalcrafter.utils import vis_sequence_normal, save_video, read_video_frames
class DepthCrafterDemo:
    """Demo wrapper that runs ``NormalCrafterPipeline`` on videos.

    The class name is kept as-is for backward compatibility with existing
    callers even though the wrapped pipeline is ``NormalCrafterPipeline``.
    """

    def __init__(
        self,
        unet_path: str,
        pre_train_path: str,
        cpu_offload: Optional[str] = "model",
    ):
        """Load the model components and build ``self.pipe``.

        Args:
            unet_path: Location passed to ``from_pretrained`` for both the
                ``unet`` and the ``vae`` subfolders.
            pre_train_path: Checkpoint from which the remaining pipeline
                components are loaded.
            cpu_offload: ``"model"`` for model-level CPU offload,
                ``"sequential"`` for sequential CPU offload (slower, saves
                more memory), or ``None`` to move the pipeline to CUDA.

        Raises:
            ValueError: If ``cpu_offload`` is not one of the options above.
        """
        # Reject a bad option up front, before any weights are loaded.
        if cpu_offload not in (None, "model", "sequential"):
            raise ValueError(f"Unknown cpu offload option: {cpu_offload}")

        unet = DiffusersUNetSpatioTemporalConditionModelNormalCrafter.from_pretrained(
            unet_path,
            subfolder="unet",
            low_cpu_mem_usage=True,
        )
        vae = AutoencoderKLTemporalDecoder.from_pretrained(
            unet_path, subfolder="vae"
        )
        weight_dtype = torch.float16
        vae.to(dtype=weight_dtype)
        unet.to(dtype=weight_dtype)

        # Load the weights of the other components from the provided checkpoint.
        self.pipe = NormalCrafterPipeline.from_pretrained(
            pre_train_path,
            unet=unet,
            vae=vae,
            torch_dtype=weight_dtype,
            variant="fp16",
        )

        # To save memory the model can be offloaded to CPU, or even run
        # sequentially to save more memory.
        if cpu_offload == "sequential":
            # This is slower, but saves more memory.
            self.pipe.enable_sequential_cpu_offload()
        elif cpu_offload == "model":
            self.pipe.enable_model_cpu_offload()
        else:
            self.pipe.to("cuda")

        # xformers memory-efficient attention is best effort: report the
        # failure and continue without it.
        try:
            self.pipe.enable_xformers_memory_efficient_attention()
        except Exception as e:
            print(e)
            print("Xformers is not enabled")
        # self.pipe.enable_attention_slicing()

    def infer(
        self,
        video: str,
        save_folder: str = "./demo_output",
        window_size: int = 14,
        time_step_size: int = 10,
        process_length: int = 195,
        decode_chunk_size: int = 7,
        max_res: int = 1024,
        dataset: str = "open",
        target_fps: int = 15,
        seed: int = 42,
        save_npz: bool = False,
    ):
        """Run the pipeline on one video and write the result videos.

        Args:
            video: Path of the input video.
            save_folder: Directory that receives the output files.
            window_size: Forwarded to the pipeline call.
            time_step_size: Forwarded to the pipeline call.
            process_length: Forwarded to ``read_video_frames``.
            decode_chunk_size: Forwarded to the pipeline call.
            max_res: Forwarded to ``read_video_frames``.
            dataset: Accepted for interface compatibility; not used here.
            target_fps: Forwarded to ``read_video_frames``; the FPS value it
                returns is the one used when saving the videos.
            seed: Seed passed to ``set_seed``.
            save_npz: When true, also store the raw pipeline output as a
                compressed ``.npz`` next to the videos.

        Returns:
            ``[input_video_path, visualization_video_path]``.
        """
        set_seed(seed)
        frames, target_fps = read_video_frames(
            video,
            process_length,
            target_fps,
            max_res,
        )

        # Run the NormalCrafter pipeline without autograd bookkeeping.
        with torch.inference_mode():
            res = self.pipe(
                frames,
                decode_chunk_size=decode_chunk_size,
                time_step_size=time_step_size,
                window_size=window_size,
            ).frames[0]

        # Build the visualization of the predicted sequence.
        vis = vis_sequence_normal(res)

        # Outputs are named after the input video, inside save_folder.
        save_path = os.path.join(
            save_folder, os.path.splitext(os.path.basename(video))[0]
        )
        out_dir = os.path.dirname(save_path)
        # An empty save_folder means "current directory"; os.makedirs("")
        # would raise, so only create a real directory component.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        save_video(vis, save_path + "_vis.mp4", fps=target_fps)
        save_video(frames, save_path + "_input.mp4", fps=target_fps)
        if save_npz:
            # The archive key stays "depth" so existing readers keep working.
            np.savez_compressed(save_path + ".npz", depth=res)

        return [
            save_path + "_input.mp4",
            save_path + "_vis.mp4",
        ]

    def run(
        self,
        input_video,
        num_denoising_steps,
        guidance_scale,
        max_res=1024,
        process_length=195,
    ):
        """Process one video with default output settings and free caches.

        Args:
            input_video: Path of the input video.
            num_denoising_steps: Accepted for interface compatibility only;
                ``infer`` has no matching parameter, so it is not forwarded.
            guidance_scale: Accepted for interface compatibility only; not
                forwarded for the same reason.
            max_res: Forwarded to ``infer``.
            process_length: Forwarded to ``infer``.

        Returns:
            The first two paths returned by ``infer``.
        """
        # Forwarding the two unused values positionally would land them in
        # infer()'s save_folder and window_size parameters.
        res_path = self.infer(
            input_video,
            max_res=max_res,
            process_length=process_length,
        )
        # Clear the cache for the next video.
        gc.collect()
        torch.cuda.empty_cache()
        return res_path[:2]
def main(
    video_path: str,
    save_folder: str = "./demo_output",
    unet_path: str = "Yanrui95/NormalCrafter",
    pre_train_path: str = "stabilityai/stable-video-diffusion-img2vid-xt",
    process_length: int = -1,
    cpu_offload: str = "model",
    target_fps: int = -1,
    seed: int = 42,
    window_size: int = 14,
    time_step_size: int = 10,
    max_res: int = 1024,
    dataset: str = "open",
    save_npz: bool = False
):
    """Command-line entry: build the demo once, then process each video.

    ``video_path`` may name several videos separated by commas. Each one is
    passed to ``DepthCrafterDemo.infer`` with the same settings, and Python
    and CUDA caches are released after every video. All other arguments are
    forwarded unchanged to the demo constructor or to ``infer``.
    """
    demo = DepthCrafterDemo(
        unet_path=unet_path,
        pre_train_path=pre_train_path,
        cpu_offload=cpu_offload,
    )

    # The same keyword settings apply to every video in the batch.
    shared_options = dict(
        save_folder=save_folder,
        window_size=window_size,
        process_length=process_length,
        time_step_size=time_step_size,
        max_res=max_res,
        dataset=dataset,
        target_fps=target_fps,
        seed=seed,
        save_npz=save_npz,
    )

    # Video paths are separated by commas.
    for single_video in video_path.split(","):
        demo.infer(single_video, **shared_options)
        # Release cached memory before moving on to the next video.
        gc.collect()
        torch.cuda.empty_cache()
if __name__ == "__main__":
    # Script entry point: python-fire exposes main()'s parameters as
    # command-line flags.
    # The arguments that matter most for memory use are `cpu_offload`,
    # `max_res`, and `window_size`.
    Fire(main)