import argparse
import os
from datetime import datetime

import torch

from demo import GetAnchorVideos
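
# Command-line entry point for anchor-video generation through GetAnchorVideos
# (defined in demo.py). get_parser() collects general I/O, rendering, diffusion,
# and depth-estimation options; the __main__ block dispatches on --mode.
# Checkpoint defaults assume models staged under /app/pretrained/.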


def get_parser():
    parser = argparse.ArgumentParser()

    ## general
    parser.add_argument('--video_path', type=str, help='Input video path')
    parser.add_argument(
        '--out_dir', type=str, required=True, help='Output directory'
    )
    parser.add_argument(
        '--device', type=str, default='cuda:0', help='The device to use'
    )
    parser.add_argument(
        '--exp_name',
        type=str,
        default=None,
        help='Experiment name, use video file name by default',
    )
    parser.add_argument(
        '--save_name',
        type=str,
        default=None,
        help='Name for saved outputs, use video file name by default',
    )
    parser.add_argument(
        '--seed', type=int, default=43, help='Random seed for reproducibility'
    )
    parser.add_argument(
        '--video_length', type=int, default=49, help='Number of video frames'
    )
    parser.add_argument('--fps', type=int, default=10, help='FPS for the saved video')
    parser.add_argument(
        '--stride', type=int, default=1, help='Sampling stride for the input video'
    )
    parser.add_argument('--server_name', type=str, help='Server IP address')

    ## render
    parser.add_argument(
        '--radius_scale',
        type=float,
        default=1.0,
        help='Scale factor for the spherical radius',
    )
    parser.add_argument(
        '--camera', type=str, default='traj', help="'traj' or 'target'"
    )
    parser.add_argument(
        '--mode',
        type=str,
        default='gradual',
        help="'gradual', 'bullet', 'direct', 'image', 'start_end' or 'zoom'",
    )
    parser.add_argument(
        '--mask', action='store_true', default=False, help='Clean the point cloud if set'
    )
    parser.add_argument(
        '--traj_txt',
        type=str,
        help="Required for 'traj' camera: a txt file that specifies the camera trajectory",
    )
    parser.add_argument(
        '--target_pose',
        nargs=5,
        type=float,
        help="Required for 'target' camera: target camera pose, <theta phi r x y>",
    )
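    # Illustrative only: placeholder values following the <theta phi r x y>
    # layout documented above, not recommended settings:
    #   --camera target --target_pose 0.0 30.0 1.0 0.0 0.0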
    parser.add_argument(
        '--near', type=float, default=0.0001, help='Near clipping plane distance'
    )
    parser.add_argument(
        '--far', type=float, default=10000.0, help='Far clipping plane distance'
    )
    parser.add_argument('--height', type=int, default=1080, help='Height in pixels')
    parser.add_argument('--width', type=int, default=1920, help='Width in pixels')
    parser.add_argument(
        '--anchor_idx', type=int, default=0, help='Index of the ground-truth anchor frame'
    )
    parser.add_argument(
        '--near_far_estimated',
        # NOTE: argparse's type=bool treats any non-empty string (including
        # 'False') as True, so parse the string explicitly instead.
        type=lambda s: str(s).lower() in ('true', '1', 'yes'),
        default=True,
        help='Use estimated near and far values',
    )

    ## diffusion
    parser.add_argument(
        '--low_gpu_memory_mode',
        # Same argparse pitfall as above: type=bool would yield True for any
        # non-empty string, so parse the string explicitly.
        type=lambda s: str(s).lower() in ('true', '1', 'yes'),
        default=False,
        help='Enable low GPU memory mode',
    )
    # parser.add_argument('--model_name', type=str, default='checkpoints/CogVideoX-Fun-V1.1-5b-InP', help='Path to the model')
    parser.add_argument(
        '--model_name',
        type=str,
        default='/app/pretrained/CogVideoX-Fun-V1.1-5b-InP',
        help='Path to the model',
    )
    parser.add_argument(
        '--sampler_name',
        type=str,
        choices=["Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog", "DDIM_Origin"],
        default='DDIM_Origin',
        help='Choose the sampler',
    )
    # parser.add_argument('--transformer_path', type=str, default='checkpoints/TrajectoryCrafter/crosstransformer', help='Path to the pretrained transformer model')
    parser.add_argument(
        '--transformer_path',
        type=str,
        default="/app/pretrained/TrajectoryCrafter",
        help='Path to the pretrained transformer model',
    )
    parser.add_argument(
        '--sample_size',
        type=int,
        nargs=2,
        default=[384, 672],
        help='Sample size as [height, width]',
    )
    parser.add_argument(
        '--diffusion_guidance_scale',
        type=float,
        default=6.0,
        help='Guidance scale for inference',
    )
    parser.add_argument(
        '--diffusion_inference_steps',
        type=int,
        default=50,
        help='Number of inference steps',
    )
    parser.add_argument(
        '--prompt', type=str, default=None, help='Prompt for video generation'
    )
    parser.add_argument(
        '--negative_prompt',
        type=str,
        default="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid.",
        help='Negative prompt for video generation',
    )
    parser.add_argument(
        '--refine_prompt',
        type=str,
        default=". The video is of high quality, and the view is very clear. ",
        help='Suffix appended to the prompt to encourage higher quality',
    )
    parser.add_argument(
        '--qwen_path',
        type=str,
        default="/app/pretrained/Qwen2.5-VL-7B-Instruct",
        help='Path to the Qwen2.5-VL-7B-Instruct model',
    )

    ## depth
    # parser.add_argument('--unet_path', type=str, default='checkpoints/DepthCrafter', help='Path to the UNet model')
    parser.add_argument(
        '--unet_path',
        type=str,
        default="/app/pretrained/DepthCrafter",
        help='Path to the UNet model',
    )
    # parser.add_argument('--pre_train_path', type=str, default='checkpoints/stable-video-diffusion-img2vid-xt', help='Path to the pre-trained model')
    parser.add_argument(
        '--pre_train_path',
        type=str,
        default="/app/pretrained/stable-video-diffusion-img2vid",
        help='Path to the pre-trained model',
    )
    parser.add_argument(
        '--cpu_offload', type=str, default='model', help='CPU offload strategy'
    )
    parser.add_argument(
        '--depth_inference_steps', type=int, default=5, help='Number of inference steps'
    )
    parser.add_argument(
        '--depth_guidance_scale',
        type=float,
        default=1.0,
        help='Guidance scale for inference',
    )
    parser.add_argument(
        '--window_size', type=int, default=110, help='Window size for processing'
    )
    parser.add_argument(
        '--overlap', type=int, default=25, help='Overlap size for processing'
    )
    parser.add_argument(
        '--max_res', type=int, default=1024, help='Maximum resolution for processing'
    )
    parser.add_argument(
        '--target_aspect_ratio',
        type=int,
        nargs=2,
        default=None,
        help='Target aspect ratio as two integers',
    )
    # Initial x/y/z offsets (assumed to shift the starting camera position).
    parser.add_argument('--init_dx', type=float, default=0.0)
    parser.add_argument('--init_dy', type=float, default=0.0)
    parser.add_argument('--init_dz', type=float, default=0.0)
    return parser
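
# Illustrative invocation (script name and paths are placeholders, not tested
# settings; see the --target_pose note above for 'target' camera runs):
#   python infer.py --video_path input.mp4 --out_dir ./results \
#       --mode gradual --camera traj --traj_txt trajs/loop.txt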


if __name__ == "__main__":
    parser = get_parser()
    opts = parser.parse_args()
    opts.weight_dtype = torch.bfloat16
    pvd = GetAnchorVideos(opts)
    if opts.mode == 'gradual':
        pvd.infer_gradual(opts)
    elif opts.mode == 'direct':
        pvd.infer_direct(opts)
    elif opts.mode == 'bullet':
        pvd.infer_bullet(opts)
    elif opts.mode == 'image':
        pvd.infer_image(opts)
    elif opts.mode == 'start_end':
        pvd.infer_start_end(opts)
    elif opts.mode == 'zoom':
        pvd.infer_zoom(opts)
    else:
        # Fail loudly instead of exiting silently on an unrecognized mode.
        parser.error(f"Unknown --mode '{opts.mode}'")