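# Gradio demo for DynamiCrafter (576x1024): animates a single input image into a
# short video clip, conditioned on a text prompt. Korean prompts are detected and
# auto-translated to English before inference.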
import spaces
import gradio as gr
import os
import sys
import random
import time
from omegaconf import OmegaConf
import torch
import torchvision
from pytorch_lightning import seed_everything
from huggingface_hub import hf_hub_download
from einops import repeat
import torchvision.transforms as transforms
from utils.utils import instantiate_from_config
sys.path.insert(0, "scripts/evaluation")
from funcs import (
    batch_ddim_sampling,
    load_model_checkpoint,
    get_latent_z,
    save_videos
)
from transformers import pipeline

def download_model():
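    """Fetch the DynamiCrafter 1024 checkpoint from the Hugging Face Hub if it is missing locally."""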
    REPO_ID = 'Doubiiu/DynamiCrafter_1024'
    filename_list = ['model.ckpt']
    if not os.path.exists('./checkpoints/dynamicrafter_1024_v1/'):
        os.makedirs('./checkpoints/dynamicrafter_1024_v1/')
    for filename in filename_list:
        local_file = os.path.join('./checkpoints/dynamicrafter_1024_v1/', filename)
        if not os.path.exists(local_file):
            hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/dynamicrafter_1024_v1/', force_download=True)

download_model()
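
# Instantiate the model from the inference config and load the pretrained checkpoint onto the GPU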
ckpt_path = 'checkpoints/dynamicrafter_1024_v1/model.ckpt'
config_file = 'configs/inference_1024_v1.0.yaml'
config = OmegaConf.load(config_file)
model_config = config.pop("model", OmegaConf.create())
model_config['params']['unet_config']['params']['use_checkpoint'] = True
model = instantiate_from_config(model_config)
assert os.path.exists(ckpt_path), "Error: checkpoint not found!"
model = load_model_checkpoint(model, ckpt_path)
model.eval()
model = model.cuda()

# Load a Korean -> English translation model so prompts can be written in Korean
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")

@spaces.GPU(duration=300, gpu_type="h100")
def infer(image, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123):
    # If the prompt contains Hangul characters (Compatibility Jamo U+3131-U+318E
    # or syllables U+AC00-U+D7A3), translate it to English before conditioning
    if any('\u3131' <= char <= '\u318E' or '\uAC00' <= char <= '\uD7A3' for char in prompt):
        translated = translator(prompt, max_length=512)
        prompt = translated[0]['translation_text']
    
    resolution = (576, 1024)
    save_fps = 8
    seed_everything(seed)
    transform = transforms.Compose([
        transforms.Resize(min(resolution)),
        transforms.CenterCrop(resolution),
    ])
    torch.cuda.empty_cache()
    print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    start = time.time()
    # Cap the step count to bound GPU time; the UI slider only allows up to 50,
    # but direct API calls may pass larger values
    if steps > 60:
        steps = 60

    batch_size = 1
    channels = model.model.diffusion_model.out_channels
    frames = model.temporal_length
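    # Latent height/width are 8x smaller than the pixel resolution (VAE downsampling factor)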
    h, w = resolution[0] // 8, resolution[1] // 8
    noise_shape = [batch_size, channels, frames, h, w]

    with torch.no_grad(), torch.cuda.amp.autocast():
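        # Encode the text prompt into cross-attention conditioning embeddings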
        text_emb = model.get_learned_conditioning([prompt])
        
        img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
        img_tensor = (img_tensor / 255. - 0.5) * 2
        image_tensor_resized = transform(img_tensor)
        videos = image_tensor_resized.unsqueeze(0)
        
        # Encode the resized image into the latent space and tile that latent
        # across all frames as the per-frame image condition
        z = get_latent_z(model, videos.unsqueeze(2))  # b c t h w, with t=1
        img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)
    
        cond_images = model.embedder(img_tensor.unsqueeze(0))
        img_emb = model.image_proj_model(cond_images)
    
        # Concatenate text and image tokens into a single cross-attention context
        imtext_cond = torch.cat([text_emb, img_emb], dim=1)
    
        # fs is the frame-stride condition (exposed as the "FPS" slider in the UI)
        fs = torch.tensor([fs], dtype=torch.long, device=model.device)
        cond = {"c_crossattn": [imtext_cond], "fs": fs, "c_concat": [img_tensor_repeat]}
        
        batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
    
        # Write the sampled frames to ./output.mp4
        video_path = './output.mp4'
        save_videos(batch_samples, './', filenames=['output'], fps=save_fps)
    return video_path

i2v_examples = [
    ['prompts/1024/astronaut04.png', 'a man in an astronaut suit playing a guitar', 30, 7.5, 1.0, 6, 123],
    ['prompts/1024/bloom01.png', 'time-lapse of a blooming flower with leaves and a stem', 30, 7.5, 1.0, 10, 123],
    ['prompts/1024/girl07.png', 'a beautiful woman with long hair and a dress blowing in the wind', 30, 7.5, 1.0, 10, 123],
    ['prompts/1024/pour_bear.png', 'pouring beer into a glass of ice and beer', 30, 7.5, 1.0, 10, 123],
    ['prompts/1024/robot01.png', 'a robot is walking through a destroyed city', 30, 7.5, 1.0, 10, 123],
    ['prompts/1024/firework03.png', 'fireworks display', 30, 7.5, 1.0, 10, 123],
]

css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""

with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
    
    with gr.Tab(label='ImageAnimation_576x1024'):
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        i2v_input_image = gr.Image(label="Input Image", elem_id="input_img")
                    with gr.Row():
                        i2v_input_text = gr.Text(label='Prompts (Korean input supported)')
                    with gr.Row():
                        i2v_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
                        i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="i2v_eta")
                        i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.5, elem_id="i2v_cfg_scale")
                    with gr.Row():
                        i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
                        i2v_motion = gr.Slider(minimum=5, maximum=20, step=1, elem_id="i2v_motion", label="FPS", value=10)  # forwarded to infer() as the `fs` condition
                    i2v_end_btn = gr.Button("Generate")
                with gr.Row():
                    i2v_output_video = gr.Video(label="Generated Video", elem_id="output_vid", autoplay=True, show_share_button=True)

            gr.Examples(examples=i2v_examples,
                        inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed],
                        outputs=[i2v_output_video],
                        fn=infer,
                        cache_examples=True,
            )
        i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed],
                          outputs=[i2v_output_video],
                          fn=infer,
        )

dynamicrafter_iface.launch(server_port=7890, server_name="0.0.0.0", share=True)
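
# A minimal client-side sketch for driving this demo programmatically. This is an
# illustrative assumption, not part of the app: it presumes the `gradio_client`
# package is installed, the server above is reachable on localhost:7890, and that
# the installed gradio_client version accepts plain file paths for image inputs
# (newer versions may require gradio_client.handle_file). The fn_index of the
# Generate button may differ depending on how Gradio registers the events.
#
#   from gradio_client import Client
#   client = Client("http://localhost:7890/")
#   video_path = client.predict(
#       "prompts/1024/robot01.png",                     # input image
#       "a robot is walking through a destroyed city",  # prompt
#       30,    # sampling steps
#       7.5,   # CFG scale
#       1.0,   # eta
#       10,    # fs
#       123,   # seed
#       fn_index=1,
#   )
#   print(video_path)  # path to the generated MP4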