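# Gradio demo for DynamiCrafter (576x1024): animates a single input image into a
# short video clip, conditioned on a text prompt. Korean prompts are detected and
# auto-translated to English before inference.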
import spaces
import gradio as gr
import os
import sys
import random
import time
from omegaconf import OmegaConf
import torch
import torchvision
from pytorch_lightning import seed_everything
from huggingface_hub import hf_hub_download
from einops import repeat
import torchvision.transforms as transforms
from utils.utils import instantiate_from_config
sys.path.insert(0, "scripts/evaluation")
from funcs import (
    batch_ddim_sampling,
    load_model_checkpoint,
    get_latent_z,
    save_videos
)
from transformers import pipeline

def download_model():
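    """Fetch the DynamiCrafter 1024 checkpoint from the Hugging Face Hub if it is missing locally."""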
    REPO_ID = 'Doubiiu/DynamiCrafter_1024'
    filename_list = ['model.ckpt']
    if not os.path.exists('./checkpoints/dynamicrafter_1024_v1/'):
        os.makedirs('./checkpoints/dynamicrafter_1024_v1/')
    for filename in filename_list:
        local_file = os.path.join('./checkpoints/dynamicrafter_1024_v1/', filename)
        if not os.path.exists(local_file):
            hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/dynamicrafter_1024_v1/', force_download=True)

download_model()
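
# Instantiate the model from the inference config and load the pretrained checkpoint onto the GPU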
ckpt_path = 'checkpoints/dynamicrafter_1024_v1/model.ckpt'
config_file = 'configs/inference_1024_v1.0.yaml'
config = OmegaConf.load(config_file)
model_config = config.pop("model", OmegaConf.create())
model_config['params']['unet_config']['params']['use_checkpoint'] = True
model = instantiate_from_config(model_config)
assert os.path.exists(ckpt_path), "Error: checkpoint not found!"
model = load_model_checkpoint(model, ckpt_path)
model.eval()
model = model.cuda()

# Load a Korean -> English translation model so prompts can be written in Korean
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")

@spaces.GPU(duration=300, gpu_type="h100")
def infer(image, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123):
    # If the prompt contains Hangul characters (Compatibility Jamo U+3131-U+318E
    # or syllables U+AC00-U+D7A3), translate it to English before conditioning
    if any('\u3131' <= char <= '\u318E' or '\uAC00' <= char <= '\uD7A3' for char in prompt):
        translated = translator(prompt, max_length=512)
        prompt = translated[0]['translation_text']
    
    resolution = (576, 1024)
    save_fps = 8
    seed_everything(seed)
    transform = transforms.Compose([
        transforms.Resize(min(resolution)),
        transforms.CenterCrop(resolution),
    ])
    torch.cuda.empty_cache()
    print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    start = time.time()
    # Cap the step count to bound GPU time; the UI slider only allows up to 50,
    # but direct API calls may pass larger values
    if steps > 60:
        steps = 60

    batch_size = 1
    channels = model.model.diffusion_model.out_channels
    frames = model.temporal_length
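    # Latent height/width are 8x smaller than the pixel resolution (VAE downsampling factor)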
    h, w = resolution[0] // 8, resolution[1] // 8
    noise_shape = [batch_size, channels, frames, h, w]

    with torch.no_grad(), torch.cuda.amp.autocast():
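        # Encode the text prompt into cross-attention conditioning embeddings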
        text_emb = model.get_learned_conditioning([prompt])
        
        img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
        img_tensor = (img_tensor / 255. - 0.5) * 2
        image_tensor_resized = transform(img_tensor)
        videos = image_tensor_resized.unsqueeze(0)
        
        # Encode the resized image into the latent space and tile that latent
        # across all frames as the per-frame image condition
        z = get_latent_z(model, videos.unsqueeze(2))  # b c t h w, with t=1
        img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)
    
        cond_images = model.embedder(img_tensor.unsqueeze(0))
        img_emb = model.image_proj_model(cond_images)
    
        # Concatenate text and image tokens into a single cross-attention context
        imtext_cond = torch.cat([text_emb, img_emb], dim=1)
    
        # fs is the frame-stride condition (exposed as the "FPS" slider in the UI)
        fs = torch.tensor([fs], dtype=torch.long, device=model.device)
        cond = {"c_crossattn": [imtext_cond], "fs": fs, "c_concat": [img_tensor_repeat]}
        
        batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
    
        # Write the sampled frames to ./output.mp4
        video_path = './output.mp4'
        save_videos(batch_samples, './', filenames=['output'], fps=save_fps)
    return video_path

i2v_examples = [
    ['prompts/1024/astronaut04.png', 'a man in an astronaut suit playing a guitar', 30, 7.5, 1.0, 6, 123],
    ['prompts/1024/bloom01.png', 'time-lapse of a blooming flower with leaves and a stem', 30, 7.5, 1.0, 10, 123],
    ['prompts/1024/girl07.png', 'a beautiful woman with long hair and a dress blowing in the wind', 30, 7.5, 1.0, 10, 123],
    ['prompts/1024/pour_bear.png', 'pouring beer into a glass of ice and beer', 30, 7.5, 1.0, 10, 123],
    ['prompts/1024/robot01.png', 'a robot is walking through a destroyed city', 30, 7.5, 1.0, 10, 123],
    ['prompts/1024/firework03.png', 'fireworks display', 30, 7.5, 1.0, 10, 123],
]

css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""

with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
    
    with gr.Tab(label='ImageAnimation_576x1024'):
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        i2v_input_image = gr.Image(label="Input Image", elem_id="input_img")
                    with gr.Row():
                        i2v_input_text = gr.Text(label='Prompts (Korean input supported)')
                    with gr.Row():
                        i2v_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
                        i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="i2v_eta")
                        i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.5, elem_id="i2v_cfg_scale")
                    with gr.Row():
                        i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
                        i2v_motion = gr.Slider(minimum=5, maximum=20, step=1, elem_id="i2v_motion", label="FPS", value=10)  # forwarded to infer() as the `fs` condition
                    i2v_end_btn = gr.Button("Generate")
                with gr.Row():
                    i2v_output_video = gr.Video(label="Generated Video", elem_id="output_vid", autoplay=True, show_share_button=True)

            gr.Examples(examples=i2v_examples,
                        inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed],
                        outputs=[i2v_output_video],
                        fn=infer,
                        cache_examples=True,
            )
        i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed],
                          outputs=[i2v_output_video],
                          fn=infer,
        )

dynamicrafter_iface.launch(server_port=7890, server_name="0.0.0.0", share=True)
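
# A minimal client-side sketch for driving this demo programmatically. This is an
# illustrative assumption, not part of the app: it presumes the `gradio_client`
# package is installed, the server above is reachable on localhost:7890, and that
# the installed gradio_client version accepts plain file paths for image inputs
# (newer versions may require gradio_client.handle_file). The fn_index of the
# Generate button may differ depending on how Gradio registers the events.
#
#   from gradio_client import Client
#   client = Client("http://localhost:7890/")
#   video_path = client.predict(
#       "prompts/1024/robot01.png",                     # input image
#       "a robot is walking through a destroyed city",  # prompt
#       30,    # sampling steps
#       7.5,   # CFG scale
#       1.0,   # eta
#       10,    # fs
#       123,   # seed
#       fn_index=1,
#   )
#   print(video_path)  # path to the generated MP4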