dynamcraf2

Sleeping

File size: 6,736 Bytes

07e35a2
15d6d88
4afdd4d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01a0b3d
4afdd4d
 
28fb111
4afdd4d
28fb111
 
4afdd4d
28fb111
4afdd4d
28fb111
be0fea4
 
 
 
 
 
01a0b3d
be0fea4
 
 
 
 
 
01a0b3d
 
 
07e35a2
4afdd4d
07e35a2
 
 
 
 
 
 
 
 
 
 
 
4afdd4d
07e35a2
 
 
 
 
4afdd4d
07e35a2
 
 
 
 
4afdd4d
07e35a2
 
 
 
 
 
 
 
 
 
915ecc0
07e35a2
 
b5d93b2
07e35a2
b5d93b2
07e35a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4afdd4d
eec49a2
695a07e
090b2f9
eec49a2
4afdd4d
28fb111
4afdd4d
 
 
28fb111
4afdd4d
 
 
 
 
 
07e35a2
4afdd4d
 
 
 
 
87bc4ad
4de476c
4afdd4d
 
 
 
 
 
 
 
07e35a2
4afdd4d
 
 
 
 
 
07e35a2

# -*- coding: utf-8 -*-
import spaces
import gradio as gr
import os
import sys
import random
import time
from omegaconf import OmegaConf
import torch
import torchvision
from pytorch_lightning import seed_everything
from huggingface_hub import hf_hub_download
from einops import repeat
import torchvision.transforms as transforms
from utils.utils import instantiate_from_config
sys.path.insert(0, "scripts/evaluation")
from funcs import (
    batch_ddim_sampling,
    load_model_checkpoint,
    get_latent_z,
    save_videos
)
from transformers import pipeline

def download_model():
    """Make sure the DynamiCrafter 1024 checkpoint files exist locally.

    Creates ``./checkpoints/dynamicrafter_1024_v1/`` when it is missing, then
    fetches each required file that is not already present in that directory
    from the ``Doubiiu/DynamiCrafter_1024`` Hugging Face repository.
    Files that already exist on disk are left untouched.
    """
    repo = 'Doubiiu/DynamiCrafter_1024'
    target_dir = './checkpoints/dynamicrafter_1024_v1/'
    required_files = ('model.ckpt',)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for name in required_files:
        if os.path.exists(os.path.join(target_dir, name)):
            # Already on disk; nothing to fetch for this file.
            continue
        hf_hub_download(
            repo_id=repo,
            filename=name,
            local_dir=target_dir,
            force_download=True,
        )

# Module-level setup: runs once at import time, before the UI is built.
# Make sure the checkpoint file is on disk before it is loaded below.
download_model()
ckpt_path='checkpoints/dynamicrafter_1024_v1/model.ckpt'
config_file='configs/inference_1024_v1.0.yaml'
config = OmegaConf.load(config_file)
# Take the "model" section out of the loaded config (empty config if absent).
model_config = config.pop("model", OmegaConf.create())
# Force the UNet's use_checkpoint flag on.
# NOTE(review): presumably this enables activation checkpointing to lower
# memory use -- confirm against the UNet implementation.
model_config['params']['unet_config']['params']['use_checkpoint']=True
model = instantiate_from_config(model_config)
assert os.path.exists(ckpt_path), "Error: checkpoint Not Found!"
model = load_model_checkpoint(model, ckpt_path)
# Inference only: switch to eval mode and move the model to the GPU.
model.eval()
model = model.cuda()

# Load the translation model (Korean -> English, per the model name);
# infer() uses it to translate prompts that contain Hangul.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")

def _contains_korean(text):
    """Return True when *text* contains at least one Hangul character.

    Checks the Hangul Compatibility Jamo range (U+3131-U+318E) and the
    Hangul Syllables range (U+AC00-U+D7A3).
    """
    return any(
        '\u3131' <= char <= '\u318E' or '\uAC00' <= char <= '\uD7A3'
        for char in text
    )


def _generate_video(image, prompt, steps, cfg_scale, eta, fs, seed):
    """Run one image-to-video sampling pass and return the saved video path.

    Kept separate from ``infer`` so that all intermediate tensors are locals
    of this frame only. Once this function has returned (or its exception has
    been handled by the caller) nothing in ``infer`` still references them,
    so the caller's ``torch.cuda.empty_cache()`` is not blocked by leftover
    locals on either the success or the failure path.
    """
    resolution = (576, 1024)
    save_fps = 8
    seed_everything(seed)
    transform = transforms.Compose([
        transforms.Resize(min(resolution), antialias=True),
        transforms.CenterCrop(resolution),
    ])

    batch_size = 1
    channels = model.model.diffusion_model.out_channels
    frames = model.temporal_length
    # Latent grid is 1/8 of the pixel resolution in each spatial dimension.
    h, w = resolution[0] // 8, resolution[1] // 8
    noise_shape = [batch_size, channels, frames, h, w]

    with torch.no_grad(), torch.cuda.amp.autocast():
        text_emb = model.get_learned_conditioning([prompt])

        # HWC array -> CHW float tensor, rescaled from [0, 255] to [-1, 1].
        img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
        img_tensor = (img_tensor / 255. - 0.5) * 2
        image_tensor_resized = transform(img_tensor)
        videos = image_tensor_resized.unsqueeze(0)

        # Encode the single resized frame and repeat it along the time axis.
        z = get_latent_z(model, videos.unsqueeze(2))
        img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)

        # The image embedder receives the un-resized tensor, as in the original code.
        cond_images = model.embedder(img_tensor.unsqueeze(0))
        img_emb = model.image_proj_model(cond_images)

        imtext_cond = torch.cat([text_emb, img_emb], dim=1)

        fs_tensor = torch.tensor([fs], dtype=torch.long, device=model.device)
        cond = {"c_crossattn": [imtext_cond], "fs": fs_tensor, "c_concat": [img_tensor_repeat]}

        batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)

        # NOTE(review): the returned path assumes save_videos writes
        # '<dir>/<filename>.mp4', exactly as the original code assumed.
        save_videos(batch_samples, './', filenames=['output'], fps=save_fps)

    return './output.mp4'


@spaces.GPU(duration=300, gpu_type="l40s")
def infer(image, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123):
    """Animate *image* according to *prompt* and return the video file path.

    Args:
        image: Input image as an HWC numpy array (what ``gr.Image`` supplies).
        prompt: Text prompt; if it contains Hangul it is first run through
            the module-level ``translator`` pipeline.
        steps: Number of DDIM sampling steps; values above 60 are capped at 60.
        cfg_scale: Guidance scale forwarded to ``batch_ddim_sampling``.
        eta: DDIM eta forwarded to ``batch_ddim_sampling``.
        fs: Value placed in the conditioning dict under ``"fs"`` (the UI
            exposes it as the "FPS" slider).
        seed: Seed passed to ``seed_everything``.

    Returns:
        ``'./output.mp4'`` on success, or ``None`` if no image was supplied
        or any error occurred (the error is printed, not raised).
    """
    import traceback

    try:
        if image is None:
            # Fail with a readable message instead of an opaque from_numpy TypeError.
            print("Error occurred: no input image provided")
            return None

        # Detect Korean input and translate it.
        if _contains_korean(prompt):
            translated = translator(prompt, max_length=512)
            prompt = translated[0]['translation_text']

        print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S'))
        start = time.time()
        steps = min(steps, 60)

        video_path = _generate_video(image, prompt, steps, cfg_scale, eta, fs, seed)
        print(f"done: {time.time() - start:.2f}s")
        return video_path
    except Exception as e:
        # Top-level boundary for the UI callback: report the failure and
        # return None rather than propagating the exception.
        print(f"Error occurred: {e}")
        traceback.print_exc()
        return None
    finally:
        # Runs on every path; by now the helper's tensors are no longer
        # referenced by this function's locals.
        torch.cuda.empty_cache()

# Example rows for the gr.Examples widget. Column order matches the `inputs`
# list passed to gr.Examples further down:
# image path, prompt, sampling steps, CFG scale, eta, FPS slider value, seed.
# The first prompt is Korean ("a man playing a guitar in an astronaut suit").
i2v_examples = [
    ['prompts/1024/astronaut04.png', '우주인 복장으로 기타를 치는 남자', 30, 7.5, 1.0, 6, 123],
    ['prompts/1024/bloom01.png', 'time-lapse of a blooming flower with leaves and a stem', 30, 7.5, 1.0, 10, 123],
]

# Size limits for the elements with elem_id "input_img" and "output_vid".
css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""

# Build the Gradio UI: one tab with the input controls, a Generate button,
# the output video player and a list of clickable examples.
with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
    
    with gr.Tab(label='ImageAnimation_576x1024'):
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        i2v_input_image = gr.Image(label="Input Image",elem_id="input_img")
                    with gr.Row():
                        # Label reads "Prompts (Korean input supported)".
                        i2v_input_text = gr.Textbox(label='Prompts (한글 입력 가능)')
                    with gr.Row():
                        i2v_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
                        i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="i2v_eta")
                        i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.5, elem_id="i2v_cfg_scale")
                    with gr.Row():
                        i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
                        # Labelled "FPS" in the UI; passed to infer() as its `fs` argument.
                        i2v_motion = gr.Slider(minimum=5, maximum=20, step=1, elem_id="i2v_motion", label="FPS", value=10)
                    i2v_end_btn = gr.Button("Generate")
                with gr.Row():
                    i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)

            # The inputs order here must match both the columns of
            # i2v_examples and the positional parameters of infer().
            gr.Examples(examples=i2v_examples,
                        inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed],
                        outputs=[i2v_output_video],
                        fn = infer,
                        cache_examples=False  # Set to False to disable example caching
            )
        # Clicking "Generate" calls infer() with the current control values
        # and shows the returned file in the video player.
        i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed],
                        outputs=[i2v_output_video],
                        fn = infer
        )

# Start the server on port 7890, listening on all interfaces, with a share link requested.
dynamicrafter_iface.launch(server_port=7890, server_name="0.0.0.0", share=True)