Spaces:
Running
Running
import gradio as gr | |
from gradio_toggle import Toggle | |
import torch | |
from huggingface_hub import snapshot_download | |
from transformers import pipeline | |
from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder | |
from xora.models.transformers.transformer3d import Transformer3DModel | |
from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier | |
from xora.schedulers.rf import RectifiedFlowScheduler | |
from xora.pipelines.pipeline_xora_video import XoraVideoPipeline | |
from transformers import T5EncoderModel, T5Tokenizer | |
from xora.utils.conditioning_method import ConditioningMethod | |
from pathlib import Path | |
import safetensors.torch | |
import json | |
import numpy as np | |
import cv2 | |
from PIL import Image | |
import tempfile | |
import os | |
import gc | |
from openai import OpenAI | |
import re | |
# Load system prompts | |
system_prompt_t2v = """๋น์ ์ ๋น๋์ค ์์ฑ์ ์ํ ํ๋กฌํํธ ์ ๋ฌธ๊ฐ์ ๋๋ค. | |
์ฃผ์ด์ง ํ๋กฌํํธ๋ฅผ ๋ค์ ๊ตฌ์กฐ์ ๋ง๊ฒ ๊ฐ์ ํด์ฃผ์ธ์: | |
1. ์ฃผ์ ๋์์ ๋ช ํํ ํ ๋ฌธ์ฅ์ผ๋ก ์์ | |
2. ๊ตฌ์ฒด์ ์ธ ๋์๊ณผ ์ ์ค์ฒ๋ฅผ ์๊ฐ ์์๋๋ก ์ค๋ช | |
3. ์บ๋ฆญํฐ/๊ฐ์ฒด์ ์ธ๋ชจ๋ฅผ ์์ธํ ๋ฌ์ฌ | |
4. ๋ฐฐ๊ฒฝ๊ณผ ํ๊ฒฝ ์ธ๋ถ ์ฌํญ์ ๊ตฌ์ฒด์ ์ผ๋ก ํฌํจ | |
5. ์นด๋ฉ๋ผ ๊ฐ๋์ ์์ง์์ ๋ช ์ | |
6. ์กฐ๋ช ๊ณผ ์์์ ์์ธํ ์ค๋ช | |
7. ๋ณํ๋ ๊ฐ์์ค๋ฌ์ด ์ฌ๊ฑด์ ์์ฐ์ค๋ฝ๊ฒ ํฌํจ | |
๋ชจ๋ ์ค๋ช ์ ํ๋์ ์์ฐ์ค๋ฌ์ด ๋ฌธ๋จ์ผ๋ก ์์ฑํ๊ณ , | |
์ดฌ์ ๊ฐ๋ ์ด ์ดฌ์ ๋ชฉ๋ก์ ์ค๋ช ํ๋ ๊ฒ์ฒ๋ผ ๊ตฌ์ฒด์ ์ด๊ณ ์๊ฐ์ ์ผ๋ก ์์ฑํ์ธ์. | |
200๋จ์ด๋ฅผ ๋์ง ์๋๋ก ํ๋, ์ต๋ํ ์์ธํ๊ฒ ์์ฑํ์ธ์.""" | |
system_prompt_i2v = """๋น์ ์ ์ด๋ฏธ์ง ๊ธฐ๋ฐ ๋น๋์ค ์์ฑ์ ์ํ ํ๋กฌํํธ ์ ๋ฌธ๊ฐ์ ๋๋ค. | |
์ฃผ์ด์ง ํ๋กฌํํธ๋ฅผ ๋ค์ ๊ตฌ์กฐ์ ๋ง๊ฒ ๊ฐ์ ํด์ฃผ์ธ์: | |
1. ์ฃผ์ ๋์์ ๋ช ํํ ํ ๋ฌธ์ฅ์ผ๋ก ์์ | |
2. ๊ตฌ์ฒด์ ์ธ ๋์๊ณผ ์ ์ค์ฒ๋ฅผ ์๊ฐ ์์๋๋ก ์ค๋ช | |
3. ์บ๋ฆญํฐ/๊ฐ์ฒด์ ์ธ๋ชจ๋ฅผ ์์ธํ ๋ฌ์ฌ | |
4. ๋ฐฐ๊ฒฝ๊ณผ ํ๊ฒฝ ์ธ๋ถ ์ฌํญ์ ๊ตฌ์ฒด์ ์ผ๋ก ํฌํจ | |
5. ์นด๋ฉ๋ผ ๊ฐ๋์ ์์ง์์ ๋ช ์ | |
6. ์กฐ๋ช ๊ณผ ์์์ ์์ธํ ์ค๋ช | |
7. ๋ณํ๋ ๊ฐ์์ค๋ฌ์ด ์ฌ๊ฑด์ ์์ฐ์ค๋ฝ๊ฒ ํฌํจ | |
๋ชจ๋ ์ค๋ช ์ ํ๋์ ์์ฐ์ค๋ฌ์ด ๋ฌธ๋จ์ผ๋ก ์์ฑํ๊ณ , | |
์ดฌ์ ๊ฐ๋ ์ด ์ดฌ์ ๋ชฉ๋ก์ ์ค๋ช ํ๋ ๊ฒ์ฒ๋ผ ๊ตฌ์ฒด์ ์ด๊ณ ์๊ฐ์ ์ผ๋ก ์์ฑํ์ธ์. | |
200๋จ์ด๋ฅผ ๋์ง ์๋๋ก ํ๋, ์ต๋ํ ์์ธํ๊ฒ ์์ฑํ์ธ์.""" | |
# Credentials are read from the environment (Space secrets); both are optional.
hf_token = os.getenv("HF_TOKEN")
openai_api_key = os.getenv("OPENAI_API_KEY")
# Only build the OpenAI client when a key is configured. Constructing it with
# no key fails at import time (see the openai-python client docs), which would
# take the whole app down even though prompt enhancement is optional.
# With `client` set to None, enhance_prompt() hits its own `except Exception`
# branch and simply returns the prompt unchanged.
client = OpenAI(api_key=openai_api_key) if openai_api_key else None
# Korean -> English translation model used before prompts reach the video model.
# NOTE: `pipeline` here is the transformers factory imported at the top of the
# file; the same name is rebound to the video pipeline further down, so this
# call has to stay above that assignment.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
# Korean text detection function | |
# Hangul compatibility jamo (consonants U+3131-U+314E, vowels U+314F-U+3163)
# and precomposed syllables (U+AC00-U+D7A3). Written with escapes so the
# character class cannot be corrupted by a re-encoding of this source file,
# and compiled once instead of on every call.
_KOREAN_PATTERN = re.compile("[\u3131-\u314e\u314f-\u3163\uac00-\ud7a3]")


def contains_korean(text):
    """Return True when *text* contains at least one Hangul character.

    Args:
        text: The string to inspect.

    Returns:
        bool: True if any Hangul jamo or syllable occurs in *text*,
        False otherwise (including for the empty string).
    """
    return bool(_KOREAN_PATTERN.search(text))
def translate_korean_prompt(prompt):
    """Return an English version of *prompt* when it contains Korean text.

    Prompts without Korean characters (as judged by ``contains_korean``) are
    returned untouched. Otherwise the module-level ``translator`` is invoked
    and both the original and the translation are printed for the logs.
    """
    if not contains_korean(prompt):
        return prompt

    # The translator returns a list of result dicts; only the first is used.
    results = translator(prompt)
    english = results[0]['translation_text']
    print(f"Original Korean prompt: {prompt}")
    print(f"Translated English prompt: {english}")
    return english
def enhance_prompt(prompt, type="t2v"):
    """Ask the chat model to rewrite *prompt* following the system prompt.

    Args:
        prompt: The user's prompt text.
        type: "t2v" selects the text-to-video system prompt; any other value
            selects the image-to-video one.

    Returns:
        The stripped model reply, or *prompt* unchanged if anything in the
        request or reply handling raises.
    """
    if type == "t2v":
        system_prompt = system_prompt_t2v
    else:
        system_prompt = system_prompt_i2v

    try:
        completion = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            max_tokens=2000,
        )
        enhanced_prompt = completion.choices[0].message.content.strip()

        # Log both versions so the rewrite can be inspected afterwards.
        for line in (
            "\n=== ํ๋กฌํํธ ์ฆ๊ฐ ๊ฒฐ๊ณผ ===",
            "Original Prompt:",
            prompt,
            "\nEnhanced Prompt:",
            enhanced_prompt,
            "========================\n",
        ):
            print(line)
        return enhanced_prompt
    except Exception as e:
        # Best effort: enhancement failures must not block generation.
        print(f"Error during prompt enhancement: {e}")
        return prompt
def update_prompt_t2v(prompt, enhance_toggle):
    """Toggle handler for the text-to-video tab; delegates to update_prompt."""
    return update_prompt(prompt, enhance_toggle, type="t2v")
def update_prompt_i2v(prompt, enhance_toggle):
    """Toggle handler for the image-to-video tab; delegates to update_prompt."""
    return update_prompt(prompt, enhance_toggle, type="i2v")
def update_prompt(prompt, enhance_toggle, type="t2v"):
    """Return the enhanced prompt when the toggle is on, else *prompt* as-is.

    Args:
        prompt: Current prompt text.
        enhance_toggle: Truthy to request enhancement.
        type: Forwarded to ``enhance_prompt`` to pick the system prompt.
    """
    if not enhance_toggle:
        return prompt
    return enhance_prompt(prompt, type)
# Local directory holding the LTX-Video checkpoint; downloaded only when the
# directory does not exist yet.
model_path = "asset"
if not Path(model_path).exists():
    snapshot_download(
        repo_id="Lightricks/LTX-Video",
        repo_type="model",
        local_dir=model_path,
        token=hf_token,
    )

# Sub-directories of the checkpoint, one per component loaded below.
vae_dir = Path(model_path) / "vae"
unet_dir = Path(model_path) / "unet"
scheduler_dir = Path(model_path) / "scheduler"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def load_vae(vae_dir):
    """Build the causal video autoencoder from *vae_dir* and load its weights.

    Reads ``config.json`` and ``vae_diffusion_pytorch_model.safetensors`` from
    *vae_dir* and returns the model moved to the module-level ``device`` in
    bfloat16.
    """
    config_file = vae_dir / "config.json"
    weights_file = vae_dir / "vae_diffusion_pytorch_model.safetensors"

    with open(config_file, "r") as handle:
        config = json.load(handle)

    model = CausalVideoAutoencoder.from_config(config)
    model.load_state_dict(safetensors.torch.load_file(weights_file))
    return model.to(device=device, dtype=torch.bfloat16)
def load_unet(unet_dir):
    """Build the 3D transformer from *unet_dir* and load its weights strictly.

    Reads ``config.json`` and ``unet_diffusion_pytorch_model.safetensors`` from
    *unet_dir* and returns the model moved to the module-level ``device`` in
    bfloat16.
    """
    config = Transformer3DModel.load_config(unet_dir / "config.json")
    model = Transformer3DModel.from_config(config)

    weights = safetensors.torch.load_file(
        unet_dir / "unet_diffusion_pytorch_model.safetensors"
    )
    # strict=True: any missing or unexpected key is an error.
    model.load_state_dict(weights, strict=True)
    return model.to(device=device, dtype=torch.bfloat16)
def load_scheduler(scheduler_dir):
    """Create the rectified-flow scheduler from ``scheduler_config.json``."""
    config = RectifiedFlowScheduler.load_config(
        scheduler_dir / "scheduler_config.json"
    )
    return RectifiedFlowScheduler.from_config(config)
# Helper function for image processing | |
def center_crop_and_resize(frame, target_height, target_width):
    """Center-crop *frame* to the target aspect ratio, then resize it.

    Args:
        frame: Array whose ``shape`` unpacks to (height, width, channels).
        target_height: Output height in pixels.
        target_width: Output width in pixels.

    Returns:
        The cropped frame resized with ``cv2.resize`` to
        (target_width, target_height).
    """
    src_h, src_w, _ = frame.shape
    wanted_ratio = target_width / target_height

    if src_w / src_h > wanted_ratio:
        # Source is too wide: trim equal margins from left and right.
        crop_w = int(src_h * wanted_ratio)
        left = (src_w - crop_w) // 2
        region = frame[:, left : left + crop_w]
    else:
        # Source is too tall (or already matches): trim top and bottom.
        crop_h = int(src_w / wanted_ratio)
        top = (src_h - crop_h) // 2
        region = frame[top : top + crop_h, :]

    return cv2.resize(region, (target_width, target_height))
def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
    """Load an image file as a normalized 5-D tensor for conditioning.

    The image is opened as RGB, center-cropped and resized to the target size,
    moved to channel-first layout, and scaled from [0, 255] to [-1, 1]. Two
    singleton axes are then inserted at positions 0 and 2.
    """
    rgb = np.array(Image.open(image_path).convert("RGB"))
    resized = center_crop_and_resize(rgb, target_height, target_width)

    channel_first = torch.tensor(resized).permute(2, 0, 1).float()
    normalized = (channel_first / 127.5) - 1.0
    return normalized.unsqueeze(0).unsqueeze(2)
# Load models
# Everything below runs once at import time and stays resident for the life of
# the process.
vae = load_vae(vae_dir)
unet = load_unet(unet_dir)
scheduler = load_scheduler(scheduler_dir)
patchifier = SymmetricPatchifier(patch_size=1)
# Text encoder and tokenizer come from the PixArt-alpha repository rather than
# from the local checkpoint directory.
text_encoder = T5EncoderModel.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
).to(device)
tokenizer = T5Tokenizer.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
)
# NOTE(review): this rebinds the module-level name `pipeline`, which until this
# point referred to the `transformers.pipeline` factory imported at the top of
# the file. Any later call expecting the factory will get the video pipeline.
pipeline = XoraVideoPipeline(
    transformer=unet,
    patchifier=patchifier,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    scheduler=scheduler,
    vae=vae,
).to(device)
# Preset options for resolution and frame configuration.
# Each entry is (width, height, num_frames); the dropdown label is derived
# from those three numbers.
_PRESET_SHAPES = (
    (1216, 704, 41),
    (1088, 704, 49),
    (1056, 640, 57),
    (992, 608, 65),
    (896, 608, 73),
    (896, 544, 81),
    (832, 544, 89),
    (800, 512, 97),
    (768, 512, 97),
    (800, 480, 105),
    (736, 480, 113),
    (704, 480, 121),
    (704, 448, 129),
    (672, 448, 137),
    (640, 416, 153),
    (672, 384, 161),
    (640, 384, 169),
    (608, 384, 177),
    (576, 384, 185),
    (608, 352, 193),
    (576, 352, 201),
    (544, 352, 209),
    (512, 352, 225),
    (512, 352, 233),
    (544, 320, 241),
    (512, 320, 249),
    (512, 320, 257),
)
preset_options = [
    {
        "label": f"{w}x{h}, {n} frames",
        "width": w,
        "height": h,
        "num_frames": n,
    }
    for w, h, n in _PRESET_SHAPES
]
def preset_changed(preset):
    """Translate a preset label into updates for the three size sliders.

    Both preset dropdowns wire this handler to exactly three outputs (the
    height, width and frame-count sliders), so exactly three updates are
    returned, each carrying the new value together with the visibility flag.
    The previous version returned six values (three values followed by three
    visibility updates) for those three outputs.

    Args:
        preset: The label chosen in the dropdown, or "Custom".

    Returns:
        tuple: Three ``gr.update`` objects in (height, width, num_frames)
        order. For a known preset the sliders receive the preset's numbers
        and are hidden. For "Custom" -- or a label that matches no preset --
        the sliders are shown and keep their current values.
    """
    # A default of None avoids a bare StopIteration on an unknown label.
    selected = next(
        (item for item in preset_options if item["label"] == preset), None
    )

    if preset == "Custom" or selected is None:
        return (
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True),
        )

    return (
        gr.update(value=selected["height"], visible=False),
        gr.update(value=selected["width"], visible=False),
        gr.update(value=selected["num_frames"], visible=False),
    )
def generate_video_from_text(
    prompt="",
    enhance_prompt_toggle=False,
    negative_prompt="",
    frame_rate=25,
    seed=171198,
    num_inference_steps=30,
    guidance_scale=3,
    height=512,
    width=768,
    num_frames=121,
    progress=gr.Progress(),
):
    """Generate an MP4 from a text prompt and return the file path.

    Args:
        prompt: Description of the video; must be at least 50 characters
            after stripping whitespace.
        enhance_prompt_toggle: Accepted because the UI passes it; not read
            inside this function.
        negative_prompt: Things the video should avoid.
        frame_rate: Frames per second, used for generation and for the file.
        seed: Seed for the CPU random generator.
        num_inference_steps: Number of denoising steps.
        guidance_scale: Guidance strength passed to the pipeline.
        height: Requested frame height in pixels.
        width: Requested frame width in pixels.
        num_frames: Requested number of frames.
        progress: Gradio progress tracker, advanced once per step.

    Returns:
        str: Path of the written ``.mp4`` file.

    Raises:
        gr.Error: If the prompt is too short or the pipeline call fails.
    """
    if len(prompt.strip()) < 50:
        raise gr.Error(
            "ํ๋กฌํํธ๋ ์ต์ 50์ ์ด์์ด์ด์ผ ํฉ๋๋ค. ๋ ์์ธํ ์ค๋ช ์ ์ ๊ณตํด์ฃผ์ธ์.",
            duration=5,
        )

    # Translate Korean prompts to English
    prompt = translate_korean_prompt(prompt)
    negative_prompt = translate_korean_prompt(negative_prompt)

    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
        "media_items": None,
    }
    generator = torch.Generator(device="cpu").manual_seed(seed)

    def gradio_progress_callback(self, step, timestep, kwargs):
        progress((step + 1) / num_inference_steps)

    try:
        with torch.no_grad():
            images = pipeline(
                num_inference_steps=num_inference_steps,
                num_images_per_prompt=1,
                guidance_scale=guidance_scale,
                generator=generator,
                output_type="pt",
                height=height,
                width=width,
                num_frames=num_frames,
                frame_rate=frame_rate,
                **sample,
                is_video=True,
                vae_per_channel_normalize=True,
                conditioning_method=ConditioningMethod.UNCONDITIONAL,
                mixed_precision=True,
                callback_on_step_end=gradio_progress_callback,
            ).images
    except Exception as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(
            f"๋น๋์ค ์์ฑ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ๋ค์ ์๋ํด์ฃผ์ธ์. ์ค๋ฅ: {e}",
            duration=5,
        ) from e
    finally:
        torch.cuda.empty_cache()
        gc.collect()

    # mkstemp creates the file atomically; tempfile.mktemp only returns a
    # name and is deprecated because another process can claim it first.
    fd, output_path = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)  # cv2 reopens the path itself

    print(images.shape)
    # Drop the leading axis and move axis 0 of the remainder to the end;
    # presumably (C, F, H, W) -> (F, H, W, C) -- TODO confirm against pipeline.
    video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
    video_np = (video_np * 255).astype(np.uint8)
    height, width = video_np.shape[1:3]
    out = cv2.VideoWriter(
        output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
    )
    try:
        # Reverse the last axis before writing (presumably RGB -> BGR for cv2).
        for frame in video_np[..., ::-1]:
            out.write(frame)
    finally:
        # Always finalize the container, even if a frame write fails.
        out.release()

    del images
    del video_np
    torch.cuda.empty_cache()
    return output_path
def generate_video_from_image(
    image_path,
    prompt="",
    enhance_prompt_toggle=False,
    negative_prompt="",
    frame_rate=25,
    seed=171198,
    num_inference_steps=30,
    guidance_scale=3,
    height=512,
    width=768,
    num_frames=121,
    progress=gr.Progress(),
):
    """Generate an MP4 conditioned on a first-frame image and return its path.

    Args:
        image_path: Path of the conditioning image; required.
        prompt: Description of the motion; must be at least 50 characters
            after stripping whitespace.
        enhance_prompt_toggle: Accepted because the UI passes it; not read
            inside this function.
        negative_prompt: Things the video should avoid.
        frame_rate: Frames per second, used for generation and for the file.
        seed: Seed for the CPU random generator.
        num_inference_steps: Number of denoising steps.
        guidance_scale: Guidance strength passed to the pipeline.
        height: Requested frame height in pixels; also the image resize height.
        width: Requested frame width in pixels; also the image resize width.
        num_frames: Requested number of frames.
        progress: Gradio progress tracker, advanced once per step.

    Returns:
        str: Path of the written ``.mp4`` file.

    Raises:
        gr.Error: If the prompt is too short, the image is missing, or
            generation/encoding fails.
    """
    print("Height: ", height)
    print("Width: ", width)
    print("Num Frames: ", num_frames)

    if len(prompt.strip()) < 50:
        raise gr.Error(
            "ํ๋กฌํํธ๋ ์ต์ 50์ ์ด์์ด์ด์ผ ํฉ๋๋ค. ๋ ์์ธํ ์ค๋ช ์ ์ ๊ณตํด์ฃผ์ธ์.",
            duration=5,
        )
    if not image_path:
        raise gr.Error("์ ๋ ฅ ์ด๋ฏธ์ง๋ฅผ ์ ๊ณตํด์ฃผ์ธ์.", duration=5)

    # Translate Korean prompts to English
    prompt = translate_korean_prompt(prompt)
    negative_prompt = translate_korean_prompt(negative_prompt)

    media_items = (
        load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
    )
    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
        "media_items": media_items,
    }
    generator = torch.Generator(device="cpu").manual_seed(seed)

    def gradio_progress_callback(self, step, timestep, kwargs):
        progress((step + 1) / num_inference_steps)

    try:
        with torch.no_grad():
            images = pipeline(
                num_inference_steps=num_inference_steps,
                num_images_per_prompt=1,
                guidance_scale=guidance_scale,
                generator=generator,
                output_type="pt",
                height=height,
                width=width,
                num_frames=num_frames,
                frame_rate=frame_rate,
                **sample,
                is_video=True,
                vae_per_channel_normalize=True,
                conditioning_method=ConditioningMethod.FIRST_FRAME,
                mixed_precision=True,
                callback_on_step_end=gradio_progress_callback,
            ).images

        # mkstemp creates the file atomically; tempfile.mktemp only returns a
        # name and is deprecated because another process can claim it first.
        fd, output_path = tempfile.mkstemp(suffix=".mp4")
        os.close(fd)  # cv2 reopens the path itself

        # Drop the leading axis and move axis 0 of the remainder to the end;
        # presumably (C, F, H, W) -> (F, H, W, C) -- TODO confirm.
        video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
        video_np = (video_np * 255).astype(np.uint8)
        height, width = video_np.shape[1:3]
        out = cv2.VideoWriter(
            output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
        )
        try:
            # Reverse the last axis before writing (presumably RGB -> BGR).
            for frame in video_np[..., ::-1]:
                out.write(frame)
        finally:
            # Always finalize the container, even if a frame write fails.
            out.release()
    except Exception as e:
        # Chain the cause so the server log keeps the original traceback.
        raise gr.Error(
            f"๋น๋์ค ์์ฑ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ๋ค์ ์๋ํด์ฃผ์ธ์. ์ค๋ฅ: {e}",
            duration=5,
        ) from e
    finally:
        torch.cuda.empty_cache()
        gc.collect()

    return output_path
def create_advanced_options():
    """Build the "Advanced Options" accordion and return its six controls.

    Returns:
        list: ``[seed, inference_steps, guidance_scale, height_slider,
        width_slider, num_frames_slider]``. Callers rely on this order: the
        last three are the preset-driven size sliders, created hidden.
    """
    with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
        seed = gr.Slider(
            label="4.1 Seed", minimum=0, maximum=1000000, step=1, value=171198
        )
        inference_steps = gr.Slider(
            label="4.2 Inference Steps", minimum=1, maximum=50, step=1, value=30
        )
        guidance_scale = gr.Slider(
            label="4.3 Guidance Scale", minimum=1.0, maximum=5.0, step=0.1, value=3.0
        )
        height_slider = gr.Slider(
            label="4.4 Height",
            minimum=256,
            maximum=1024,
            step=64,
            value=512,
            visible=False,
        )
        # The upper bounds of the next two sliders are sized to contain every
        # value in preset_options (widest preset 1216 px, longest 257 frames),
        # since the preset handler writes those numbers into these sliders.
        width_slider = gr.Slider(
            label="4.5 Width",
            minimum=256,
            maximum=1280,
            step=64,
            value=768,
            visible=False,
        )
        num_frames_slider = gr.Slider(
            # Was labelled "4.5" as well, duplicating the width slider's number.
            label="4.6 Number of Frames",
            minimum=1,
            maximum=257,
            step=1,
            value=121,
            visible=False,
        )
        return [
            seed,
            inference_steps,
            guidance_scale,
            height_slider,
            width_slider,
            num_frames_slider,
        ]
# Gradio Interface Definition | |
with gr.Blocks(theme=gr.themes.Soft()) as iface: | |
with gr.Tabs(): | |
# Text to Video Tab | |
with gr.TabItem("ํ ์คํธ๋ก ๋น๋์ค ๋ง๋ค๊ธฐ"): | |
with gr.Row(): | |
with gr.Column(): | |
txt2vid_prompt = gr.Textbox( | |
label="Step 1: ํ๋กฌํํธ ์ ๋ ฅ", | |
placeholder="์์ฑํ๊ณ ์ถ์ ๋น๋์ค๋ฅผ ์ค๋ช ํ์ธ์ (์ต์ 50์)...", | |
value="๊ฐ์ ๊ธด ๋จธ๋ฆฌ๋ฅผ ๊ฐ์ง ์ฌ์ฑ์ด ๊ธ๋ฐ์ ๊ธด ๋จธ๋ฆฌ๋ฅผ ๊ฐ์ง ๋ค๋ฅธ ์ฌ์ฑ์ ํฅํด ๋ฏธ์์ง์ต๋๋ค. ๊ฐ์ ๋จธ๋ฆฌ์ ์ฌ์ฑ์ ๊ฒ์์ ์์ผ์ ์ ๊ณ ์์ผ๋ฉฐ ์ค๋ฅธ์ชฝ ๋บจ์ ์์ ์ ์ด ์์ต๋๋ค. ์นด๋ฉ๋ผ ๊ฐ๋๋ ๊ฐ์ ๋จธ๋ฆฌ ์ฌ์ฑ์ ์ผ๊ตด์ ํด๋ก์ฆ์ ๋์ด ์์ต๋๋ค. ์กฐ๋ช ์ ์์ฐ์ค๋ฝ๊ณ ๋ฐ๋ปํ๋ฉฐ, ์์์์ ์ค๋ ๋ฏํ ๋ถ๋๋ฌ์ด ๋น์ด ์ฅ๋ฉด์ ๋น์ถฅ๋๋ค. ์ฅ๋ฉด์ ์ค์ ์์์ฒ๋ผ ๋ณด์ ๋๋ค.", | |
lines=5, | |
) | |
txt2vid_enhance_toggle = Toggle( | |
label="ํ๋กฌํํธ ๊ฐ์ ", | |
value=False, | |
interactive=True, | |
) | |
txt2vid_negative_prompt = gr.Textbox( | |
label="Step 2: ๋ค๊ฑฐํฐ๋ธ ํ๋กฌํํธ ์ ๋ ฅ", | |
placeholder="๋น๋์ค์์ ์ํ์ง ์๋ ์์๋ฅผ ์ค๋ช ํ์ธ์...", | |
value="low quality, worst quality, deformed, distorted, warped, motion smear, motion artifacts, fused fingers, incorrect anatomy, strange hands, unattractive", | |
lines=2, | |
) | |
txt2vid_preset = gr.Dropdown( | |
choices=[p["label"] for p in preset_options], | |
value="768x512, 97 frames", | |
label="Step 3.1: ํด์๋ ํ๋ฆฌ์ ์ ํ", | |
) | |
txt2vid_frame_rate = gr.Slider( | |
label="Step 3.2: ํ๋ ์ ๋ ์ดํธ", | |
minimum=21, | |
maximum=30, | |
step=1, | |
value=25, | |
) | |
txt2vid_advanced = create_advanced_options() | |
txt2vid_generate = gr.Button( | |
"Step 5: ๋น๋์ค ์์ฑ", | |
variant="primary", | |
size="lg", | |
) | |
with gr.Column(): | |
txt2vid_output = gr.Video(label="์์ฑ๋ ๋น๋์ค") | |
with gr.Row(): | |
gr.Examples( | |
examples=[ | |
[ | |
"์ ํต์ ์ธ ๋ชฝ๊ณจ ๋๋ ์ค๋ฅผ ์ ์ ์ ์ ์ฌ์ฑ์ด ์์ ํฐ์ ์ปคํผ์ ํตํด ํธ๊ธฐ์ฌ๊ณผ ๊ธด์ฅ์ด ์์ธ ํ์ ์ผ๋ก ๋ค์ฌ๋ค๋ณด๊ณ ์์ต๋๋ค. ์ฌ์ฑ์ ํฐ ๊ตฌ์ฌ๋ก ์ฅ์๋ ๋ ๊ฐ์ ๋์ ๋จธ๋ฆฌ๋ก ์คํ์ผ๋ง๋ ๊ธด ๊ฒ์ ๋จธ๋ฆฌ๋ฅผ ํ๊ณ ์์ผ๋ฉฐ, ๋์ ๋๋์ ๋๋ฉฐ ํฌ๊ฒ ๋ ์ ธ ์์ต๋๋ค. ๊ทธ๋ ์ ๋๋ ์ค๋ ํ๋ คํ ๊ธ์ ์์๊ฐ ์๊ฒจ์ง ์ ๋ช ํ ํ๋์์ด๋ฉฐ, ๋น์ทํ ๋์์ธ์ ๋จธ๋ฆฌ๋ ๋ฅผ ํ๊ณ ์์ต๋๋ค. ๋ฐฐ๊ฒฝ์ ์ ๋น๋ก์๊ณผ ํธ๊ธฐ์ฌ์ ์์๋ด๋ ๋จ์ํ ํฐ์ ์ปคํผ์ ๋๋ค.", | |
"low quality, worst quality, deformed, distorted, warped, motion smear, motion artifacts, fused fingers, incorrect anatomy, strange hands, unattractive", | |
"assets/t2v_2.mp4", | |
], | |
[ | |
"๋ ธ๋์ ์ฌํท์ ์ ์ ๊ธ๋ฐ ๋จธ๋ฆฌ์ ์ ์ ๋จ์๊ฐ ์ฒ์ ์์ ์ฃผ์๋ฅผ ๋๋ฌ๋ด ๋๋ค. ๊ทธ๋ ๋ฐ์ ํผ๋ถ๋ฅผ ๊ฐ์ก๊ณ ๋จธ๋ฆฌ๋ ๊ฐ์ด๋ฐ ๊ฐ๋ฅด๋ง๋ก ์คํ์ผ๋ง๋์ด ์์ต๋๋ค. ๊ทธ๋ ์ผ์ชฝ์ ๋ณด๊ณ ๋ ํ ์ค๋ฅธ์ชฝ์ ๋ณด๋ฉฐ, ๊ฐ ๋ฐฉํฅ์ ์ ์ ์์ํฉ๋๋ค. ์นด๋ฉ๋ผ๋ ๋ฎ์ ๊ฐ๋์์ ๋จ์๋ฅผ ์ฌ๋ ค๋ค๋ณด๋ฉฐ ๊ณ ์ ๋์ด ์์ต๋๋ค. ๋ฐฐ๊ฒฝ์ ์ฝ๊ฐ ํ๋ฆฟํ๋ฉฐ, ๋ น์ ๋๋ฌด๋ค๊ณผ ๋จ์์ ๋ค์์ ๋ฐ๊ฒ ๋น์น๋ ํ์์ด ๋ณด์ ๋๋ค. ์กฐ๋ช ์ ์์ฐ์ค๋ฝ๊ณ ๋ฐ๋ปํ๋ฉฐ, ํ์ ๋น์ด ๋จ์์ ์ผ๊ตด์ ๊ฐ๋ก์ง๋ฅด๋ ๋ ์ฆ ํ๋ ์ด๋ฅผ ๋ง๋ญ๋๋ค. ์ฅ๋ฉด์ ์ค์ ์์์ฒ๋ผ ์ดฌ์๋์์ต๋๋ค.", | |
"low quality, worst quality, deformed, distorted, warped, motion smear, motion artifacts, fused fingers, incorrect anatomy, strange hands, unattractive", | |
"assets/t2v_1.mp4", | |
], | |
[ | |
"ํ ์ฌ์ดํด๋ฆฌ์คํธ๊ฐ ๊ตฝ์ด์ง ์ฐ๊ธธ์ ๋ฐ๋ผ ๋ฌ๋ฆฝ๋๋ค. ๊ณต๊ธฐ์ญํ์ ์ธ ์ฅ๋น๋ฅผ ์ ์ ๊ทธ๋ ๊ฐํ๊ฒ ํ๋ฌ์ ๋ฐ๊ณ ์์ผ๋ฉฐ, ์ด๋ง์๋ ๋๋ฐฉ์ธ์ด ๋ฐ์ง์ ๋๋ค. ์นด๋ฉ๋ผ๋ ๊ทธ์ ๊ฒฐ์ฐํ ํ์ ๊ณผ ์จ ๋งํ๋ ํ๊ฒฝ์ ๋ฒ๊ฐ์๊ฐ๋ฉฐ ๋ณด์ฌ์ค๋๋ค. ์๋๋ฌด๋ค์ด ์ค์ณ ์ง๋๊ฐ๊ณ , ํ๋์ ์ ๋ช ํ ํ๋์์ ๋๋ค. ์ด ์ฅ๋ฉด์ ํ๊ธฐ์ฐจ๊ณ ๊ฒฝ์์ ์ธ ๋ถ์๊ธฐ๋ฅผ ์์๋ ๋๋ค.", | |
"low quality, worst quality, deformed, distorted, warped, motion smear, motion artifacts, fused fingers, incorrect anatomy, strange hands, unattractive", | |
"assets/t2v_0.mp4", | |
], | |
], | |
inputs=[txt2vid_prompt, txt2vid_negative_prompt, txt2vid_output], | |
label="ํ ์คํธ-๋น๋์ค ์์ฑ ์์", | |
) | |
# Image to Video Tab | |
with gr.TabItem("์ด๋ฏธ์ง๋ก ๋น๋์ค ๋ง๋ค๊ธฐ"): | |
with gr.Row(): | |
with gr.Column(): | |
img2vid_image = gr.Image( | |
type="filepath", | |
label="Step 1: ์ ๋ ฅ ์ด๋ฏธ์ง ์ ๋ก๋", | |
elem_id="image_upload", | |
) | |
img2vid_prompt = gr.Textbox( | |
label="Step 2: ํ๋กฌํํธ ์ ๋ ฅ", | |
placeholder="์ด๋ฏธ์ง๋ฅผ ์ด๋ป๊ฒ ์ ๋๋ฉ์ด์ ํํ ์ง ์ค๋ช ํ์ธ์ (์ต์ 50์)...", | |
value="๊ฐ์ ๊ธด ๋จธ๋ฆฌ๋ฅผ ๊ฐ์ง ์ฌ์ฑ์ด ๊ธ๋ฐ์ ๊ธด ๋จธ๋ฆฌ๋ฅผ ๊ฐ์ง ๋ค๋ฅธ ์ฌ์ฑ์ ํฅํด ๋ฏธ์์ง์ต๋๋ค. ๊ฐ์ ๋จธ๋ฆฌ์ ์ฌ์ฑ์ ๊ฒ์์ ์์ผ์ ์ ๊ณ ์์ผ๋ฉฐ ์ค๋ฅธ์ชฝ ๋บจ์ ์์ ์ ์ด ์์ต๋๋ค. ์นด๋ฉ๋ผ ๊ฐ๋๋ ๊ฐ์ ๋จธ๋ฆฌ ์ฌ์ฑ์ ์ผ๊ตด์ ํด๋ก์ฆ์ ๋์ด ์์ต๋๋ค. ์กฐ๋ช ์ ์์ฐ์ค๋ฝ๊ณ ๋ฐ๋ปํ๋ฉฐ, ์์์์ ์ค๋ ๋ฏํ ๋ถ๋๋ฌ์ด ๋น์ด ์ฅ๋ฉด์ ๋น์ถฅ๋๋ค. ์ฅ๋ฉด์ ์ค์ ์์์ฒ๋ผ ๋ณด์ ๋๋ค.", | |
lines=5, | |
) | |
img2vid_enhance_toggle = Toggle( | |
label="ํ๋กฌํํธ ๊ฐ์ ", | |
value=False, | |
interactive=True, | |
) | |
img2vid_negative_prompt = gr.Textbox( | |
label="Step 3: ๋ค๊ฑฐํฐ๋ธ ํ๋กฌํํธ ์ ๋ ฅ", | |
placeholder="๋น๋์ค์์ ์ํ์ง ์๋ ์์๋ฅผ ์ค๋ช ํ์ธ์...", | |
value="low quality, worst quality, deformed, distorted, warped, motion smear, motion artifacts, fused fingers, incorrect anatomy, strange hands, unattractive", | |
lines=2, | |
) | |
img2vid_preset = gr.Dropdown( | |
choices=[p["label"] for p in preset_options], | |
value="768x512, 97 frames", | |
label="Step 3.1: ํด์๋ ํ๋ฆฌ์ ์ ํ", | |
) | |
img2vid_frame_rate = gr.Slider( | |
label="Step 3.2: ํ๋ ์ ๋ ์ดํธ", | |
minimum=21, | |
maximum=30, | |
step=1, | |
value=25, | |
) | |
img2vid_advanced = create_advanced_options() | |
img2vid_generate = gr.Button( | |
"Step 6: ๋น๋์ค ์์ฑ", variant="primary", size="lg" | |
) | |
with gr.Column(): | |
img2vid_output = gr.Video(label="์์ฑ๋ ๋น๋์ค") | |
with gr.Row(): | |
gr.Examples( | |
examples=[ | |
[ | |
"assets/i2v_i2.png", | |
"์ฌ์ฑ์ด ํฐ์ ์ ๊ธฐ ๋ฒ๋ ์์์ ๋๋ ๋ฌผ์ด ๋ด๊ธด ๋๋น๋ฅผ ์ ๊ณ ์์ต๋๋ค. ๋ณด๋ผ์ ๋งค๋ํ์ด๋ฅผ ๋ฐ๋ฅธ ๊ทธ๋ ์ ์์ด ํ์ ๋๋น ์์์ ๋๋ฌด ์๊ฐ๋ฝ์ ์ํ์ผ๋ก ์์ง์ ๋๋ค. ๋๋น๋ ๊ฒ์์ ๋ฒํผ๊ณผ ๋์งํธ ๋์คํ๋ ์ด๊ฐ ์๋ ํฐ์ ์ ๊ธฐ ๋ฒ๋ ์์ ๋์ฌ ์์ต๋๋ค. ๋ฒ๋๋ ์ค๋ฅธ์ชฝ ์๋ ๋ชจ์๋ฆฌ์ ๋นจ๊ฐ์๊ณผ ํฐ์ ์ฒดํฌ๋ฌด๋ฌ ์ฒ์ด ๋ถ๋ถ์ ์ผ๋ก ๋ณด์ด๋ ํฐ์ ์กฐ๋ฆฌ๋ ์์ ๋์ฌ ์์ต๋๋ค. ์นด๋ฉ๋ผ ๊ฐ๋๋ ์ ํํ ์์์ ๋ด๋ ค๋ค๋ณด๋ ๊ฐ๋์ด๋ฉฐ ์ฅ๋ฉด ๋ด๋ด ๊ณ ์ ๋์ด ์์ต๋๋ค. ์กฐ๋ช ์ ๋ฐ๊ณ ๊ณ ๋ฅธ ์ค์ฑ์ ์ธ ํฐ์ ๋น์ผ๋ก ์ฅ๋ฉด์ ๋น์ถฅ๋๋ค. ์ฅ๋ฉด์ ์ค์ ์์์ฒ๋ผ ๋ณด์ ๋๋ค.", | |
"low quality, worst quality, deformed, distorted, warped, motion smear, motion artifacts, fused fingers, incorrect anatomy, strange hands, unattractive", | |
"assets/i2v_2.mp4", | |
], | |
[ | |
"assets/i2v_i0.png", | |
"๊ธด ํ๋ฅด๋ ๋๋ ์ค๋ฅผ ์ ์ ์ฌ์ฑ์ด ๋คํ์ ์์ ๋ฑ์ ์นด๋ฉ๋ผ๋ฅผ ํฅํ ์ฑ ์งํ์ ์ ๋ฐ๋ผ๋ณด๊ณ ์์ต๋๋ค. ๊ทธ๋ ์ ๋จธ๋ฆฌ์นด๋ฝ์ ๊ธธ๊ณ ๋ฐ์ผ๋ฉฐ ๋ฑ ์๋๋ก ํ๋ฌ๋ด๋ฆฝ๋๋ค. ๊ทธ๋ ๋ ํฐ ์ฐธ๋๋ฌด์ ๋๊ฒ ํผ์ง ๊ฐ์ง ์๋์ ์ ์์ต๋๋ค. ์ผ์ชฝ์ผ๋ก๋ ๋ง๋ผ๋ถ์ ์๋ ์์ ํด๋์ํ ๋ฏธ๊ตญ ์๋์ฐจ๊ฐ ์ฃผ์ฐจ๋์ด ์์ต๋๋ค. ๋ฉ๋ฆฌ์๋ ํ ๋์ ๋ถ์์ง ์๋์ฐจ๊ฐ ์์ผ๋ก ๋์ ์์ต๋๋ค. ์์ ํ๋์ ์ด๋์ด ํ๋์ ๋ฐฐ๊ฒฝ์ผ๋ก ๋ฐ์ ํฐ ๊ตฌ๋ฆ์ด ๊ทน์ ์ธ ์บ๋ฒ์ค๋ฅผ ์ด๋ฃจ๊ณ ์์ต๋๋ค. ์ ์ฒด ์ด๋ฏธ์ง๋ ํ๋ฐฑ์ผ๋ก, ๋น๊ณผ ๊ทธ๋ฆผ์์ ๋๋น๋ฅผ ๊ฐ์กฐํฉ๋๋ค. ์ฌ์ฑ์ด ์ฒ์ฒํ ์๋์ฐจ๋ฅผ ํฅํด ๊ฑธ์ด๊ฐ๊ณ ์์ต๋๋ค.", | |
"low quality, worst quality, deformed, distorted, warped, motion smear, motion artifacts, fused fingers, incorrect anatomy, strange hands, unattractive", | |
"assets/i2v_0.mp4", | |
], | |
[ | |
"assets/i2v_i1.png", | |
"ํ ์์ ์์ด ๋์๊ธฐ ๋ฌผ๋ ์์์ ์ ํ ์กฐ๊ฐ์ ๋ชจ์ ์ก์ ์ ์ฐจ์ ์ผ๋ก ์๋ฟ ๋ชจ์์ ๋ง๋ค์ด๊ฐ๊ณ ์์ต๋๋ค. ํ๋ ์ ๋ฐ์ ์ฌ๋์ ์์ด ์ ํ ๋ก ๋ฎ์ฌ ์์ผ๋ฉฐ, ํ์ ํ๋ ๋์๊ธฐ ๋ฌผ๋ ์ค์์ ์ ํ ๋ฉ์ด๋ฆฌ๋ฅผ ๋ถ๋๋ฝ๊ฒ ๋๋ฅด๊ณ ์์ต๋๋ค. ์์ ์ํ์ผ๋ก ์์ง์ด๋ฉฐ, ์ ํ ์์ชฝ์ ์ ์ฐจ์ ์ผ๋ก ์๋ฟ ๋ชจ์์ ๋ง๋ค์ด๊ฐ๋๋ค. ์นด๋ฉ๋ผ๋ ๋์๊ธฐ ๋ฌผ๋ ๋ฐ๋ก ์์ ์์นํ์ฌ ์ ํ ๊ฐ ๋ชจ์ ์กํ๊ฐ๋ ๊ฒ์ ์กฐ๊ฐ๋๋ก ๋ณด์ฌ์ค๋๋ค. ์กฐ๋ช ์ ๋ฐ๊ณ ๊ณ ๋ฅด๋ฉฐ, ์ ํ ์ ๊ทธ๊ฒ์ ๋ค๋ฃจ๋ ์์ ๋ฐ๊ฒ ๋น์ถฅ๋๋ค. ์ฅ๋ฉด์ ์ค์ ์์์ฒ๋ผ ์ดฌ์๋์์ต๋๋ค.", | |
"low quality, worst quality, deformed, distorted, warped, motion smear, motion artifacts, fused fingers, incorrect anatomy, strange hands, unattractive", | |
"assets/i2v_1.mp4", | |
], | |
], | |
inputs=[ | |
img2vid_image, | |
img2vid_prompt, | |
img2vid_negative_prompt, | |
img2vid_output, | |
], | |
label="์ด๋ฏธ์ง-๋น๋์ค ์์ฑ ์์", | |
) | |
# Event handlers
# --- Text-to-video tab ---
# Changing the preset drives the last three advanced controls (height, width,
# number of frames).
txt2vid_preset.change(
    fn=preset_changed,
    inputs=[txt2vid_preset],
    outputs=txt2vid_advanced[3:]
)
# Flipping the enhancement toggle rewrites the prompt textbox in place.
txt2vid_enhance_toggle.change(
    fn=update_prompt_t2v,
    inputs=[txt2vid_prompt, txt2vid_enhance_toggle],
    outputs=txt2vid_prompt
)
# Input order must match the positional parameters of
# generate_video_from_text; the advanced list supplies seed, steps, guidance
# scale, height, width and frame count.
txt2vid_generate.click(
    fn=generate_video_from_text,
    inputs=[
        txt2vid_prompt,
        txt2vid_enhance_toggle,
        txt2vid_negative_prompt,
        txt2vid_frame_rate,
        *txt2vid_advanced,
    ],
    outputs=txt2vid_output,
    concurrency_limit=1,
    # Same concurrency_id as the image-to-video handler below.
    concurrency_id="generate_video",
    queue=True,
)
# --- Image-to-video tab ---
img2vid_preset.change(
    fn=preset_changed,
    inputs=[img2vid_preset],
    outputs=img2vid_advanced[3:]
)
img2vid_enhance_toggle.change(
    fn=update_prompt_i2v,
    inputs=[img2vid_prompt, img2vid_enhance_toggle],
    outputs=img2vid_prompt
)
# Input order must match the positional parameters of
# generate_video_from_image (image first, then the same fields as above).
img2vid_generate.click(
    fn=generate_video_from_image,
    inputs=[
        img2vid_image,
        img2vid_prompt,
        img2vid_enhance_toggle,
        img2vid_negative_prompt,
        img2vid_frame_rate,
        *img2vid_advanced,
    ],
    outputs=img2vid_output,
    concurrency_limit=1,
    concurrency_id="generate_video",
    queue=True,
)
if __name__ == "__main__":
    # Enable the request queue first, then start the server on whatever
    # object queue() hands back.
    queued_app = iface.queue(
        max_size=64,
        default_concurrency_limit=1,
        api_open=False,
    )
    queued_app.launch(share=True, show_api=False)