Spaces:

openfree
/

ginigen-sora

Running

File size: 48,243 Bytes

import gradio as gr
from gradio_toggle import Toggle
import torch
from huggingface_hub import snapshot_download
from transformers import pipeline

from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
from xora.models.transformers.transformer3d import Transformer3DModel
from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
from xora.schedulers.rf import RectifiedFlowScheduler
from xora.pipelines.pipeline_xora_video import XoraVideoPipeline
from transformers import T5EncoderModel, T5Tokenizer
from xora.utils.conditioning_method import ConditioningMethod
from pathlib import Path
import safetensors.torch
import json
import numpy as np
import cv2
from PIL import Image
import tempfile
import os
import gc
from openai import OpenAI
import re
import time
# System prompts (written in Korean) used as the "system" message when the chat
# model is asked to rewrite a user's prompt; enhance_prompt() picks one of them.
# Both ask for a single natural paragraph of at most 200 words that covers, in
# order: the main action, chronological movements/gestures, subject appearance,
# background/environment, camera angle and movement, lighting and colour, and
# any changes or sudden events.
# NOTE(review): apart from the first sentence (video vs. image-based video) the
# two prompts are currently identical.
system_prompt_t2v = """당신은 비디오 생성을 위한 프롬프트 전문가입니다. 
주어진 프롬프트를 다음 구조에 맞게 개선해주세요:
1. 주요 동작을 명확한 한 문장으로 시작
2. 구체적인 동작과 제스처를 시간 순서대로 설명
3. 캐릭터/객체의 외모를 상세히 묘사
4. 배경과 환경 세부 사항을 구체적으로 포함
5. 카메라 각도와 움직임을 명시
6. 조명과 색상을 자세히 설명
7. 변화나 갑작스러운 사건을 자연스럽게 포함
모든 설명은 하나의 자연스러운 문단으로 작성하고, 
촬영 감독이 촬영 목록을 설명하는 것처럼 구체적이고 시각적으로 작성하세요.
200단어를 넘지 않도록 하되, 최대한 상세하게 작성하세요."""

system_prompt_i2v = """당신은 이미지 기반 비디오 생성을 위한 프롬프트 전문가입니다. 
주어진 프롬프트를 다음 구조에 맞게 개선해주세요:
1. 주요 동작을 명확한 한 문장으로 시작
2. 구체적인 동작과 제스처를 시간 순서대로 설명
3. 캐릭터/객체의 외모를 상세히 묘사
4. 배경과 환경 세부 사항을 구체적으로 포함
5. 카메라 각도와 움직임을 명시
6. 조명과 색상을 자세히 설명
7. 변화나 갑작스러운 사건을 자연스럽게 포함
모든 설명은 하나의 자연스러운 문단으로 작성하고, 
촬영 감독이 촬영 목록을 설명하는 것처럼 구체적이고 시각적으로 작성하세요.
200단어를 넘지 않도록 하되, 최대한 상세하게 작성하세요."""

# Credentials are read from the environment; os.getenv returns None for a
# variable that is not set.
hf_token = os.getenv("HF_TOKEN")
openai_api_key = os.getenv("OPENAI_API_KEY")
# Shared OpenAI client used by every prompt/script generation helper below.
client = OpenAI(api_key=openai_api_key)

# Korean -> English translation pipeline, placed on CUDA when it is available.
# NOTE: `pipeline` here is the transformers factory imported at the top of the
# file. The same name is rebound further down to the video pipeline instance,
# so this call must stay above that assignment.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
translator = pipeline(
    "translation", 
    model="Helsinki-NLP/opus-mt-ko-en",
    device=device,
    clean_up_tokenization_spaces=True
)

# Korean text detection function
def contains_korean(text):
    """Return True when ``text`` holds at least one Hangul jamo or syllable."""
    match = re.search('[ㄱ-ㅎㅏ-ㅣ가-힣]', text)
    return match is not None

def translate_korean_prompt(prompt, max_length=450):
    """Translate a prompt to English when it contains Korean text.

    Prompts longer than ``max_length`` characters are split on whitespace into
    chunks, each chunk is translated separately and the results are re-joined
    with single spaces.

    Args:
        prompt: Text to translate. ``None`` and empty strings are returned
            unchanged without touching the translator.
        max_length: Maximum chunk length, in characters, used when splitting.

    Returns:
        The translated text, or the original ``prompt`` when it contains no
        Korean characters or when translation raises.
    """
    # Nothing to inspect or translate; also avoids a TypeError on None.
    if not prompt:
        return prompt

    if not contains_korean(prompt):
        return prompt

    def split_text(text, max_length):
        """Greedily pack whitespace-separated words into chunks."""
        chunks = []
        current_chunk = []
        current_length = 0

        for word in text.split():
            if current_length + len(word) + 1 > max_length:
                # Only flush a chunk that has content; an over-long first word
                # would otherwise produce an empty chunk for the translator.
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_length = len(word)
            else:
                current_chunk.append(word)
                current_length += len(word) + 1

        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks

    try:
        if len(prompt) > max_length:
            translated_chunks = [
                translator(chunk, max_length=512)[0]['translation_text']
                for chunk in split_text(prompt, max_length)
            ]
            final_translation = ' '.join(translated_chunks)
        else:
            final_translation = translator(prompt, max_length=512)[0]['translation_text']

        print(f"Original Korean prompt: {prompt}")
        print(f"Translated English prompt: {final_translation}")
        return final_translation

    except Exception as e:
        # Best effort: generation can still proceed with the untranslated text.
        print(f"Translation error: {e}")
        return prompt

def enhance_prompt(prompt, type="t2v"):
    """Ask the chat model to rewrite ``prompt`` into a detailed video prompt.

    Args:
        prompt: The user's original prompt.
        type: ``"t2v"`` selects the text-to-video system prompt; any other
            value selects the image-to-video one.

    Returns:
        The rewritten prompt, or the original ``prompt`` if anything fails.
    """
    if type == "t2v":
        system_prompt = system_prompt_t2v
    else:
        system_prompt = system_prompt_i2v

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    try:
        completion = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=conversation,
            max_tokens=2000,
        )
        rewritten = completion.choices[0].message.content.strip()

        # Log both versions so the rewrite can be inspected in the console.
        print(
            "\n=== 프롬프트 증강 결과 ===",
            "Original Prompt:",
            prompt,
            "\nEnhanced Prompt:",
            rewritten,
            "========================\n",
            sep="\n",
        )
        return rewritten
    except Exception as e:
        print(f"Error during prompt enhancement: {e}")
        return prompt

def update_prompt_t2v(prompt, enhance_toggle):
    """Text-to-video variant of update_prompt()."""
    return update_prompt(prompt, enhance_toggle, type="t2v")

def update_prompt_i2v(prompt, enhance_toggle):
    """Image-to-video variant of update_prompt()."""
    return update_prompt(prompt, enhance_toggle, type="i2v")
    
def update_prompt(prompt, enhance_toggle, type="t2v"):
    """Return the enhanced prompt when the toggle is on, else ``prompt`` as is."""
    return enhance_prompt(prompt, type) if enhance_toggle else prompt

# Local directory that holds the LTX-Video weights. The snapshot is downloaded
# only when the directory does not exist yet.
# NOTE(review): an interrupted download would leave the directory in place and
# skip the retry on the next start — confirm this is acceptable.
model_path = "asset"
if not os.path.exists(model_path):
    snapshot_download(
        "Lightricks/LTX-Video", local_dir=model_path, repo_type="model", token=hf_token
    )

# Sub-directories of the snapshot, one per model component.
vae_dir = Path(model_path) / "vae"
unet_dir = Path(model_path) / "unet"
scheduler_dir = Path(model_path) / "scheduler"

# Same expression as the earlier `device` assignment; re-evaluated here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_vae(vae_dir):
    """Build the video VAE from ``vae_dir`` and move it to ``device`` in bfloat16.

    Reads ``config.json`` for the architecture and
    ``vae_diffusion_pytorch_model.safetensors`` for the weights.
    """
    with open(vae_dir / "config.json", "r") as config_file:
        config = json.load(config_file)

    model = CausalVideoAutoencoder.from_config(config)
    weights = safetensors.torch.load_file(
        vae_dir / "vae_diffusion_pytorch_model.safetensors"
    )
    model.load_state_dict(weights)
    return model.to(device=device, dtype=torch.bfloat16)

def load_unet(unet_dir):
    """Build the 3D transformer from ``unet_dir`` and move it to ``device`` in bfloat16.

    The checkpoint is loaded with ``strict=True``, so missing or unexpected
    keys raise instead of being ignored.
    """
    config = Transformer3DModel.load_config(unet_dir / "config.json")
    model = Transformer3DModel.from_config(config)
    weights = safetensors.torch.load_file(
        unet_dir / "unet_diffusion_pytorch_model.safetensors"
    )
    model.load_state_dict(weights, strict=True)
    return model.to(device=device, dtype=torch.bfloat16)

def load_scheduler(scheduler_dir):
    """Create the rectified-flow scheduler from ``scheduler_config.json``."""
    config = RectifiedFlowScheduler.load_config(
        scheduler_dir / "scheduler_config.json"
    )
    return RectifiedFlowScheduler.from_config(config)

# Helper function for image processing
def center_crop_and_resize(frame, target_height, target_width):
    """Center-crop ``frame`` to the target aspect ratio, then resize it.

    Args:
        frame: Image array with three dimensions (height, width, channels).
        target_height: Output height in pixels.
        target_width: Output width in pixels.

    Returns:
        The cropped frame resized to ``target_width`` x ``target_height``.
    """
    src_h, src_w, _channels = frame.shape
    wanted_ratio = target_width / target_height

    if src_w / src_h > wanted_ratio:
        # Source is too wide: keep the full height, trim left and right.
        crop_w = int(src_h * wanted_ratio)
        left = (src_w - crop_w) // 2
        cropped = frame[:, left : left + crop_w]
    else:
        # Source is too tall (or already matching): trim top and bottom.
        crop_h = int(src_w / wanted_ratio)
        top = (src_h - crop_h) // 2
        cropped = frame[top : top + crop_h, :]

    return cv2.resize(cropped, (target_width, target_height))

def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
    """Load an image as a normalized tensor sized for the video pipeline.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        target_height: Height to crop/resize to, in pixels.
        target_width: Width to crop/resize to, in pixels.

    Returns:
        A float tensor of shape ``(1, 3, 1, target_height, target_width)``
        with values scaled from ``[0, 255]`` to ``[-1, 1]``.
    """
    # The context manager closes the underlying file promptly instead of
    # leaving it to garbage collection.
    with Image.open(image_path) as img:
        image_np = np.array(img.convert("RGB"))

    frame_resized = center_crop_and_resize(image_np, target_height, target_width)
    # HWC -> CHW, then map 0..255 onto -1..1.
    frame_tensor = torch.tensor(frame_resized).permute(2, 0, 1).float()
    frame_tensor = (frame_tensor / 127.5) - 1.0
    # Add the batch axis (dim 0) and a single-frame axis (dim 2).
    return frame_tensor.unsqueeze(0).unsqueeze(2)

# Load every model component once at import time; they are shared by all requests.
vae = load_vae(vae_dir)
unet = load_unet(unet_dir)
scheduler = load_scheduler(scheduler_dir)
patchifier = SymmetricPatchifier(patch_size=1)
# Text encoder and tokenizer come from the PixArt-alpha repository, not from
# the LTX-Video snapshot downloaded above.
text_encoder = T5EncoderModel.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
).to(device)
tokenizer = T5Tokenizer.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
)

# NOTE: this rebinds the module-level name `pipeline`, which until here referred
# to the transformers factory function. The translator was already created with
# it, so nothing breaks, but the factory is no longer reachable under this name.
pipeline = XoraVideoPipeline(
    transformer=unet,
    patchifier=patchifier,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    scheduler=scheduler,
    vae=vae,
).to(device)



# Preset options for resolution and frame configuration.
# Each entry carries the dropdown label plus the width, height and frame count
# it selects. The duration in the label ("초" = seconds) is num_frames / 25,
# i.e. it assumes 25 FPS regardless of the frame rate chosen in the UI.
# Every width/height listed here is a multiple of 32 and every num_frames has
# the form 8*k + 1.
# NOTE(review): the aspect-ratio tags in the labels are approximate (for
# example 1088x704 is about 1.55:1, not 16:9).
preset_options = [
    {"label": "[16:9 HD] 1216x704, 1.6초", "width": 1216, "height": 704, "num_frames": 41},
    {"label": "[16:9] 1088x704, 2.0초", "width": 1088, "height": 704, "num_frames": 49},
    {"label": "[16:9] 1056x640, 2.3초", "width": 1056, "height": 640, "num_frames": 57},
    {"label": "[16:9] 992x608, 2.6초", "width": 992, "height": 608, "num_frames": 65},
    {"label": "[16:9] 896x608, 2.9초", "width": 896, "height": 608, "num_frames": 73},
    {"label": "[16:9] 896x544, 3.2초", "width": 896, "height": 544, "num_frames": 81},
    {"label": "[16:9] 832x544, 3.6초", "width": 832, "height": 544, "num_frames": 89},
    {"label": "[16:9] 800x512, 3.9초", "width": 800, "height": 512, "num_frames": 97},
    {"label": "[16:9] 768x512, 3.9초", "width": 768, "height": 512, "num_frames": 97},
    {"label": "[16:9] 800x480, 4.2초", "width": 800, "height": 480, "num_frames": 105},
    {"label": "[16:9] 736x480, 4.5초", "width": 736, "height": 480, "num_frames": 113},
    {"label": "[3:2] 704x480, 4.8초", "width": 704, "height": 480, "num_frames": 121},
    {"label": "[16:9] 704x448, 5.2초", "width": 704, "height": 448, "num_frames": 129},
    {"label": "[16:9] 672x448, 5.5초", "width": 672, "height": 448, "num_frames": 137},
    {"label": "[16:9] 640x416, 6.1초", "width": 640, "height": 416, "num_frames": 153},
    {"label": "[16:9] 672x384, 6.4초", "width": 672, "height": 384, "num_frames": 161},
    {"label": "[16:9] 640x384, 6.8초", "width": 640, "height": 384, "num_frames": 169},
    {"label": "[16:9] 608x384, 7.1초", "width": 608, "height": 384, "num_frames": 177},
    {"label": "[16:9] 576x384, 7.4초", "width": 576, "height": 384, "num_frames": 185},
    {"label": "[16:9] 608x352, 7.7초", "width": 608, "height": 352, "num_frames": 193},
    {"label": "[16:9] 576x352, 8.0초", "width": 576, "height": 352, "num_frames": 201},
    {"label": "[16:9] 544x352, 8.4초", "width": 544, "height": 352, "num_frames": 209},
    {"label": "[3:2] 512x352, 9.3초", "width": 512, "height": 352, "num_frames": 233},
    {"label": "[16:9] 544x320, 9.6초", "width": 544, "height": 320, "num_frames": 241},
    {"label": "[16:9] 512x320, 10.3초", "width": 512, "height": 320, "num_frames": 257},
]

def preset_changed(preset):
    """Translate a preset label into state values plus three hide-updates.

    Args:
        preset: Label of one of the entries in ``preset_options``.

    Returns:
        A list of six items: ``gr.State`` objects for height, width and frame
        count (in that order), followed by three ``gr.update(visible=False)``
        (presumably for the manual sliders — confirm against the event wiring).

    Raises:
        gr.Error: If no preset carries the given label.
    """
    for option in preset_options:
        if option["label"] == preset:
            break
    else:
        raise gr.Error("Invalid preset selected")

    new_states = [
        gr.State(value=option[field]) for field in ("height", "width", "num_frames")
    ]
    hidden = [gr.update(visible=False) for _ in range(3)]
    return new_states + hidden
    
def generate_video_from_text(
    prompt,
    enhance_prompt_toggle,
    negative_prompt,
    frame_rate,
    seed,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    progress=gr.Progress(),
):
    """Generate a video from a text prompt and return the path of the MP4 file.

    Args:
        prompt: Description of the video; must be at least 50 characters
            after stripping whitespace.
        enhance_prompt_toggle: When truthy, the prompt is first rewritten by
            ``enhance_prompt``.
        negative_prompt: Description of what should not appear.
        frame_rate: Frames per second; falsy values fall back to 25.
        seed: Random seed; ``None`` falls back to 171198 (0 is kept as is).
        num_inference_steps: Denoising steps; falsy values fall back to 41.
        guidance_scale: Guidance strength; falsy values fall back to 4.0.
        height: Frame height; falsy values fall back to 320.
        width: Frame width; falsy values fall back to 512.
        num_frames: Number of frames; falsy values fall back to 257.
        progress: Gradio progress tracker updated after every step.

    Returns:
        Path of a temporary ``.mp4`` file containing the generated video.

    Raises:
        gr.Error: If the prompt is too short or the pipeline fails.
    """
    if len(prompt.strip()) < 50:
        raise gr.Error(
            "프롬프트는 최소 50자 이상이어야 합니다. 더 자세한 설명을 제공해주세요.",
            duration=5,
        )

    # Optional LLM rewrite of the prompt.
    if enhance_prompt_toggle:
        prompt = enhance_prompt(prompt, "t2v")

    # Translate Korean prompts to English
    prompt = translate_korean_prompt(prompt)
    negative_prompt = translate_korean_prompt(negative_prompt)

    # Defaults for unset values.
    height = height or 320
    width = width or 512
    num_frames = num_frames or 257
    frame_rate = frame_rate or 25
    # 0 is a legitimate seed (the UI slider starts at 0), so only substitute
    # the default when no seed was supplied; manual_seed() needs an int.
    seed = 171198 if seed in (None, "") else int(seed)
    num_inference_steps = num_inference_steps or 41
    guidance_scale = guidance_scale or 4.0

    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
        "media_items": None,
    }

    generator = torch.Generator(device="cpu").manual_seed(seed)

    def gradio_progress_callback(self, step, timestep, kwargs):
        progress((step + 1) / num_inference_steps)

    try:
        with torch.no_grad():
            images = pipeline(
                num_inference_steps=num_inference_steps,
                num_images_per_prompt=1,
                guidance_scale=guidance_scale,
                generator=generator,
                output_type="pt",
                height=height,
                width=width,
                num_frames=num_frames,
                frame_rate=frame_rate,
                **sample,
                is_video=True,
                vae_per_channel_normalize=True,
                conditioning_method=ConditioningMethod.UNCONDITIONAL,
                mixed_precision=True,
                callback_on_step_end=gradio_progress_callback,
            ).images
    except Exception as e:
        raise gr.Error(
            f"비디오 생성 중 오류가 발생했습니다. 다시 시도해주세요. 오류: {e}",
            duration=5,
        ) from e
    finally:
        # Collect first so memory freed by the collector can be returned by
        # empty_cache().
        gc.collect()
        torch.cuda.empty_cache()

    # mktemp() only returns a name and is race-prone; create the file instead.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        output_path = tmp.name

    # assumes images is (1, channels, frames, height, width) — TODO confirm
    video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
    video_np = (video_np * 255).astype(np.uint8)
    height, width = video_np.shape[1:3]
    out = cv2.VideoWriter(
        output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
    )
    try:
        # Reverse the channel axis (presumably RGB -> BGR for OpenCV).
        for frame in video_np[..., ::-1]:
            out.write(frame)
    finally:
        out.release()

    del images
    del video_np
    torch.cuda.empty_cache()
    return output_path

def generate_video_from_image(
    image_path,
    prompt,
    enhance_prompt_toggle,
    negative_prompt,
    frame_rate,
    seed,
    num_inference_steps,
    guidance_scale,
    height,
    width,
    num_frames,
    progress=gr.Progress(),
):
    """Generate a video conditioned on a first-frame image and a text prompt.

    Args:
        image_path: Path of the conditioning image; required.
        prompt: Description of the video; must be at least 50 characters
            after stripping whitespace.
        enhance_prompt_toggle: When truthy, the prompt is first rewritten by
            ``enhance_prompt``.
        negative_prompt: Description of what should not appear.
        frame_rate: Frames per second; falsy values fall back to 25.
        seed: Random seed; ``None`` falls back to 171198 (0 is kept as is).
        num_inference_steps: Denoising steps; falsy values fall back to 41.
        guidance_scale: Guidance strength; falsy values fall back to 4.0.
        height: Frame height; falsy values fall back to 320.
        width: Frame width; falsy values fall back to 512.
        num_frames: Number of frames; falsy values fall back to 257.
        progress: Gradio progress tracker updated after every step.

    Returns:
        Path of a temporary ``.mp4`` file containing the generated video.

    Raises:
        gr.Error: If the image is missing, the prompt is too short, or
            generation/encoding fails.
    """
    if not image_path:
        raise gr.Error("입력 이미지를 제공해주세요.", duration=5)

    if len(prompt.strip()) < 50:
        raise gr.Error(
            "프롬프트는 최소 50자 이상이어야 합니다. 더 자세한 설명을 제공해주세요.",
            duration=5,
        )

    # Optional LLM rewrite of the prompt.
    if enhance_prompt_toggle:
        prompt = enhance_prompt(prompt, "i2v")

    # Translate Korean prompts to English
    prompt = translate_korean_prompt(prompt)
    negative_prompt = translate_korean_prompt(negative_prompt)

    # Defaults for unset values.
    height = height or 320
    width = width or 512
    num_frames = num_frames or 257
    frame_rate = frame_rate or 25
    # 0 is a legitimate seed (the UI slider starts at 0), so only substitute
    # the default when no seed was supplied; manual_seed() needs an int.
    seed = 171198 if seed in (None, "") else int(seed)
    num_inference_steps = num_inference_steps or 41
    guidance_scale = guidance_scale or 4.0

    # Load and preprocess the conditioning image.
    media_items = (
        load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
    )

    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
        "media_items": media_items,
    }

    generator = torch.Generator(device="cpu").manual_seed(seed)

    def gradio_progress_callback(self, step, timestep, kwargs):
        progress((step + 1) / num_inference_steps)

    images = None
    video_np = None
    try:
        with torch.no_grad():
            images = pipeline(
                num_inference_steps=num_inference_steps,
                num_images_per_prompt=1,
                guidance_scale=guidance_scale,
                generator=generator,
                output_type="pt",
                height=height,
                width=width,
                num_frames=num_frames,
                frame_rate=frame_rate,
                **sample,
                is_video=True,
                vae_per_channel_normalize=True,
                conditioning_method=ConditioningMethod.FIRST_FRAME,
                mixed_precision=True,
                callback_on_step_end=gradio_progress_callback,
            ).images

        # mktemp() only returns a name and is race-prone; create the file instead.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
            output_path = tmp.name

        # assumes images is (1, channels, frames, height, width) — TODO confirm
        video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
        video_np = (video_np * 255).astype(np.uint8)
        height, width = video_np.shape[1:3]
        out = cv2.VideoWriter(
            output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
        )
        try:
            # Reverse the channel axis (presumably RGB -> BGR for OpenCV).
            for frame in video_np[..., ::-1]:
                out.write(frame)
        finally:
            out.release()

    except Exception as e:
        raise gr.Error(
            f"비디오 생성 중 오류가 발생했습니다. 다시 시도해주세요. 오류: {e}",
            duration=5,
        ) from e

    finally:
        # Drop every reference to the large tensors *before* collecting, so the
        # CUDA cache can actually be released; `sample` also holds media_items.
        images = None
        video_np = None
        media_items = None
        sample.clear()
        gc.collect()
        torch.cuda.empty_cache()

    return output_path

def create_advanced_options():
    """Build the "Advanced Options" accordion and return its sliders.

    Only the seed slider is visible; the remaining sliders are created hidden.

    Returns:
        A list ``[seed, inference_steps, guidance_scale, height_slider,
        width_slider, num_frames_slider]``.
    """
    with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
        seed = gr.Slider(
            label="Seed",
            minimum=0,
            maximum=1000000,
            step=1,
            value=171198,
        )
        inference_steps = gr.Slider(
            label="4.2 Inference Steps",
            minimum=1,
            maximum=50,
            step=1,
            value=41,
            visible=False,
        )
        guidance_scale = gr.Slider(
            label="4.3 Guidance Scale",
            minimum=1.0,
            maximum=5.0,
            step=0.1,
            value=4.0,
            visible=False,
        )
        height_slider = gr.Slider(
            label="4.4 Height",
            minimum=256,
            maximum=1024,
            step=64,
            value=320,
            visible=False,
        )
        width_slider = gr.Slider(
            label="4.5 Width",
            minimum=256,
            maximum=1024,
            step=64,
            value=512,
            visible=False,
        )
        num_frames_slider = gr.Slider(
            # Was labelled "4.5", duplicating the width slider's number.
            label="4.6 Number of Frames",
            minimum=1,
            # The default of 257 frames must lie inside the slider range
            # (the previous maximum was 200).
            maximum=257,
            step=1,
            value=257,
            visible=False,
        )

        return [
            seed,
            inference_steps,
            guidance_scale,
            height_slider,
            width_slider,
            num_frames_slider,
        ]

# System prompt (in Korean) for turning a video script into background-video
# prompts, one per script section. It tells the model to avoid depicting the
# product or service directly, to keep the five sections visually connected,
# and to follow the same seven-point paragraph structure and 200-word limit as
# the prompt-enhancement system prompts.
# NOTE(review): section 2 is called "문제 제기" here, while the per-section
# descriptions used by analyze_scenario() and generate_single_section_prompt()
# call it "흥미 유발" — confirm which wording is intended.
system_prompt_scenario = """당신은 영상 스크립트에 맞는 배경 영상을 생성하기 위한 프롬프트 전문가입니다.
주어진 스크립트의 분위기와 맥락을 시각적 배경으로 표현하되, 다음 원칙을 반드시 준수하세요:

1. 제품이나 서비스를 직접적으로 묘사하지 말 것
2. 스크립트의 감성과 톤앤매너를 표현하는 배경 영상에 집중할 것
3. 5개 섹션이 하나의 이야기처럼 자연스럽게 연결되도록 할 것
4. 추상적이고 은유적인 시각 표현을 활용할 것

각 섹션별 프롬프트 작성 가이드:
1. 배경 및 필요성: 주제의 전반적인 분위기를 표현하는 배경 씬
2. 문제 제기: 긴장감이나 갈등을 암시하는 분위기 있는 배경
3. 해결책 제시: 희망적이고 밝은 톤의 배경 전환
4. 본론: 안정감 있고 신뢰도를 높이는 배경
5. 결론: 임팩트 있는 마무리를 위한 역동적인 배경

모든 섹션이 일관된 스타일과 톤을 유지하면서도 자연스럽게 이어지도록 구성하세요.

각 섹션의 프롬프트 작성시 반드시 다음 구조에 맞게 개선해주세요:
1. 주요 동작을 명확한 한 문장으로 시작
2. 구체적인 동작과 제스처를 시간 순서대로 설명
3. 캐릭터/객체의 외모를 상세히 묘사
4. 배경과 환경 세부 사항을 구체적으로 포함
5. 카메라 각도와 움직임을 명시
6. 조명과 색상을 자세히 설명
7. 변화나 갑작스러운 사건을 자연스럽게 포함
모든 설명은 하나의 자연스러운 문단으로 작성하고, 
촬영 감독이 촬영 목록을 설명하는 것처럼 구체적이고 시각적으로 작성하세요.
200단어를 넘지 않도록 하되, 최대한 상세하게 작성하세요.

"""


def analyze_scenario(scenario):
    """Generate one background-video prompt per script section (1 to 5).

    Each section is requested from the chat model separately, with a
    one-second pause after every request.

    Args:
        scenario: The full script text.

    Returns:
        A list of five strings of the form ``"<n>. <prompt>"``. If any request
        fails, a list of five ``"Error occurred during analysis"`` strings is
        returned instead.
    """
    # Per-section description inserted into the user message.
    descriptions = {
        1: "배경 및 필요성: 주제의 전반적인 분위기를 표현하는 배경 씬",
        2: "흥미 유발: 긴장감이나 갈등을 암시하는 분위기 있는 배경",
        3: "해결책 제시: 희망적이고 밝은 톤의 배경 전환",
        4: "본론: 안정감 있고 신뢰도를 높이는 배경",
        5: "결론: 임팩트 있는 마무리를 위한 역동적인 배경"
    }

    try:
        results = []

        for idx in range(1, 6):
            description = descriptions[idx]
            user_message = f"""
다음 스크립트의 {idx}번째 섹션({description})에 대한 
배경 영상 프롬프트를 생성해주세요.

스크립트:
{scenario}

주의사항:
1. 해당 섹션의 특성({description})에 맞는 분위기와 톤을 반영하세요.
2. 직접적인 제품/서비스 묘사는 피하고, 감성적이고 은유적인 배경 영상에 집중하세요.
3. 다음 구조를 반드시 포함하세요:
   - 주요 동작을 명확한 한 문장으로 시작
   - 구체적인 동작과 제스처를 시간 순서대로 설명
   - 배경과 환경 세부 사항을 구체적으로 포함
   - 카메라 각도와 움직임을 명시
   - 조명과 색상을 자세히 설명
   - 변화나 갑작스러운 사건을 자연스럽게 포함"""

            completion = client.chat.completions.create(
                model="gpt-4-1106-preview",
                messages=[
                    {"role": "system", "content": system_prompt_scenario},
                    {"role": "user", "content": user_message},
                ],
                max_tokens=1000,
                temperature=0.7
            )

            text = completion.choices[0].message.content.strip()
            results.append(f"{idx}. {text}")

            # Short delay between API requests.
            time.sleep(1)

        return results

    except Exception as e:
        print(f"Error during scenario analysis: {e}")
        return ["Error occurred during analysis"] * 5

def generate_section_video(prompt, preset, section_number=1, base_seed=171198, progress=gr.Progress()):
    """Generate the video for one script section.

    Args:
        prompt: Section prompt; must be at least 50 characters after stripping.
        preset: Label of a resolution preset from ``preset_options``.
        section_number: Section index; added to ``base_seed`` so each section
            uses a different seed.
        base_seed: Seed shared by all sections before the offset is applied.
        progress: Gradio progress tracker forwarded to the generator.

    Returns:
        Path of the generated video file.

    Raises:
        gr.Error: On any failure, including the validation errors raised
            inside this function; the message is prefixed with the section
            number.
    """
    try:
        prompt_too_short = not prompt or len(prompt.strip()) < 50
        if prompt_too_short:
            raise gr.Error("프롬프트는 최소 50자 이상이어야 합니다.")

        if not preset:
            raise gr.Error("해상도 프리셋을 선택해주세요.")

        selected = None
        for option in preset_options:
            if option["label"] == preset:
                selected = option
                break
        if not selected:
            raise gr.Error("올바르지 않은 프리셋입니다.")

        generation_args = {
            "prompt": prompt,
            # Prompt enhancement is disabled for section generation.
            "enhance_prompt_toggle": False,
            "negative_prompt": "low quality, worst quality, deformed, distorted, warped",
            "frame_rate": 25,
            "seed": base_seed + section_number,
            "num_inference_steps": 41,
            "guidance_scale": 4.0,
            "height": selected["height"],
            "width": selected["width"],
            "num_frames": selected["num_frames"],
            "progress": progress,
        }
        return generate_video_from_text(**generation_args)
    except Exception as e:
        print(f"Error in section {section_number}: {e}")
        raise gr.Error(f"섹션 {section_number} 생성 중 오류: {str(e)}")
    finally:
        torch.cuda.empty_cache()
        gc.collect()

def generate_single_section_prompt(scenario, section_number):
    """Generate the background-video prompt for a single script section.

    Args:
        scenario: The full script text.
        section_number: Section index from 1 to 5; any other value raises
            ``KeyError`` before the API is called.

    Returns:
        ``"<section_number>. <prompt>"`` on success, otherwise an English
        error string naming the section.
    """
    section_descriptions = {
        1: "배경 및 필요성: 주제의 전반적인 분위기를 표현하는 배경 씬",
        2: "흥미 유발: 흥미를 유발하고 기대감을 증폭시키는 배경",
        3: "해결책 제시: 희망적이고 밝은 톤의 배경 전환",
        4: "본론: 안정감 있고 신뢰도를 높이는 배경",
        5: "결론: 임팩트 있는 마무리를 위한 역동적인 배경"
    }
    description = section_descriptions[section_number]

    user_message = f"""
다음 스크립트의 {section_number}번째 섹션({description})에 대한 
배경 영상 프롬프트를 생성해주세요.

스크립트:
{scenario}

주의사항:
1. 해당 섹션의 특성({description})에 맞는 분위기와 톤을 반영하세요.
2. 직접적인 제품/서비스 묘사는 피하고, 감성적이고 은유적인 배경 영상에 집중하세요.
3. 다음 구조를 반드시 포함하세요:
   - 주요 동작을 명확한 한 문장으로 시작
   - 구체적인 동작과 제스처를 시간 순서대로 설명
   - 배경과 환경 세부 사항을 구체적으로 포함
   - 카메라 각도와 움직임을 명시
   - 조명과 색상을 자세히 설명
   - 변화나 갑작스러운 사건을 자연스럽게 포함"""

    conversation = [
        {"role": "system", "content": system_prompt_scenario},
        {"role": "user", "content": user_message},
    ]

    try:
        completion = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=conversation,
            max_tokens=1000,  # raised token limit
            temperature=0.7
        )
        text = completion.choices[0].message.content.strip()
        return f"{section_number}. {text}"
    except Exception as e:
        print(f"Error during prompt generation for section {section_number}: {e}")
        return f"Error occurred during prompt generation for section {section_number}"


# Video concatenation helper
def combine_videos(video_paths, output_path):
    """Concatenate several videos into a single MP4 written to ``output_path``.

    Frame rate and frame size are taken from the first video. Frames of later
    videos that differ in size are resized to match, as merge_section_videos()
    does. Paths that do not exist on disk are skipped.

    Args:
        video_paths: Sequence of video file paths; every entry must be truthy.
        output_path: Destination path of the combined video.

    Returns:
        ``output_path``.

    Raises:
        gr.Error: If any path is empty, the first video cannot be opened, or
            reading/writing fails.
    """
    if not all(video_paths):
        raise gr.Error("모든 섹션의 영상이 생성되어야 합니다.")

    cap = None
    out = None
    try:
        # Take the output properties from the first video.
        cap = cv2.VideoCapture(video_paths[0])
        if not cap.isOpened():
            raise ValueError(f"cannot open video: {video_paths[0]}")
        # Keep fractional frame rates (e.g. 29.97) instead of truncating.
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap.release()
        cap = None

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        # Append every video in order.
        for video_path in video_paths:
            if not (video_path and os.path.exists(video_path)):
                continue
            cap = cv2.VideoCapture(video_path)
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                # The writer expects one frame size; resize mismatching frames.
                if frame.shape[:2] != (height, width):
                    frame = cv2.resize(frame, (width, height))
                out.write(frame)
            cap.release()
            cap = None

        return output_path
    except Exception as e:
        raise gr.Error(f"비디오 결합 중 오류 발생: {e}") from e
    finally:
        # Release handles even when an error interrupted the loop.
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()

def merge_section_videos(section1, section2, section3, section4, section5):
    """Validate the five section videos and concatenate them into one MP4.

    Frame rate and frame size are taken from the first section; frames of a
    different size are resized to match.

    Args:
        section1: Path of the first section's video.
        section2: Path of the second section's video.
        section3: Path of the third section's video.
        section4: Path of the fourth section's video.
        section5: Path of the fifth section's video.

    Returns:
        Path of a temporary ``.mp4`` file holding the merged video.

    Raises:
        gr.Error: If a section is missing, its file does not exist or cannot
            be opened, or merging fails.
    """
    videos = []

    # Validate every section up front; each failure names the section.
    for i, video_path in enumerate([section1, section2, section3, section4, section5], 1):
        if not video_path:
            raise gr.Error(f"섹션 {i}의 영상이 없습니다.")
        if not os.path.exists(video_path):
            raise gr.Error(f"섹션 {i}의 영상 파일을 찾을 수 없습니다.")

        try:
            probe = cv2.VideoCapture(video_path)
            try:
                readable = probe.isOpened()
            finally:
                # Release the probe whether or not the file could be opened.
                probe.release()
        except Exception as e:
            raise gr.Error(f"섹션 {i} 영상 처리 중 오류: {str(e)}") from e

        # Raised outside the try so the message is not wrapped a second time.
        if not readable:
            raise gr.Error(f"섹션 {i}의 영상 파일이 손상되었거나 읽을 수 없습니다.")
        videos.append(video_path)

    cap = None
    out = None
    try:
        # mktemp() only returns a name and is race-prone; create the file instead.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
            output_path = tmp.name

        # Take the output properties from the first video.
        cap = cv2.VideoCapture(videos[0])
        # Keep fractional frame rates (e.g. 29.97) instead of truncating.
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        cap.release()
        cap = None

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        # Append every video in order.
        for video_path in videos:
            cap = cv2.VideoCapture(video_path)
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                # Resize frames whose size differs from the first video.
                if frame.shape[:2] != (height, width):
                    frame = cv2.resize(frame, (width, height))
                out.write(frame)
            cap.release()
            cap = None

        print(f"Successfully merged {len(videos)} videos")
        return output_path

    except Exception as e:
        raise gr.Error(f"비디오 결합 중 오류 발생: {e}") from e
    finally:
        # Release handles even when an error interrupted the loop.
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()

def generate_script(topic):
    """Ask the chat model for a five-section video script about ``topic``.

    Args:
        topic: Subject of the video.

    Returns:
        The generated script, a Korean "please enter a topic" message when
        ``topic`` is empty, or a Korean error message when the request fails.
    """
    if not topic:
        return "주제를 입력해주세요."

    system_message = """당신은 영상 스크립트 작성 전문가입니다.
주어진 주제로 다음 구조에 맞는 5개 섹션의 스크립트를 작성해주세요:

1. 배경 및 필요성: 주제 소개와 시청자의 흥미 유발
2. 흥미 유발: 구체적인 내용 전개와 호기심 자극
3. 해결책 제시: 핵심 내용과 해결방안 제시
4. 본론: 상세한 설명과 장점 부각
5. 결론: 핵심 메시지 강조와 행동 유도

각 섹션은 자연스럽게 연결되어야 하며, 
전체적으로 일관된 톤과 분위기를 유지하면서도 
시청자의 관심을 끝까지 유지할 수 있도록 작성해주세요."""
    user_message = f"다음 주제로 영상 스크립트를 작성해주세요: {topic}"

    try:
        completion = client.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
            ],
            max_tokens=2000,
            temperature=0.7
        )
        script = completion.choices[0].message.content.strip()
        return script
    except Exception as e:
        print(f"Error during script generation: {e}")
        return "스크립트 생성 중 오류가 발생했습니다."


def cleanup():
    """Free unreferenced Python objects, then release cached CUDA memory.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are destroyed before the CUDA cache is emptied;
    emptying the cache first would leave their blocks allocated.
    """
    gc.collect()
    torch.cuda.empty_cache()

with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange") as iface:
     # State holders for the currently selected output dimensions of each
     # tab; both tabs start at height 320, width 512 and 257 frames.
     (
          txt2vid_current_height,
          txt2vid_current_width,
          txt2vid_current_num_frames,
     ) = (gr.State(value=320), gr.State(value=512), gr.State(value=257))

     (
          img2vid_current_height,
          img2vid_current_width,
          img2vid_current_num_frames,
     ) = (gr.State(value=320), gr.State(value=512), gr.State(value=257))

     # Tab container for the three workflows. Components appear in the page
     # in the order they are created inside each context manager.
     with gr.Tabs():
          # Text to Video Tab
          with gr.TabItem("텍스트로 비디오 만들기"):
               with gr.Row():
                    # Left column: prompt, options and the generate button
                    with gr.Column():
                         txt2vid_prompt = gr.Textbox(
                              label="Step 1: 프롬프트 입력",
                              placeholder="생성하고 싶은 비디오를 설명하세요 (최소 50자)...",
                              value="귀여운 고양이",
                              lines=5,
                         )
                         # Prompt-enhancement switch, off by default
                         txt2vid_enhance_toggle = Toggle(
                              label="프롬프트 증강",
                              value=False,
                              interactive=True,
                         )
                         # Created with visible=False; its default value is
                         # still passed to the generate handler.
                         txt2vid_negative_prompt = gr.Textbox(
                              label="Step 2: 네거티브 프롬프트 입력",
                              placeholder="비디오에서 원하지 않는 요소를 설명하세요...",
                              value="low quality, worst quality, deformed, distorted, warped, motion smear, motion artifacts, fused fingers, incorrect anatomy, strange hands, unattractive",
                              lines=2,
                              visible=False
                         )
                         # Choices are the labels of preset_options; the
                         # default must be one of those labels.
                         txt2vid_preset = gr.Dropdown(
                              choices=[p["label"] for p in preset_options],
                              value="[16:9] 512x320, 10.3초",
                              label="Step 2: 해상도 프리셋 선택",
                         )
                         # Created with visible=False; the default of 25 is
                         # still passed to the generate handler.
                         txt2vid_frame_rate = gr.Slider(
                              label="Step 3: 프레임 레이트",
                              minimum=21,
                              maximum=30,
                              step=1,
                              value=25,
                              visible=False
                         )
                         # Indexed by position ([0]..[5]) when the event
                         # handlers are wired further down.
                         txt2vid_advanced = create_advanced_options()
                         txt2vid_generate = gr.Button(
                              "Step 3: 비디오 생성",
                              variant="primary",
                              size="lg",
                         )
                    # Right column: generated video
                    with gr.Column():
                         txt2vid_output = gr.Video(label="생성된 비디오")


# Image to Video Tab
          with gr.TabItem("이미지로 비디오 만들기"):
               with gr.Row():
                    # Left column: source image, prompt, options and the
                    # generate button
                    with gr.Column():
                         # type="filepath": the component is configured to
                         # hand the upload over as a file path.
                         img2vid_image = gr.Image(
                              type="filepath",
                              label="Step 1: 입력 이미지 업로드",
                              elem_id="image_upload",
                         )
                         img2vid_prompt = gr.Textbox(
                              label="Step 2: 프롬프트 입력",
                              placeholder="이미지를 어떻게 애니메이션화할지 설명하세요 (최소 50자)...",
                              value="귀여운 고양이",
                              lines=5,
                         )
                         # Prompt-enhancement switch, off by default
                         img2vid_enhance_toggle = Toggle(
                              label="프롬프트 증강",
                              value=False,
                              interactive=True,
                         )
                         # Created with visible=False; its default value is
                         # still passed to the generate handler.
                         img2vid_negative_prompt = gr.Textbox(
                              label="Step 3: 네거티브 프롬프트 입력",
                              placeholder="비디오에서 원하지 않는 요소를 설명하세요...",
                              value="low quality, worst quality, deformed, distorted, warped, motion smear, motion artifacts, fused fingers, incorrect anatomy, strange hands, unattractive",
                              lines=2,
                              visible=False
                         )
                         # Choices are the labels of preset_options; the
                         # default must be one of those labels.
                         img2vid_preset = gr.Dropdown(
                              choices=[p["label"] for p in preset_options],
                              value="[16:9] 512x320, 10.3초",
                              label="Step 3: 해상도 프리셋 선택",
                         )
                         # Created with visible=False; the default of 25 is
                         # still passed to the generate handler.
                         img2vid_frame_rate = gr.Slider(
                              label="Step 4: 프레임 레이트",
                              minimum=21,
                              maximum=30,
                              step=1,
                              value=25,
                              visible=False
                         )
                         # Indexed by position ([0]..[5]) when the event
                         # handlers are wired further down.
                         img2vid_advanced = create_advanced_options()
                         img2vid_generate = gr.Button(
                              "Step 4: 비디오 생성",
                              variant="primary",
                              size="lg",
                         )
                    # Right column: generated video
                    with gr.Column():
                         img2vid_output = gr.Video(label="생성된 비디오")


# Scenario Tab
          with gr.TabItem("시나리오로 비디오 만들기(숏폼)"):
               with gr.Row():
                    # Left column (scale=1): script authoring and analysis
                    with gr.Column(scale=1):
                         # Topic used to generate a script
                         script_topic = gr.Textbox(
                              label="스크립트 생성",
                              placeholder="겨울 일본 온천 여행을 주제로 밝은 느낌으로 스크립트 생성하라",
                              lines=2
                         )
                         generate_script_btn = gr.Button("스크립트 생성", variant="primary")
                         
                         # Full script; receives the generated script and is
                         # the input for analysis and prompt regeneration.
                         scenario_input = gr.Textbox(
                              label="영상 스크립트 입력",
                              placeholder="전체 시나리오를 입력하세요...",
                              lines=10
                         )
                         # Resolution preset shared by all section videos
                         scenario_preset = gr.Dropdown(
                              choices=[p["label"] for p in preset_options],
                              value="[16:9] 512x320, 10.3초",
                              label="화면 크기 선택"
                         )
                         analyze_btn = gr.Button("시나리오 분석 및 프롬프트 생성", variant="primary")

                    # Right column (scale=2): one prompt box, two buttons and
                    # one video player per script section
                    with gr.Column(scale=2):
                         with gr.Row():
                              # Section 1
                              with gr.Column():
                                   section1_prompt = gr.Textbox(
                                        label="1. 배경 및 필요성",
                                        lines=4
                                   )
                                   with gr.Row():
                                        section1_regenerate = gr.Button("🔄 프롬프트 생성")
                                        section1_generate = gr.Button("🔄 영상 생성")
                                   section1_video = gr.Video(label="섹션 1 영상")
                              
                              # Section 2
                              with gr.Column():
                                   section2_prompt = gr.Textbox(
                                        label="2. 흥미 유발",
                                        lines=4
                                   )
                                   with gr.Row():
                                        section2_regenerate = gr.Button("🔄 프롬프트 생성")
                                        section2_generate = gr.Button("🔄 영상 생성")
                                   section2_video = gr.Video(label="섹션 2 영상")



                         with gr.Row():
                              # Section 3
                              with gr.Column():
                                   section3_prompt = gr.Textbox(
                                        label="3. 해결책 제시",
                                        lines=4
                                   )
                                   with gr.Row():
                                        section3_regenerate = gr.Button("🔄 프롬프트 생성")
                                        section3_generate = gr.Button("🔄 영상 생성")
                                   section3_video = gr.Video(label="섹션 3 영상")
                              
                              # Section 4
                              with gr.Column():
                                   section4_prompt = gr.Textbox(
                                        label="4. 본론",
                                        lines=4
                                   )
                                   with gr.Row():
                                        section4_regenerate = gr.Button("🔄 프롬프트 생성")
                                        section4_generate = gr.Button("🔄 영상 생성")
                                   section4_video = gr.Video(label="섹션 4 영상")
                         
                         with gr.Row():
                              # Section 5
                              with gr.Column():
                                   section5_prompt = gr.Textbox(
                                        label="5. 결론 및 강조",
                                        lines=4
                                   )
                                   with gr.Row():
                                        section5_regenerate = gr.Button("🔄 프롬프트 생성")
                                        section5_generate = gr.Button("🔄 영상 생성")
                                   section5_video = gr.Video(label="섹션 5 영상")

                         # Merged video section: button on the left, player
                         # for the merged result on the right
                         with gr.Row():
                              with gr.Column(scale=1):
                                   merge_videos_btn = gr.Button("통합 영상 생성", variant="primary", size="lg")
                              
                              with gr.Column(scale=2):
                                   with gr.Row():
                                        merged_video_output = gr.Video(label="통합 영상")


# Text to Video Tab handlers
     # Outputs refreshed when a preset is picked: the three state holders,
     # then positions 3-5 of the advanced options (height, width and
     # frame-count sliders).
     txt2vid_preset_outputs = [
          txt2vid_current_height,
          txt2vid_current_width,
          txt2vid_current_num_frames,
     ]
     txt2vid_preset_outputs += [txt2vid_advanced[pos] for pos in (3, 4, 5)]
     txt2vid_preset.change(
          fn=preset_changed,
          inputs=[txt2vid_preset],
          outputs=txt2vid_preset_outputs,
     )

     # The toggle handler's result is written back into the prompt textbox.
     txt2vid_enhance_toggle.change(
          fn=update_prompt_t2v,
          inputs=[txt2vid_prompt, txt2vid_enhance_toggle],
          outputs=txt2vid_prompt,
     )

     # Positional arguments for generate_video_from_text: prompt fields,
     # frame rate, positions 0-2 of the advanced options (seed, inference
     # steps, guidance scale), then the current dimensions.
     txt2vid_generate_inputs = [
          txt2vid_prompt,
          txt2vid_enhance_toggle,
          txt2vid_negative_prompt,
          txt2vid_frame_rate,
     ]
     txt2vid_generate_inputs += [txt2vid_advanced[pos] for pos in (0, 1, 2)]
     txt2vid_generate_inputs += [
          txt2vid_current_height,
          txt2vid_current_width,
          txt2vid_current_num_frames,
     ]
     txt2vid_generate.click(
          fn=generate_video_from_text,
          inputs=txt2vid_generate_inputs,
          outputs=txt2vid_output,
     )

     # Image to Video Tab handlers
     # Outputs refreshed when a preset is picked: the three state holders,
     # then positions 3-5 of the advanced options (height, width and
     # frame-count sliders).
     img2vid_preset_outputs = [
          img2vid_current_height,
          img2vid_current_width,
          img2vid_current_num_frames,
     ]
     img2vid_preset_outputs += [img2vid_advanced[pos] for pos in (3, 4, 5)]
     img2vid_preset.change(
          fn=preset_changed,
          inputs=[img2vid_preset],
          outputs=img2vid_preset_outputs,
     )

     # The toggle handler's result is written back into the prompt textbox.
     img2vid_enhance_toggle.change(
          fn=update_prompt_i2v,
          inputs=[img2vid_prompt, img2vid_enhance_toggle],
          outputs=img2vid_prompt,
     )

     # Positional arguments for generate_video_from_image: source image,
     # prompt fields, frame rate, positions 0-2 of the advanced options
     # (seed, inference steps, guidance scale), then the current dimensions.
     img2vid_generate_inputs = [
          img2vid_image,
          img2vid_prompt,
          img2vid_enhance_toggle,
          img2vid_negative_prompt,
          img2vid_frame_rate,
     ]
     img2vid_generate_inputs += [img2vid_advanced[pos] for pos in (0, 1, 2)]
     img2vid_generate_inputs += [
          img2vid_current_height,
          img2vid_current_width,
          img2vid_current_num_frames,
     ]
     img2vid_generate.click(
          fn=generate_video_from_image,
          inputs=img2vid_generate_inputs,
          outputs=img2vid_output,
     )



# Scenario Tab handlers
     # Topic -> generated script, written into the scenario textbox.
     generate_script_btn.click(
          fn=generate_script,
          inputs=[script_topic],
          outputs=[scenario_input],
     )

     # Per-section widgets in section order: position i belongs to
     # section number i + 1.
     _section_prompt_boxes = [
          section1_prompt,
          section2_prompt,
          section3_prompt,
          section4_prompt,
          section5_prompt,
     ]
     _section_regenerate_buttons = [
          section1_regenerate,
          section2_regenerate,
          section3_regenerate,
          section4_regenerate,
          section5_regenerate,
     ]
     _section_generate_buttons = [
          section1_generate,
          section2_generate,
          section3_generate,
          section4_generate,
          section5_generate,
     ]
     _section_video_outputs = [
          section1_video,
          section2_video,
          section3_video,
          section4_video,
          section5_video,
     ]

     # Scenario analysis fills all five section prompts in one call.
     analyze_btn.click(
          fn=analyze_scenario,
          inputs=[scenario_input],
          outputs=list(_section_prompt_boxes),
     )

     def _section_prompt_handler(section_number):
          """Build the click handler that regenerates one section's prompt."""
          # A factory gives each handler its own section number instead of
          # sharing the loop variable.
          return lambda x: generate_single_section_prompt(x, section_number)

     def _section_video_handler(section_number):
          """Build the click handler that generates one section's video."""
          return lambda p, pr: generate_section_video(p, pr, section_number)

     # Per-section prompt regeneration handlers
     for _section_no, (_regenerate_button, _prompt_box) in enumerate(
          zip(_section_regenerate_buttons, _section_prompt_boxes),
          start=1,
     ):
          _regenerate_button.click(
               fn=_section_prompt_handler(_section_no),
               inputs=[scenario_input],
               outputs=_prompt_box,
          )

     # Per-section video generation handlers
     for _section_no, (_generate_button, _prompt_box, _video_output) in enumerate(
          zip(_section_generate_buttons, _section_prompt_boxes, _section_video_outputs),
          start=1,
     ):
          _generate_button.click(
               fn=_section_video_handler(_section_no),
               inputs=[_prompt_box, scenario_preset],
               outputs=_video_output,
          )

     # Merged video handler: all five section videos in, one video out.
     merge_videos_btn.click(
          fn=merge_section_videos,
          inputs=list(_section_video_outputs),
          outputs=merged_video_output,
     )

if __name__ == "__main__":
     # Configure the request queue first, then launch whatever queue()
     # returned, exactly as the chained call did.
     queued_app = iface.queue(
          max_size=64,
          default_concurrency_limit=1,
          api_open=False,
     )
     queued_app.launch(share=True, show_api=False)