Spaces:

VIDraft
/

Portrait-Animation

Running on Zero

File size: 5,619 Bytes

# app.py
import os, io, hashlib, spaces, gradio as gr
from pydub import AudioSegment
from PIL   import Image
import numpy as np
from sonic import Sonic          # <-- 수정된 sonic.py 사용

# ------------------------------------------------------------------
# 1. Install required libraries & download model checkpoints
# ------------------------------------------------------------------

# A single shell command line made of four steps separated by ';':
#   1. pip-install the Hugging Face CLI extra and accelerate
#   2. download LeonJoe13/Sonic                              -> checkpoints/Sonic
#   3. download stabilityai/stable-video-diffusion-img2vid-xt -> checkpoints/stable-video-diffusion-img2vid-xt
#   4. download openai/whisper-tiny                          -> checkpoints/whisper-tiny
# Because the steps are joined with ';' (not '&&'), each step is attempted
# even if an earlier one failed.
SETUP_CMD = (
    'python -m pip install "huggingface_hub[cli]" accelerate; '
    'huggingface-cli download LeonJoe13/Sonic '
    '    --local-dir checkpoints/Sonic --local-dir-use-symlinks False; '
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt '
    '    --local-dir checkpoints/stable-video-diffusion-img2vid-xt --local-dir-use-symlinks False; '
    'huggingface-cli download openai/whisper-tiny '
    '    --local-dir checkpoints/whisper-tiny --local-dir-use-symlinks False; '
)
# Executed at import time, before the pipeline below is constructed.
# NOTE(review): the exit status returned by os.system() is discarded, so a
# failed install or download is not detected at this point.
os.system(SETUP_CMD)

# ------------------------------------------------------------------
# 2. Pipeline initialization (GPU, one time only)
# ------------------------------------------------------------------
pipe = Sonic()     # module-level instance, initialized with the patched (error-free) sonic.py

# ------------------------------------------------------------------
# 3. Utilities
# ------------------------------------------------------------------
def _md5(b: bytes) -> str:
    """Return the hexadecimal MD5 digest of *b*.

    The digest is a content fingerprint, not a security measure, so the
    hash object is created with ``usedforsecurity=False``. This keeps the
    function working on Python builds where MD5 is blocked for security
    use; the resulting digest is unchanged.

    Args:
        b: Raw bytes to fingerprint.

    Returns:
        The 32-character lowercase hex digest.
    """
    return hashlib.md5(b, usedforsecurity=False).hexdigest()

# Working directories, created eagerly at import time (relative to the CWD).
TMP_DIR = "tmp_path"   # cached input files (hashed image / audio)
RES_DIR = "res_path"   # rendered result videos
for _work_dir in (TMP_DIR, RES_DIR):
    os.makedirs(_work_dir, exist_ok=True)
del _work_dir          # do not leave the loop variable in the module namespace

# ------------------------------------------------------------------
# 4. Actual video generation (GPU task)
# ------------------------------------------------------------------
@spaces.GPU(duration=600)     # at most 10 minutes
def _render_video(img_path: str,
                  audio_path: str,
                  out_path: str,
                  dynamic_scale: float = 1.0) -> str | int:
    """Render a video from a portrait image and an audio file.

    The number of inference steps is derived from the audio length
    (12.5 steps per second, clamped to the range 25..750).

    Args:
        img_path: Path of the portrait image on disk.
        audio_path: Path of the audio file on disk.
        out_path: Path where the resulting video is written. Its parent
            directory is created if it does not exist.
        dynamic_scale: Forwarded unchanged to ``pipe.process``.

    Returns:
        ``out_path`` on success, or ``-1`` when ``pipe.preprocess`` reports
        no face in the image (``face_num == 0``).
    """
    min_resolution = 512
    audio = AudioSegment.from_file(audio_path)
    duration_sec = len(audio) / 1000.0          # AudioSegment length is in ms
    steps = int(np.clip(duration_sec * 12.5, 25, 750))

    print(f"[INFO] Audio duration={duration_sec:.2f}s → inference_steps={steps}")

    face_info = pipe.preprocess(img_path)
    print(f"[INFO] Face detection info: {face_info}")

    if face_info["face_num"] == 0:
        return -1        # no face detected

    # os.path.dirname() yields "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    pipe.process(
        img_path, audio_path, out_path,
        min_resolution=min_resolution,
        inference_steps=steps,
        dynamic_scale=dynamic_scale,
    )
    return out_path

# ------------------------------------------------------------------
# 5. Gradio wrapper
# ------------------------------------------------------------------
def run_sonic(image, audio, dynamic_scale):
    """Gradio callback: cache the inputs on disk and return a video path.

    The image and the normalized audio are each written to ``TMP_DIR`` under
    a name derived from the MD5 of their encoded bytes. The result path in
    ``RES_DIR`` combines both hashes with ``dynamic_scale`` (one decimal), so
    an identical request is served from the existing file without rendering.

    Args:
        image: Object with a ``save(file, format=...)`` method (the UI
            passes a PIL image), or ``None`` when nothing was uploaded.
        audio: Sequence whose first two items are ``(sample_rate, samples)``
            where ``samples`` is a NumPy array of shape ``(n,)`` or
            ``(n, channels)``, or ``None`` when nothing was uploaded.
        dynamic_scale: Number forwarded to ``_render_video``.

    Returns:
        Path of the generated (or cached) ``.mp4`` file.

    Raises:
        gr.Error: If the image or audio is missing, or if no face was
            detected in the image.
    """
    if image is None:
        raise gr.Error("Please upload an image.")
    if audio is None:
        raise gr.Error("Please upload an audio file.")

    # ── Image cache ───────────────────────────────────────────────
    buf_i = io.BytesIO()
    image.save(buf_i, format="PNG")
    img_bytes = buf_i.getvalue()
    img_hash = _md5(img_bytes)
    img_path = os.path.join(TMP_DIR, f"{img_hash}.png")
    if not os.path.exists(img_path):
        with open(img_path, "wb") as f:
            f.write(img_bytes)

    # ── Audio cache (mono / 16 kHz, ≤60 s) ────────────────────────
    rate, arr = audio[:2]
    if arr.ndim == 1:
        arr = arr[:, None]          # treat 1-D input as a single channel
    seg = AudioSegment(arr.tobytes(), frame_rate=rate,
                       sample_width=arr.dtype.itemsize, channels=arr.shape[1])
    # Down-mix to mono, resample to 16 kHz, keep the first 60 000 ms.
    seg = seg.set_channels(1).set_frame_rate(16000)[:60_000]
    buf_a = io.BytesIO()
    seg.export(buf_a, format="wav")
    aud_bytes = buf_a.getvalue()
    aud_hash = _md5(aud_bytes)
    aud_path = os.path.join(TMP_DIR, f"{aud_hash}.wav")
    if not os.path.exists(aud_path):
        with open(aud_path, "wb") as f:
            f.write(aud_bytes)

    # ── Result path ───────────────────────────────────────────────
    out_path = os.path.join(
        RES_DIR, f"{img_hash}_{aud_hash}_{dynamic_scale:.1f}.mp4"
    )

    if os.path.exists(out_path):
        print(f"[INFO] Cache hit → {out_path}")
        return out_path

    print(f"[INFO] Generating video (dynamic_scale={dynamic_scale}) …")
    result = _render_video(img_path, aud_path, out_path, dynamic_scale)

    # _render_video signals "no face detected" with -1; that value is not a
    # video path, so surface it to the user as an error instead.
    if result == -1:
        raise gr.Error(
            "No face detected in the image. Please upload a clear portrait photo."
        )
    return result

# ------------------------------------------------------------------
# 6. Gradio UI
# ------------------------------------------------------------------
# Custom stylesheet: page font plus the centered ".main-header" banner
# used by the HTML block below.
CSS = """
.gradio-container{font-family:Arial, sans-serif}
.main-header{text-align:center;color:#2a2a2a;margin-bottom:2em}
"""

with gr.Blocks(css=CSS) as demo:
    # Title banner.
    gr.HTML("""
    <div class="main-header">
        <h1>🎭 Sonic - Portrait Animation</h1>
        <p>Turn a single photo into a talking-head video (≤1 min audio)</p>
    </div>""")

    with gr.Row():
        # First column: inputs and the trigger button.
        with gr.Column():
            # type="pil" / type="numpy" match what run_sonic expects:
            # an object with .save() and a (sample_rate, samples) tuple.
            img_in  = gr.Image(type="pil",  label="Portrait Image")
            aud_in  = gr.Audio(label="Voice / Audio (≤60 s)", type="numpy")
            # presumably (minimum, maximum, initial value) = (0.5, 2.0, 1.0)
            # — confirm against the installed Gradio version
            scale   = gr.Slider(0.5, 2.0, 1.0, step=0.1,
                                label="Animation Intensity")
            btn     = gr.Button("Generate Animation", variant="primary")
        # Second column: the resulting video.
        with gr.Column():
            vid_out = gr.Video(label="Result")

    # Clicking the button calls run_sonic(image, audio, dynamic_scale) and
    # shows its return value in the video component.
    btn.click(run_sonic, [img_in, aud_in, scale], vid_out)

# Starts the app at import time (there is no __main__ guard).
# NOTE(review): share=True asks Gradio for a public share link — confirm
# this is intended for the hosting environment.
demo.launch(share=True)