import os, io, hashlib, gradio as gr, spaces
from pydub import AudioSegment
from PIL import Image
from sonic import Sonic

# --------------------------------------------------------------
# 1) Download only the required checkpoints (Sonic, SVD-XT, Whisper-tiny)
# --------------------------------------------------------------
DL_CMDS = [
    # Sonic (weights, unet, etc.)
    "huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints/LeonJoe13-Sonic --local-dir-use-symlinks False -q",
    # stable-video-diffusion-img2vid-xt (VAE/UNet/CLIP)
    "huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt "
    "--local-dir checkpoints/stable-video-diffusion-img2vid-xt --local-dir-use-symlinks False -q",
    # whisper-tiny
    "huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny --local-dir-use-symlinks False -q",
]
for cmd in DL_CMDS:
    os.system(cmd)
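# (An equivalent, shell-free alternative would be huggingface_hub, which
#  huggingface-cli wraps under the hood, e.g.:
#      from huggingface_hub import snapshot_download
#      snapshot_download("LeonJoe13/Sonic", local_dir="checkpoints/LeonJoe13-Sonic"))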

pipe = Sonic()   # automatically locates the checkpoints downloaded above

# ------------------------------------------------------------------
def md5(b: bytes) -> str:          # fast 32-character hex digest
    return hashlib.md5(b).hexdigest()

TMP_DIR, RES_DIR = "./tmp_path", "./res_path"
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(RES_DIR, exist_ok=True)

# ------------------------------------------------------------------
@spaces.GPU(duration=600)          # allow up to 10 minutes of GPU time
def get_video_res(img_p: str, wav_p: str, out_p: str, scale: float):
    """실제 Sonic 파이프라인 호출(얼굴 체크·프레임·interpolate 포함)"""
    audio = AudioSegment.from_file(wav_p)
    dur   = len(audio) / 1000.0
    steps = max(25, min(int(dur * 12.5), 750))   # clamp to [25, 750] at 12.5 fps
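    # e.g. 10 s of audio -> int(10 * 12.5) = 125 inference steps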

    print(f"[INFO] Audio duration {dur:.2f}s ➜ steps {steps}")

    face = pipe.preprocess(img_p)
    print("[INFO] Face detection:", face)
    if face["face_num"] == 0:
        return -1                                 # no face detected

    pipe.process(img_p, wav_p, out_p,
                 min_resolution=512,
                 inference_steps=steps,
                 dynamic_scale=scale)
    return out_p

# ------------------------------------------------------------------
def run_sonic(image, audio, scale):
    """Gradio 버튼 연결 함수 (캐싱·전처리)"""
    if image is None: raise gr.Error("Please upload an image.")
    if audio is None: raise gr.Error("Please upload an audio file.")

    # ---- Save image & hash it ----------------------------------------------
    buf_img = io.BytesIO(); image.save(buf_img, "PNG")
    img_key = md5(buf_img.getvalue())
    img_path = os.path.join(TMP_DIR, f"{img_key}.png")
    if not os.path.exists(img_path):
        with open(img_path, "wb") as f: f.write(buf_img.getvalue())

    # ---- Audio → mono/16 kHz WAV (≤60 s) -----------------------------------
    # gr.Audio(type="numpy") provides a (sample_rate, np.ndarray) tuple
    sr, arr = audio[:2]
    arr = arr if arr.ndim == 2 else arr[:, None]
    seg = AudioSegment(arr.tobytes(), frame_rate=sr,
                       sample_width=arr.dtype.itemsize,
                       channels=arr.shape[1]
           ).set_channels(1).set_frame_rate(16_000)[:60_000]
    buf_wav = io.BytesIO(); seg.export(buf_wav, format="wav")
    wav_key = md5(buf_wav.getvalue())
    wav_path = os.path.join(TMP_DIR, f"{wav_key}.wav")
    if not os.path.exists(wav_path):
        with open(wav_path, "wb") as f: f.write(buf_wav.getvalue())

    # ---- Output file path --------------------------------------------------
    out_path = os.path.join(RES_DIR, f"{img_key}_{wav_key}_{scale}.mp4")
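    # e.g. ./res_path/<img_md5>_<wav_md5>_1.0.mp4 for the default scale of 1.0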

    # ---- Cache check --------------------------------------------------------
    if os.path.exists(out_path):
        print("[INFO] Cached video used.")
        return out_path

    print(f"[INFO] Generating video (scale={scale}) …")
    res = get_video_res(img_path, wav_path, out_path, scale)
    if res == -1:
        raise gr.Error("No face detected in the image.")
    return res

# ------------------------------------------------------------------
#                     Gradio UI
# ------------------------------------------------------------------
css = """
.gradio-container {font-family: Arial, sans-serif;}
.main-header     {text-align:center;color:#2a2a2a;margin-bottom:2em;}
"""

with gr.Blocks(css=css) as demo:
    gr.HTML(
        "<div class='main-header'>"
        "<h1>🎭 Sonic: Portrait-to-Video Animator</h1>"
        "<p>Create talking-head videos (≤60 s audio)</p>"
        "</div>"
    )

    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Portrait Image")
            aud_in = gr.Audio(type="numpy", label="Voice/Audio (≤60 s)")
            scale  = gr.Slider(0.5, 2.0, 1.0, 0.1,
                               label="Animation Intensity")
            btn    = gr.Button("Generate", variant="primary")
        vid_out = gr.Video(label="Generated Animation")

    btn.click(run_sonic, inputs=[img_in, aud_in, scale], outputs=vid_out)

    gr.HTML(
        "<div style='text-align:center;margin-top:1.5em'>"
        "<a href='https://github.com/jixiaozhong/Sonic' target='_blank'>GitHub</a> | "
        "<a href='https://arxiv.org/pdf/2411.16331' target='_blank'>Paper</a>"
        "</div>"
    )

demo.launch(share=True)