import os, io, hashlib, gradio as gr, spaces
from pydub import AudioSegment
from PIL import Image
from sonic import Sonic

# --------------------------------------------------------------
# 1) Download only the required checkpoints (Sonic, SVD-XT, Whisper-tiny)
# --------------------------------------------------------------
DL_CMDS = [
    # Sonic (weights, unet, etc.)
    "huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints/LeonJoe13-Sonic --local-dir-use-symlinks False -q",
    # stable-video-diffusion-img2vid-xt (VAE/UNet/CLIP)
    "huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt "
    "--local-dir checkpoints/stable-video-diffusion-img2vid-xt --local-dir-use-symlinks False -q",
    # whisper-tiny
    "huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny --local-dir-use-symlinks False -q",
]
for cmd in DL_CMDS:
    os.system(cmd)
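# (An equivalent, shell-free alternative would be huggingface_hub, which
#  huggingface-cli wraps under the hood, e.g.:
#      from huggingface_hub import snapshot_download
#      snapshot_download("LeonJoe13/Sonic", local_dir="checkpoints/LeonJoe13-Sonic"))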

pipe = Sonic()   # automatically locates the checkpoints downloaded above

# ------------------------------------------------------------------
def md5(b: bytes) -> str:          # fast 32-character hex digest
    return hashlib.md5(b).hexdigest()

TMP_DIR, RES_DIR = "./tmp_path", "./res_path"
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(RES_DIR, exist_ok=True)

# ------------------------------------------------------------------
@spaces.GPU(duration=600)          # allow up to 10 minutes of GPU time
def get_video_res(img_p: str, wav_p: str, out_p: str, scale: float):
    """실제 Sonic 파이프라인 호출(얼굴 체크·프레임·interpolate 포함)"""
    audio = AudioSegment.from_file(wav_p)
    dur   = len(audio) / 1000.0
    steps = max(25, min(int(dur * 12.5), 750))   # clamp to [25, 750] at 12.5 fps
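    # e.g. 10 s of audio -> int(10 * 12.5) = 125 inference steps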

    print(f"[INFO] Audio duration {dur:.2f}s ➜ steps {steps}")

    face = pipe.preprocess(img_p)
    print("[INFO] Face detection:", face)
    if face["face_num"] == 0:
        return -1                                 # no face detected

    pipe.process(img_p, wav_p, out_p,
                 min_resolution=512,
                 inference_steps=steps,
                 dynamic_scale=scale)
    return out_p

# ------------------------------------------------------------------
def run_sonic(image, audio, scale):
    """Gradio 버튼 연결 함수 (캐싱·전처리)"""
    if image is None: raise gr.Error("Please upload an image.")
    if audio is None: raise gr.Error("Please upload an audio file.")

    # ---- Save image & hash it ----------------------------------------------
    buf_img = io.BytesIO(); image.save(buf_img, "PNG")
    img_key = md5(buf_img.getvalue())
    img_path = os.path.join(TMP_DIR, f"{img_key}.png")
    if not os.path.exists(img_path):
        with open(img_path, "wb") as f: f.write(buf_img.getvalue())

    # ---- Audio → mono/16 kHz WAV (≤60 s) -----------------------------------
    # gr.Audio(type="numpy") provides a (sample_rate, np.ndarray) tuple
    sr, arr = audio[:2]
    arr = arr if arr.ndim == 2 else arr[:, None]
    seg = AudioSegment(arr.tobytes(), frame_rate=sr,
                       sample_width=arr.dtype.itemsize,
                       channels=arr.shape[1]
           ).set_channels(1).set_frame_rate(16_000)[:60_000]
    buf_wav = io.BytesIO(); seg.export(buf_wav, format="wav")
    wav_key = md5(buf_wav.getvalue())
    wav_path = os.path.join(TMP_DIR, f"{wav_key}.wav")
    if not os.path.exists(wav_path):
        with open(wav_path, "wb") as f: f.write(buf_wav.getvalue())

    # ---- Output file path --------------------------------------------------
    out_path = os.path.join(RES_DIR, f"{img_key}_{wav_key}_{scale}.mp4")
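    # e.g. ./res_path/<img_md5>_<wav_md5>_1.0.mp4 for the default scale of 1.0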

    # ---- Cache check --------------------------------------------------------
    if os.path.exists(out_path):
        print("[INFO] Cached video used.")
        return out_path

    print(f"[INFO] Generating video (scale={scale}) …")
    res = get_video_res(img_path, wav_path, out_path, scale)
    if res == -1:
        raise gr.Error("No face detected in the image.")
    return res

# ------------------------------------------------------------------
#                     Gradio UI
# ------------------------------------------------------------------
css = """
.gradio-container {font-family: Arial, sans-serif;}
.main-header     {text-align:center;color:#2a2a2a;margin-bottom:2em;}
"""

with gr.Blocks(css=css) as demo:
    gr.HTML(
        "<div class='main-header'>"
        "<h1>🎭 Sonic: Portrait-to-Video Animator</h1>"
        "<p>Create talking-head videos (≤60 s audio)</p>"
        "</div>"
    )

    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Portrait Image")
            aud_in = gr.Audio(type="numpy", label="Voice/Audio (≤60 s)")
            scale  = gr.Slider(0.5, 2.0, 1.0, 0.1,
                               label="Animation Intensity")
            btn    = gr.Button("Generate", variant="primary")
        vid_out = gr.Video(label="Generated Animation")

    btn.click(run_sonic, inputs=[img_in, aud_in, scale], outputs=vid_out)

    gr.HTML(
        "<div style='text-align:center;margin-top:1.5em'>"
        "<a href='https://github.com/jixiaozhong/Sonic' target='_blank'>GitHub</a> | "
        "<a href='https://arxiv.org/pdf/2411.16331' target='_blank'>Paper</a>"
        "</div>"
    )

demo.launch(share=True)