File size: 5,619 Bytes
ff37225
 
9d31513
ff37225
 
 
2279f85
 
ff37225
 
9132603
ff37225
9132603
 
 
 
 
 
 
ff37225
 
 
 
 
 
 
 
 
 
 
 
2279f85
 
ff37225
 
 
 
2279f85
 
ff37225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2279f85
 
ff37225
 
 
 
 
 
 
 
 
 
 
 
 
2279f85
ff37225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2279f85
ff37225
2279f85
 
ff37225
 
2279f85
 
ff37225
2279f85
ff37225
 
 
9d31513
 
ff37225
 
 
 
 
 
2279f85
9d31513
 
ff37225
 
 
 
 
 
 
 
 
2279f85
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# app.py
import os, io, hashlib, spaces, gradio as gr
from pydub import AudioSegment
from PIL   import Image
import numpy as np
from sonic import Sonic          # <-- 수정된 sonic.py 사용

# ------------------------------------------------------------------
# 1. Install required libraries and download model checkpoints
# ------------------------------------------------------------------

# Shell command line run once at start-up: installs the Hugging Face CLI and
# accelerate, then downloads each checkpoint repo into checkpoints/<folder>.
# Every command ends with "; " so the pieces concatenate into a single line.
SETUP_CMD = 'python -m pip install "huggingface_hub[cli]" accelerate; ' + "".join(
    f"huggingface-cli download {repo_id} "
    f"    --local-dir checkpoints/{folder} --local-dir-use-symlinks False; "
    for repo_id, folder in (
        ("LeonJoe13/Sonic", "Sonic"),
        (
            "stabilityai/stable-video-diffusion-img2vid-xt",
            "stable-video-diffusion-img2vid-xt",
        ),
        ("openai/whisper-tiny", "whisper-tiny"),
    )
)
# os.system returns the shell's status instead of raising, so surface a
# failure here; otherwise a broken install/download only shows up later as an
# unrelated-looking error. Setup stays best-effort: warn and keep going.
# NOTE(review): the commands are chained with ";", so the status presumably
# reflects only the last command that ran — confirm if stricter checks are needed.
_setup_status = os.system(SETUP_CMD)
if _setup_status != 0:
    print(f"[WARN] Setup command returned status {_setup_status}; "
          "checkpoints may be missing or incomplete.")

# ------------------------------------------------------------------
# 2. Pipeline initialization (GPU, one time only)
# ------------------------------------------------------------------
# Build the Sonic pipeline once at import time; _render_video reuses this instance.
pipe = Sonic()     # initializes cleanly now that the earlier error is resolved

# ------------------------------------------------------------------
# 3. Utilities
# ------------------------------------------------------------------
def _md5(b: bytes) -> str:
    return hashlib.md5(b).hexdigest()

# Working directories (relative to the current working directory):
# TMP_DIR holds the cached input files, RES_DIR holds the rendered videos.
TMP_DIR, RES_DIR = "tmp_path", "res_path"
for _work_dir in (TMP_DIR, RES_DIR):
    # exist_ok=True makes repeated start-ups a no-op.
    os.makedirs(_work_dir, exist_ok=True)
del _work_dir

# ------------------------------------------------------------------
# 4. Actual video generation (GPU task)
# ------------------------------------------------------------------
@spaces.GPU(duration=600)     # at most 10 minutes
def _render_video(img_path: str,
                  audio_path: str,
                  out_path: str,
                  dynamic_scale: float = 1.0) -> str | int:
    """Render a talking-head video from a portrait image and an audio file.

    Args:
        img_path: Path of the portrait image; passed to ``pipe.preprocess``
            and ``pipe.process``.
        audio_path: Path of the audio file; its length decides the number of
            inference steps.
        out_path: Output location passed to ``pipe.process`` (presumably
            where the video file is written). Its parent directory is
            created when missing.
        dynamic_scale: Forwarded unchanged to ``pipe.process``.

    Returns:
        ``out_path`` after the pipeline has run, or ``-1`` when preprocessing
        reports zero faces in the image.
    """
    min_resolution = 512
    audio          = AudioSegment.from_file(audio_path)
    duration_sec   = len(audio) / 1000.0      # len() of the segment is treated as milliseconds
    # 12.5 steps per second of audio, clamped to [25, 750] (750 == 60 s * 12.5).
    steps          = int(np.clip(duration_sec * 12.5, 25, 750))

    print(f"[INFO] Audio duration={duration_sec:.2f}s → inference_steps={steps}")

    face_info = pipe.preprocess(img_path)
    print(f"[INFO] Face detection info: {face_info}")

    if face_info["face_num"] == 0:
        return -1        # no face detected

    # A bare filename has an empty directory part, and os.makedirs("") raises,
    # so only create the directory when there is one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    pipe.process(
        img_path, audio_path, out_path,
        min_resolution=min_resolution,
        inference_steps=steps,
        dynamic_scale=dynamic_scale,
    )
    return out_path

# ------------------------------------------------------------------
# 5. Gradio wrapper
# ------------------------------------------------------------------
def run_sonic(image, audio, dynamic_scale):
    """Gradio callback: cache the inputs, then return the path of the video.

    Args:
        image: Uploaded portrait, or ``None`` when nothing was uploaded. Must
            provide ``save(fp, format=...)`` (the UI passes a PIL image).
        audio: ``(sample_rate, samples)`` sequence as delivered by
            ``gr.Audio(type="numpy")``, or ``None``.
        dynamic_scale: Animation intensity; part of the result cache key
            (formatted to one decimal) and forwarded to ``_render_video``.

    Returns:
        Path of the rendered, or previously cached, ``.mp4`` file.

    Raises:
        gr.Error: If the image or the audio is missing, or if no face is
            detected in the image.
    """
    if image is None:
        raise gr.Error("Please upload an image.")
    if audio is None:
        raise gr.Error("Please upload an audio file.")

    # -- Image cache: the PNG file is named after the hash of its bytes ----
    png_buf = io.BytesIO()
    image.save(png_buf, format="PNG")
    png_bytes = png_buf.getvalue()
    img_hash = _md5(png_bytes)
    img_path = os.path.join(TMP_DIR, f"{img_hash}.png")
    if not os.path.exists(img_path):
        with open(img_path, "wb") as f:
            f.write(png_bytes)

    # -- Audio cache (mono / 16 kHz, at most 60 s) -------------------------
    rate, arr = audio[:2]
    if arr.ndim == 1:
        arr = arr[:, None]      # treat 1-D input as a single channel
    # NOTE(review): sample_width is taken from the dtype's item size, which
    # assumes integer PCM samples — confirm what the Audio component delivers.
    seg = AudioSegment(arr.tobytes(), frame_rate=rate,
                       sample_width=arr.dtype.itemsize, channels=arr.shape[1])
    seg = seg.set_channels(1).set_frame_rate(16000)[:60_000]   # slice is in ms
    wav_buf = io.BytesIO()
    seg.export(wav_buf, format="wav")
    wav_bytes = wav_buf.getvalue()
    aud_hash = _md5(wav_bytes)
    aud_path = os.path.join(TMP_DIR, f"{aud_hash}.wav")
    if not os.path.exists(aud_path):
        with open(aud_path, "wb") as f:
            f.write(wav_bytes)

    # -- Result path: keyed by image hash, audio hash and intensity --------
    out_path = os.path.join(
        RES_DIR, f"{img_hash}_{aud_hash}_{dynamic_scale:.1f}.mp4"
    )

    if os.path.exists(out_path):
        print(f"[INFO] Cache hit → {out_path}")
        return out_path

    print(f"[INFO] Generating video (dynamic_scale={dynamic_scale}) …")
    result = _render_video(img_path, aud_path, out_path, dynamic_scale)

    # _render_video signals "no face found" with -1; that value is not a
    # video path, so report it to the user instead of returning it.
    if result == -1:
        raise gr.Error(
            "No face detected in the image. Please upload a clear portrait photo."
        )
    return result

# ------------------------------------------------------------------
# 6. Gradio UI
# ------------------------------------------------------------------
# Custom stylesheet handed to gr.Blocks: sets the container font and styles
# the centered ".main-header" block used by the HTML banner below.
CSS = """
.gradio-container{font-family:Arial, sans-serif}
.main-header{text-align:center;color:#2a2a2a;margin-bottom:2em}
"""

# Page definition: an HTML banner, then one row with two columns — inputs and
# the trigger button in the first, the result video in the second.
with gr.Blocks(css=CSS) as demo:
    gr.HTML("""
    <div class="main-header">
        <h1>🎭 Sonic - Portrait Animation</h1>
        <p>Turn a single photo into a talking-head video (≤1 min audio)</p>
    </div>""")

    with gr.Row():
        with gr.Column():
            # type="pil" / type="numpy" select the formats run_sonic works with:
            # an object with .save() and a (sample_rate, samples) pair.
            img_in  = gr.Image(type="pil",  label="Portrait Image")
            aud_in  = gr.Audio(label="Voice / Audio (≤60 s)", type="numpy")
            # Slider from 0.5 to 2.0, starting at 1.0, in steps of 0.1.
            scale   = gr.Slider(0.5, 2.0, 1.0, step=0.1,
                                label="Animation Intensity")
            btn     = gr.Button("Generate Animation", variant="primary")
        with gr.Column():
            vid_out = gr.Video(label="Result")

    # Clicking the button calls run_sonic(image, audio, scale) and routes its
    # return value to the video component.
    btn.click(run_sonic, [img_in, aud_in, scale], vid_out)

# Start the app. share=True asks Gradio for a public share link.
# NOTE(review): confirm this is wanted on the hosting platform.
demo.launch(share=True)