# app.py
import os
import io
import hashlib

import spaces
import gradio as gr
import numpy as np
from pydub import AudioSegment
from PIL import Image

from sonic import Sonic  # <-- uses the patched sonic.py

# ------------------------------------------------------------------
# 1. Install required models/libraries & download checkpoints
# ------------------------------------------------------------------
SETUP_CMD = (
    'python -m pip install "huggingface_hub[cli]" accelerate; '
    'huggingface-cli download LeonJoe13/Sonic '
    ' --local-dir checkpoints/Sonic --local-dir-use-symlinks False; '
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt '
    ' --local-dir checkpoints/stable-video-diffusion-img2vid-xt --local-dir-use-symlinks False; '
    'huggingface-cli download openai/whisper-tiny '
    ' --local-dir checkpoints/whisper-tiny --local-dir-use-symlinks False; '
)
os.system(SETUP_CMD)

# ------------------------------------------------------------------
# 2. Initialize the pipeline (on GPU, once)
# ------------------------------------------------------------------
pipe = Sonic()  # initializes cleanly with the patched sonic.py

# ------------------------------------------------------------------
# 3. Utilities
# ------------------------------------------------------------------
def _md5(b: bytes) -> str:
    return hashlib.md5(b).hexdigest()

TMP_DIR = "tmp_path"
RES_DIR = "res_path"
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(RES_DIR, exist_ok=True)

# ------------------------------------------------------------------
# 4. Actual video generation (GPU task)
# ------------------------------------------------------------------
@spaces.GPU(duration=600)  # up to 10 minutes
def _render_video(img_path: str, audio_path: str, out_path: str,
                  dynamic_scale: float = 1.0) -> str | int:

    min_resolution = 512
    audio = AudioSegment.from_file(audio_path)
    duration_sec = len(audio) / 1000.0
    steps = int(np.clip(duration_sec * 12.5, 25, 750))
    print(f"[INFO] Audio duration={duration_sec:.2f}s → inference_steps={steps}")

    face_info = pipe.preprocess(img_path)
    print(f"[INFO] Face detection info: {face_info}")
    if face_info["face_num"] == 0:
        return -1  # no face detected

    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    pipe.process(
        img_path,
        audio_path,
        out_path,
        min_resolution=min_resolution,
        inference_steps=steps,
        dynamic_scale=dynamic_scale,
    )
    return out_path
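# Note on the step count in _render_video(): steps = clip(duration_sec * 12.5,
# 25, 750), i.e. 12.5 inference steps per second of audio. For example, a 4 s
# clip gets 50 steps, anything under 2 s is clamped up to the 25-step floor,
# and the 750-step ceiling corresponds exactly to the 60 s audio cap enforced
# in run_sonic() below.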
# ------------------------------------------------------------------
# 5. Gradio wrapper
# ------------------------------------------------------------------
def run_sonic(image, audio, dynamic_scale):
    if image is None:
        raise gr.Error("Please upload an image.")
    if audio is None:
        raise gr.Error("Please upload an audio file.")

    # ── Image cache ──────────────────────────────────────────────
    buf_i = io.BytesIO()
    image.save(buf_i, format="PNG")
    img_hash = _md5(buf_i.getvalue())
    img_path = os.path.join(TMP_DIR, f"{img_hash}.png")
    if not os.path.exists(img_path):
        with open(img_path, "wb") as f:
            f.write(buf_i.getvalue())

    # ── Audio cache (mono/16 kHz, ≤60 s) ─────────────────────────
    rate, arr = audio[:2]
    if arr.ndim == 1:
        arr = arr[:, None]
    seg = AudioSegment(
        arr.tobytes(),
        frame_rate=rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1],
    )
    seg = seg.set_channels(1).set_frame_rate(16000)[:60_000]
    buf_a = io.BytesIO()
    seg.export(buf_a, format="wav")
    aud_hash = _md5(buf_a.getvalue())
    aud_path = os.path.join(TMP_DIR, f"{aud_hash}.wav")
    if not os.path.exists(aud_path):
        with open(aud_path, "wb") as f:
            f.write(buf_a.getvalue())

    # ── Result path ──────────────────────────────────────────────
    out_path = os.path.join(
        RES_DIR, f"{img_hash}_{aud_hash}_{dynamic_scale:.1f}.mp4"
    )
    if os.path.exists(out_path):
        print(f"[INFO] Cache hit → {out_path}")
        return out_path

    print(f"[INFO] Generating video (dynamic_scale={dynamic_scale}) …")
    result = _render_video(img_path, aud_path, out_path, dynamic_scale)
    if result == -1:  # sentinel from _render_video: no face found
        raise gr.Error("No face detected in the image. Please upload a clear portrait.")
    return result

# ------------------------------------------------------------------
# 6. Gradio UI
# ------------------------------------------------------------------
CSS = """
.gradio-container{font-family:Arial, sans-serif}
.main-header{text-align:center;color:#2a2a2a;margin-bottom:2em}
"""

with gr.Blocks(css=CSS) as demo:
    gr.HTML("""

    <div class="main-header">
        <h1>🎭 Sonic - Portrait Animation</h1>
        <p>Turn a single photo into a talking-head video (≤1 min audio)</p>
    </div>
    """)

    with gr.Row():
        with gr.Column():
            img_in = gr.Image(type="pil", label="Portrait Image")
            aud_in = gr.Audio(label="Voice / Audio (≤60 s)", type="numpy")
            scale = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Animation Intensity")
            btn = gr.Button("Generate Animation", variant="primary")
        with gr.Column():
            vid_out = gr.Video(label="Result")

    btn.click(run_sonic, [img_in, aud_in, scale], vid_out)

demo.launch(share=True)