🎭 Sonic: Advanced Portrait Animation
Transform still images into dynamic videos synchronized with audio (up to 1 minute)
import spaces import gradio as gr import os import numpy as np from pydub import AudioSegment import hashlib import io from sonic import Sonic from PIL import Image import torch # 초기 실행 시 필요한 모델들을 다운로드 cmd = ( 'python3 -m pip install "huggingface_hub[cli]" accelerate; ' 'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; ' 'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; ' 'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;' ) os.system(cmd) pipe = Sonic() def get_md5(content_bytes: bytes): """MD5 해시를 계산하여 32자리 문자열을 반환""" return hashlib.md5(content_bytes).hexdigest() tmp_path = './tmp_path/' res_path = './res_path/' os.makedirs(tmp_path, exist_ok=True) os.makedirs(res_path, exist_ok=True) @spaces.GPU(duration=600) # 긴 비디오 처리를 위해 duration 600초로 설정 (10분) def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0): """ Sonic pipeline으로부터 실제 비디오를 생성하는 함수. 최대 60초 길이의 오디오에 대해 inference_steps를 결정하여, 얼굴 탐지 후 영상 생성 작업을 수행함. """ expand_ratio = 0.0 min_resolution = 512 # 오디오 길이 계산 audio = AudioSegment.from_file(audio_path) duration = len(audio) / 1000.0 # 초 단위 # 오디오 길이에 따라 inference_steps 결정 (최소 25프레임 ~ 최대 750프레임) inference_steps = min(max(int(duration * 12.5), 25), 750) print(f"[INFO] Audio duration: {duration:.2f} seconds, using inference_steps={inference_steps}") # 얼굴 인식 face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio) print(f"[INFO] Face detection info: {face_info}") # 얼굴이 하나라도 검출되면 -> pipeline 진행 if face_info['face_num'] > 0: os.makedirs(os.path.dirname(res_video_path), exist_ok=True) pipe.process( img_path, audio_path, res_video_path, min_resolution=min_resolution, inference_steps=inference_steps, dynamic_scale=dynamic_scale ) return res_video_path else: # 얼굴이 전혀 없으면 -1 리턴 return -1 def process_sonic(image, audio, dynamic_scale): """ Gradio 인터페이스에서 호출되는 함수: 1. 이미지/오디오 검사 2. MD5 해시 -> 파일명 3. 캐시 검사 -> 없으면 영상 생성 """ if image is None: raise gr.Error("Please upload an image") if audio is None: raise gr.Error("Please upload an audio file") # (1) 이미지 MD5 buf_img = io.BytesIO() image.save(buf_img, format="PNG") img_bytes = buf_img.getvalue() img_md5 = get_md5(img_bytes) # (2) 오디오 MD5 sampling_rate, arr = audio[:2] if len(arr.shape) == 1: arr = arr[:, None] audio_segment = AudioSegment( arr.tobytes(), frame_rate=sampling_rate, sample_width=arr.dtype.itemsize, channels=arr.shape[1] ) # Whisper 호환을 위해 mono/16kHz로 변환 audio_segment = audio_segment.set_channels(1).set_frame_rate(16000) MAX_DURATION_MS = 60000 if len(audio_segment) > MAX_DURATION_MS: audio_segment = audio_segment[:MAX_DURATION_MS] buf_audio = io.BytesIO() audio_segment.export(buf_audio, format="wav") audio_bytes = buf_audio.getvalue() audio_md5 = get_md5(audio_bytes) # (3) 파일 경로 image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png')) audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav')) res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4')) if not os.path.exists(image_path): with open(image_path, "wb") as f: f.write(img_bytes) if not os.path.exists(audio_path): with open(audio_path, "wb") as f: f.write(audio_bytes) # (4) 캐싱된 결과가 있으면 재사용 if os.path.exists(res_video_path): print(f"[INFO] Using cached result: {res_video_path}") return res_video_path else: print(f"[INFO] Generating new video with dynamic_scale={dynamic_scale}") video_result = get_video_res(image_path, audio_path, res_video_path, dynamic_scale) return video_result def get_example(): return [] css = """ .gradio-container { font-family: 'Arial', sans-serif; } .main-header { text-align: center; color: #2a2a2a; margin-bottom: 2em; } .parameter-section { background-color: #f5f5f5; padding: 1em; border-radius: 8px; margin: 1em 0; } .example-section { margin-top: 2em; } """ with gr.Blocks(css=css) as demo: gr.HTML("""
Transform still images into dynamic videos synchronized with audio (up to 1 minute)