🎭 Sonic: Advanced Portrait Animation
Transform still images into dynamic videos synchronized with audio (up to 1 minute)
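The one-minute limit comes from the frame budget used in the script: the number of inference frames is derived from the audio length at roughly 12.5 frames per second and clamped between 25 and 750 (60 s × 12.5 = 750), and any audio beyond 60 seconds is truncated before processing. A quick worked example of that rule (`frame_budget` is a hypothetical helper name; the script computes this expression inline):

# Worked example of the frame-budget rule from get_video_res() below.
# `frame_budget` is a hypothetical name; the app inlines this expression.
def frame_budget(duration_s: float) -> int:
    return min(max(int(duration_s * 12.5), 25), 750)

print(frame_budget(1.0))   # 25  -- short clips are floored at 25 frames
print(frame_budget(10.0))  # 125 -- 10 s * 12.5 frames/s
print(frame_budget(60.0))  # 750 -- the one-minute cap
print(frame_budget(90.0))  # 750 -- longer audio is truncated to 60 s anyway

The full app script follows.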
import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch

# Model initialization: fetch the required checkpoints from the Hugging Face Hub
cmd = (
    'python3 -m pip install "huggingface_hub[cli]"; '
    'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
    'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
)
os.system(cmd)

pipe = Sonic()

def get_md5(content):
    md5hash = hashlib.md5(content)
    return md5hash.hexdigest()

tmp_path = './tmp_path/'
res_path = './res_path/'
os.makedirs(tmp_path, exist_ok=True)
os.makedirs(res_path, exist_ok=True)

@spaces.GPU(duration=600)  # 600 s (10 min) of GPU time to accommodate long videos
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    # ============================
    # Use up to 60 seconds of audio
    # ============================
    expand_ratio = 0.0
    min_resolution = 512

    # Measure the audio length with pydub
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # seconds

    # Derive inference_steps from the audio length (about 12.5 frames per second),
    # clamped to at least 25 and at most 750 frames (60 s => 60 * 12.5 = 750)
    inference_steps = min(max(int(duration * 12.5), 25), 750)
    print(f"Audio duration: {duration:.2f} seconds, using inference_steps: {inference_steps}")

    # Face detection (face_info is informational)
    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")

    # If at least one face is detected (> 0), keep the original image aspect ratio
    if face_info['face_num'] > 0:
        os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
        pipe.process(
            img_path,
            audio_path,
            res_video_path,
            min_resolution=min_resolution,
            inference_steps=inference_steps,
            dynamic_scale=dynamic_scale
        )
        return res_video_path
    else:
        return -1

def process_sonic(image, audio, dynamic_scale):
    # Validate inputs
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    img_md5 = get_md5(np.array(image))
    audio_md5 = get_md5(audio[1])
    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")

    sampling_rate, arr = audio[:2]
    if len(arr.shape) == 1:
        arr = arr[:, None]

    # Convert the numpy array into an AudioSegment
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1]
    )
    audio_segment = audio_segment.set_frame_rate(sampling_rate)

    # Enforce the audio length limit (60 seconds maximum)
    MAX_DURATION_MS = 60000  # 60 seconds
    if len(audio_segment) > MAX_DURATION_MS:
        print(f"Audio longer than 60 seconds ({len(audio_segment)/1000:.2f}s). Truncating to 60 seconds.")
        audio_segment = audio_segment[:MAX_DURATION_MS]

    # Build file paths
    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))

    # Cache the image/audio files to disk
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    # Reuse a cached result if one already exists
    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path
    else:
        print(f"Generating new video with dynamic scale: {dynamic_scale}")
        return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)

# Dummy function for example data
def get_example():
    return []

css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.main-header {
    text-align: center;
    color: #2a2a2a;
    margin-bottom: 2em;
}
.parameter-section {
    background-color: #f5f5f5;
    padding: 1em;
    border-radius: 8px;
    margin: 1em 0;
}
.example-section {
    margin-top: 2em;
}
"""

with gr.Blocks(css=css) as demo:
    gr.HTML("""
    <div class="main-header">
        <h1>🎭 Sonic: Advanced Portrait Animation</h1>
        <p>Transform still images into dynamic videos synchronized with audio (up to 1 minute)</p>
    </div>
    """)

    # NOTE: minimal assumed UI wiring (the original listing is truncated here);
    # the component choices follow the signature of process_sonic() above.
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Portrait Image")
            audio_input = gr.Audio(label="Driving Audio")
            scale_slider = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                                     label="Dynamic Scale")
            generate_btn = gr.Button("Generate Animation")
        with gr.Column():
            video_output = gr.Video(label="Generated Video")

    generate_btn.click(
        process_sonic,
        inputs=[image_input, audio_input, scale_slider],
        outputs=video_output,
    )

demo.launch()
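For reference, the pipeline can also be driven without the Gradio UI. The sketch below is a minimal example under the assumption that the setup at the top of the script has already run (checkpoints downloaded, `pipe = Sonic()` initialized); both input paths are placeholders, not files shipped with the Space.

# Minimal programmatic sketch; assumes the setup above has completed.
# Both input paths are placeholders.
portrait = "examples/portrait.png"  # hypothetical still image containing a face
speech = "examples/speech.wav"      # hypothetical audio clip (<= 60 s)
output = "./res_path/demo.mp4"

result = get_video_res(portrait, speech, output, dynamic_scale=1.0)
if result == -1:
    print("No face detected in the portrait; nothing was generated.")
else:
    print(f"Video written to {result}")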