🎭 Sonic: Advanced Portrait Animation
Transform still images into dynamic videos synchronized with audio
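
The demo is a single Gradio app (full source below): at startup it downloads the Sonic, Stable Video Diffusion img2vid-xt, and Whisper-tiny checkpoints, and for each request it detects and crops the face in the uploaded portrait, then renders a talking-head video driven by the uploaded audio. For scripted use, here is a minimal client sketch; the Space ID "LeonJoe13/Sonic" and the "/process_sonic" endpoint name are assumptions inferred from the checkpoint repo and the handler name, not confirmed by this page.

# Minimal sketch of driving the deployed Space with gradio_client.
# Assumed (not confirmed by this source): the Space ID and the api_name
# derived from the process_sonic handler.
from gradio_client import Client, handle_file

client = Client("LeonJoe13/Sonic")        # hypothetical Space ID
result = client.predict(
    handle_file("portrait.png"),          # still image containing a visible face
    handle_file("speech.wav"),            # driving audio track
    1.0,                                  # dynamic_scale (motion intensity)
    api_name="/process_sonic",            # assumed endpoint name
)
print(result)                             # local path to the generated .mp4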
import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch

# Download the required checkpoints (Sonic, SVD-XT, Whisper-tiny), then initialize the pipeline
cmd = 'python3 -m pip install "huggingface_hub[cli]"; \
huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; \
huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; \
huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
os.system(cmd)

pipe = Sonic()

def get_md5(content):
    md5hash = hashlib.md5(content)
    md5 = md5hash.hexdigest()
    return md5

@spaces.GPU(duration=300)  # Increased duration to handle longer videos
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    expand_ratio = 0.5
    min_resolution = 512
    inference_steps = 25

    # Get audio duration
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # Convert to seconds

    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")
    print(f"Audio duration: {duration} seconds")

    if face_info['face_num'] > 0:
        crop_image_path = img_path + '.crop.png'
        pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
        img_path = crop_image_path
        os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
        # Process with full audio duration
        pipe.process(
            img_path,
            audio_path,
            res_video_path,
            min_resolution=min_resolution,
            inference_steps=inference_steps,
            dynamic_scale=dynamic_scale,
            duration=duration  # Pass the actual duration
        )
    else:
        return -1  # No face detected in the input image
    return res_video_path

tmp_path = './tmp_path/'
res_path = './res_path/'
os.makedirs(tmp_path, exist_ok=True)
os.makedirs(res_path, exist_ok=True)

def process_sonic(image, audio, dynamic_scale):
    # Input validation
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    img_md5 = get_md5(np.array(image))
    audio_md5 = get_md5(audio[1])
    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")

    sampling_rate, arr = audio[:2]
    if len(arr.shape) == 1:
        arr = arr[:, None]

    # Create audio segment
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1]
    )
    audio_segment = audio_segment.set_frame_rate(sampling_rate)

    # Generate paths
    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))

    # Save inputs if they don't exist
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    # Process or return cached result
    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path
    else:
        print(f"Generating new video with dynamic scale: {dynamic_scale}")
        return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)

# Dummy get_example function to prevent errors if examples are not defined
def get_example():
    # If there are no examples, return an empty list; otherwise fill in real example data here.
    return []

# Enhanced UI
css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.main-header {
    text-align: center;
    color: #2a2a2a;
    margin-bottom: 2em;
}
.parameter-section {
    background-color: #f5f5f5;
    padding: 1em;
    border-radius: 8px;
    margin: 1em 0;
}
.example-section {
    margin-top: 2em;
}
"""

with gr.Blocks(css=css) as demo:
    gr.HTML("""
    <div class="main-header">
        <h1>🎭 Sonic: Advanced Portrait Animation</h1>
        <p>Transform still images into dynamic videos synchronized with audio</p>