🎠 Sonic: Advanced Portrait Animation
Transform still images into dynamic videos synchronized with audio
```python
import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch

# Initialize the model
cmd = 'python3 -m pip install "huggingface_hub[cli]"; \
huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; \
huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; \
huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
os.system(cmd)

pipe = Sonic()

def get_md5(content):
    md5hash = hashlib.md5(content)
    md5 = md5hash.hexdigest()
    return md5

@spaces.GPU(duration=300)  # Increased duration to handle longer videos
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    expand_ratio = 0.5
    min_resolution = 512
    inference_steps = 25

    # Get audio duration
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # Convert to seconds

    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")
    print(f"Audio duration: {duration} seconds")

    if face_info['face_num'] > 0:
        crop_image_path = img_path + '.crop.png'
        pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
        img_path = crop_image_path
        os.makedirs(os.path.dirname(res_video_path), exist_ok=True)

        # Process with full audio duration
        pipe.process(
            img_path,
            audio_path,
            res_video_path,
            min_resolution=min_resolution,
            inference_steps=inference_steps,
            dynamic_scale=dynamic_scale,
            duration=duration  # Pass the actual duration
        )
        return res_video_path  # Return the generated video path to the caller
    else:
        return -1

tmp_path = './tmp_path/'
res_path = './res_path/'
os.makedirs(tmp_path, exist_ok=True)
os.makedirs(res_path, exist_ok=True)

def process_sonic(image, audio, dynamic_scale):
    # Input validation
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    img_md5 = get_md5(np.array(image))
    audio_md5 = get_md5(audio[1])
    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")

    sampling_rate, arr = audio[:2]
    if len(arr.shape) == 1:
        arr = arr[:, None]

    # Create audio segment
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1]
    )
    audio_segment = audio_segment.set_frame_rate(sampling_rate)

    # Generate paths
    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))

    # Save inputs if they don't exist
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    # Process or return cached result
    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path
    else:
        print(f"Generating new video with dynamic scale: {dynamic_scale}")
        return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)

# Enhanced UI
css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.main-header {
    text-align: center;
    color: #2a2a2a;
    margin-bottom: 2em;
}
.parameter-section {
    background-color: #f5f5f5;
    padding: 1em;
    border-radius: 8px;
    margin: 1em 0;
}
.example-section {
    margin-top: 2em;
}
"""

with gr.Blocks(css=css) as demo:
    gr.HTML("""
```
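The listing breaks off inside the `gr.Blocks` UI definition. For orientation, here is a minimal sketch of how the remaining wiring typically looks with standard Gradio components; the component names, labels, and layout are illustrative assumptions, not the Space's actual UI:

```python
# Sketch only (assumption): a typical completion of the Blocks UI.
# Component names, labels, and the slider range are illustrative, not the Space's actual layout.
with gr.Blocks(css=css) as demo:
    gr.HTML('<div class="main-header"><h1>🎠 Sonic: Advanced Portrait Animation</h1></div>')

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Portrait Image")
            # gr.Audio defaults to numpy output, i.e. a (sampling_rate, array) tuple,
            # which is exactly what process_sonic expects.
            audio_input = gr.Audio(label="Driving Audio")
            scale_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Dynamic Scale")
            generate_btn = gr.Button("Generate Animation")
        with gr.Column():
            video_output = gr.Video(label="Generated Video")

    generate_btn.click(
        fn=process_sonic,
        inputs=[image_input, audio_input, scale_slider],
        outputs=video_output,
    )

demo.launch()
```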
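The pipeline can also be exercised without the web UI by calling `process_sonic` directly. A minimal sketch, assuming SciPy is available and using placeholder file names for the portrait image and a 16-bit WAV file:

```python
# Sketch: calling process_sonic directly, bypassing the Gradio UI.
# "portrait.png" and "speech.wav" are placeholder file names (assumptions), not bundled assets.
from PIL import Image
from scipy.io import wavfile  # assumption: SciPy is installed

image = Image.open("portrait.png")
sampling_rate, audio_array = wavfile.read("speech.wav")  # 16-bit PCM -> int16 array

# Mirror Gradio's Audio format: a (sampling_rate, numpy_array) tuple.
video_path = process_sonic(image, (sampling_rate, audio_array), dynamic_scale=1.0)
print("Rendered video:", video_path)
```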