import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch

# Download model checkpoints
cmd = 'python3 -m pip install "huggingface_hub[cli]"; \
    huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; \
    huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; \
    huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
os.system(cmd)

# Initialize the model
pipe = Sonic()


def get_md5(content):
    md5hash = hashlib.md5(content)
    md5 = md5hash.hexdigest()
    return md5


@spaces.GPU(duration=300)  # Increased duration to handle longer videos
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    expand_ratio = 0.5
    min_resolution = 512
    inference_steps = 25

    # Get audio duration
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # Convert to seconds

    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")
    print(f"Audio duration: {duration} seconds")

    if face_info['face_num'] > 0:
        crop_image_path = img_path + '.crop.png'
        pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
        img_path = crop_image_path
        os.makedirs(os.path.dirname(res_video_path), exist_ok=True)

        # Process with full audio duration
        pipe.process(
            img_path,
            audio_path,
            res_video_path,
            min_resolution=min_resolution,
            inference_steps=inference_steps,
            dynamic_scale=dynamic_scale,
            duration=duration  # Pass the actual duration
        )
        return res_video_path
    else:
        return -1


tmp_path = './tmp_path/'
res_path = './res_path/'
os.makedirs(tmp_path, exist_ok=True)
os.makedirs(res_path, exist_ok=True)


def process_sonic(image, audio, dynamic_scale):
    # Input validation
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    img_md5 = get_md5(np.array(image))
    audio_md5 = get_md5(audio[1])
    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")

    sampling_rate, arr = audio[:2]
    if len(arr.shape) == 1:
        arr = arr[:, None]

    # Create audio segment from the raw samples
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1]
    )
    audio_segment = audio_segment.set_frame_rate(sampling_rate)

    # Generate deterministic paths so repeated inputs reuse cached files
    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))

    # Save inputs if they don't exist
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    # Process or return cached result
    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path
    else:
        print(f"Generating new video with dynamic scale: {dynamic_scale}")
        result = get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
        if result == -1:
            raise gr.Error("No face detected in the uploaded image")
        return result


# Dummy get_example function to prevent errors if examples are not defined
def get_example():
    # Return an empty list if there are no examples, or fill in real example data here.
    return []


# Enhanced UI
css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.main-header {
    text-align: center;
    color: #2a2a2a;
    margin-bottom: 2em;
}
.parameter-section {
    background-color: #f5f5f5;
    padding: 1em;
    border-radius: 8px;
    margin: 1em 0;
}
.example-section {
    margin-top: 2em;
}
"""

with gr.Blocks(css=css) as demo:
    gr.HTML("""

    <div class="main-header">
        <h1>🎭 Sonic: Advanced Portrait Animation</h1>
        <p>Transform still images into dynamic videos synchronized with audio</p>
    </div>
    """)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                type='pil',
                label="Portrait Image",
                elem_id="image_input"
            )
            audio_input = gr.Audio(
                label="Voice/Audio Input",
                elem_id="audio_input",
                type="numpy"
            )

        with gr.Column():
            dynamic_scale = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Animation Intensity",
                info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
            )
            process_btn = gr.Button(
                "Generate Animation",
                variant="primary",
                elem_id="process_btn"
            )

        with gr.Column():
            video_output = gr.Video(
                label="Generated Animation",
                elem_id="video_output"
            )

    # Process button click
    process_btn.click(
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
        api_name="animate"
    )

    # Examples section (elem_classes argument removed)
    gr.Examples(
        examples=get_example(),
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
        cache_examples=False
    )

    # Footer with attribution and links
    gr.HTML("""
    <div>
        <p>GitHub Repo | arXiv Paper</p>
        <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio</p>
    </div>
    """)

demo.launch()