import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch

# Initialize the model: download the required checkpoints from the Hugging Face Hub
cmd = (
    'python3 -m pip install "huggingface_hub[cli]"; '
    'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
    'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
)
os.system(cmd)

pipe = Sonic()


def get_md5(content):
    md5hash = hashlib.md5(content)
    return md5hash.hexdigest()


@spaces.GPU(duration=300)  # Increased duration to handle longer videos
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    expand_ratio = 0.5
    min_resolution = 512
    inference_steps = 25

    # Get the audio duration (for logging only)
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # milliseconds -> seconds

    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")
    print(f"Audio duration: {duration} seconds")

    if face_info['face_num'] > 0:
        crop_image_path = img_path + '.crop.png'
        pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
        img_path = crop_image_path
        os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
        # NOTE: Sonic.process() no longer accepts a duration argument, so it is omitted here.
        pipe.process(
            img_path,
            audio_path,
            res_video_path,
            min_resolution=min_resolution,
            inference_steps=inference_steps,
            dynamic_scale=dynamic_scale
        )
        return res_video_path
    else:
        return -1


tmp_path = './tmp_path/'
res_path = './res_path/'
os.makedirs(tmp_path, exist_ok=True)
os.makedirs(res_path, exist_ok=True)


def process_sonic(image, audio, dynamic_scale):
    # Validate the inputs
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    img_md5 = get_md5(np.array(image))
    audio_md5 = get_md5(audio[1])
    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")

    sampling_rate, arr = audio[:2]
    if len(arr.shape) == 1:
        arr = arr[:, None]

    # Build an audio segment from the raw numpy samples
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1]
    )
    audio_segment = audio_segment.set_frame_rate(sampling_rate)

    # Build the file paths
    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))

    # Save the inputs if they are not already on disk
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    # Return the cached result if one exists; otherwise generate a new video
    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path
    else:
        print(f"Generating new video with dynamic scale: {dynamic_scale}")
        return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)


# Dummy function for example data (fill in real examples as needed)
def get_example():
    return []


css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.main-header {
    text-align: center;
    color: #2a2a2a;
    margin-bottom: 2em;
}
.parameter-section {
    background-color: #f5f5f5;
    padding: 1em;
    border-radius: 8px;
    margin: 1em 0;
}
.example-section {
    margin-top: 2em;
}
"""

with gr.Blocks(css=css) as demo:
    gr.HTML("""

        <div class="main-header">
            <h1>🎭 Sonic: Advanced Portrait Animation</h1>
            <p>Transform still images into dynamic videos synchronized with audio</p>
        </div>

""") with gr.Row(): with gr.Column(): image_input = gr.Image( type='pil', label="Portrait Image", elem_id="image_input" ) audio_input = gr.Audio( label="Voice/Audio Input", elem_id="audio_input", type="numpy" ) with gr.Column(): dynamic_scale = gr.Slider( minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Animation Intensity", info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)" ) process_btn = gr.Button( "Generate Animation", variant="primary", elem_id="process_btn" ) with gr.Column(): video_output = gr.Video( label="Generated Animation", elem_id="video_output" ) # 버튼 클릭 시 애니메이션 생성 함수 호출 process_btn.click( fn=process_sonic, inputs=[image_input, audio_input, dynamic_scale], outputs=video_output, api_name="animate" ) # 예시 섹션 gr.Examples( examples=get_example(), fn=process_sonic, inputs=[image_input, audio_input, dynamic_scale], outputs=video_output, cache_examples=False ) # Footer: Attribution & Links gr.HTML("""
        <div style="text-align: center; margin-top: 2em;">
            <a href="https://github.com/jixiaozhong/Sonic" target="_blank" style="margin: 0 10px;">GitHub Repo</a>
            <a href="https://arxiv.org/abs/2411.16331" target="_blank" style="margin: 0 10px;">arXiv Paper</a>
            <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio</p>
        </div>

""") # 공유 링크 생성: share=True demo.launch(share=True)