import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
import io
from sonic import Sonic
from PIL import Image
import torch

# Download the models required on first launch
cmd = (
    'python3 -m pip install "huggingface_hub[cli]" accelerate; '
    'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
    'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
)
os.system(cmd)

pipe = Sonic()

def get_md5(content_bytes: bytes):
    """Compute the MD5 hash of the given bytes and return the 32-character hex digest."""
    return hashlib.md5(content_bytes).hexdigest()
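
# For reference, get_md5 returns the hex digest, e.g. get_md5(b"hello")
# yields '5d41402abc4b2a76b9719d911017c592' (the standard MD5 value for "hello").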

tmp_path = './tmp_path/'
res_path = './res_path/'
os.makedirs(tmp_path, exist_ok=True)
os.makedirs(res_path, exist_ok=True)
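# tmp_path holds the uploaded image/audio files and res_path the generated videos;
# both are keyed by MD5 hashes so repeated requests can be served from disk (see process_sonic).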

# Request a 600-second (10-minute) ZeroGPU slot so long videos can be processed
@spaces.GPU(duration=600)
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
""" | |
Sonic pipeline으로부터 실제 비디오를 생성하는 함수. | |
최대 60초 길이의 오디오에 대해 inference_steps를 결정하여, | |
얼굴 탐지 후 영상 생성 작업을 수행함. | |
""" | |
expand_ratio = 0.0 | |
min_resolution = 512 | |
# 오디오 길이 계산 | |
audio = AudioSegment.from_file(audio_path) | |
duration = len(audio) / 1000.0 # 초 단위 | |
# 오디오 길이에 따라 inference_steps 결정 (최소 25프레임 ~ 최대 750프레임) | |
inference_steps = min(max(int(duration * 12.5), 25), 750) | |
print(f"[INFO] Audio duration: {duration:.2f} seconds, using inference_steps={inference_steps}") | |
# 얼굴 인식 | |
face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio) | |
print(f"[INFO] Face detection info: {face_info}") | |
# 얼굴이 하나라도 검출되면 -> pipeline 진행 | |
if face_info['face_num'] > 0: | |
os.makedirs(os.path.dirname(res_video_path), exist_ok=True) | |
pipe.process( | |
img_path, | |
audio_path, | |
res_video_path, | |
min_resolution=min_resolution, | |
inference_steps=inference_steps, | |
dynamic_scale=dynamic_scale | |
) | |
return res_video_path | |
else: | |
# 얼굴이 전혀 없으면 -1 리턴 | |
return -1 | |
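
# A rough local-usage sketch (the file names below are placeholders, not assets shipped with this Space):
#   result = get_video_res('tmp_path/face.png', 'tmp_path/voice.wav', 'res_path/demo.mp4', dynamic_scale=1.2)
#   result is the output .mp4 path on success, or -1 when no face is detected.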

def process_sonic(image, audio, dynamic_scale):
    """
    Called from the Gradio interface:
    1. Validate the image/audio inputs
    2. Derive file names from MD5 hashes
    3. Check the cache; generate a new video only on a cache miss
    """
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    # (1) Image MD5
    buf_img = io.BytesIO()
    image.save(buf_img, format="PNG")
    img_bytes = buf_img.getvalue()
    img_md5 = get_md5(img_bytes)

    # (2) Audio MD5
    sampling_rate, arr = audio[:2]
    if len(arr.shape) == 1:
        arr = arr[:, None]
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1]
    )
    # Convert to mono / 16 kHz for Whisper compatibility
    audio_segment = audio_segment.set_channels(1).set_frame_rate(16000)

    MAX_DURATION_MS = 60000
    if len(audio_segment) > MAX_DURATION_MS:
        audio_segment = audio_segment[:MAX_DURATION_MS]

    buf_audio = io.BytesIO()
    audio_segment.export(buf_audio, format="wav")
    audio_bytes = buf_audio.getvalue()
    audio_md5 = get_md5(audio_bytes)

    # (3) File paths
    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))

    if not os.path.exists(image_path):
        with open(image_path, "wb") as f:
            f.write(img_bytes)
    if not os.path.exists(audio_path):
        with open(audio_path, "wb") as f:
            f.write(audio_bytes)

    # (4) Reuse a cached result if one already exists
    if os.path.exists(res_video_path):
        print(f"[INFO] Using cached result: {res_video_path}")
        return res_video_path
    else:
        print(f"[INFO] Generating new video with dynamic_scale={dynamic_scale}")
        video_result = get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
        return video_result
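
# Note on the cache key: dynamic_scale is baked into the result filename, so moving the
# intensity slider forces a fresh render even when the same image/audio pair is reused.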

def get_example():
    return []
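
# get_example() is currently empty; to showcase presets it could return rows matching the
# gr.Examples inputs below, e.g. (hypothetical asset paths):
#   return [["examples/portrait.png", "examples/speech.wav", 1.0]]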
css = """ | |
.gradio-container { | |
font-family: 'Arial', sans-serif; | |
} | |
.main-header { | |
text-align: center; | |
color: #2a2a2a; | |
margin-bottom: 2em; | |
} | |
.parameter-section { | |
background-color: #f5f5f5; | |
padding: 1em; | |
border-radius: 8px; | |
margin: 1em 0; | |
} | |
.example-section { | |
margin-top: 2em; | |
} | |
""" | |

with gr.Blocks(css=css) as demo:
    gr.HTML("""
    <div class="main-header">
        <h1>🎭 Sonic: Advanced Portrait Animation</h1>
        <p>Transform still images into dynamic videos synchronized with audio (up to 1 minute)</p>
    </div>
    """)

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                type='pil',
                label="Portrait Image",
                elem_id="image_input"
            )
            audio_input = gr.Audio(
                label="Voice/Audio Input (up to 1 minute)",
                elem_id="audio_input",
                type="numpy"
            )

        with gr.Column():
            dynamic_scale = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Animation Intensity",
                info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
            )
            process_btn = gr.Button(
                "Generate Animation",
                variant="primary",
                elem_id="process_btn"
            )

        with gr.Column():
            video_output = gr.Video(
                label="Generated Animation",
                elem_id="video_output"
            )

    process_btn.click(
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
    )

    gr.Examples(
        examples=get_example(),
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
        cache_examples=False
    )

    gr.HTML("""
    <div style="text-align: center; margin-top: 2em;">
        <div style="margin-bottom: 1em;">
            <a href="https://github.com/jixiaozhong/Sonic" target="_blank" style="text-decoration: none;">
                <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
            </a>
            <a href="https://arxiv.org/pdf/2411.16331" target="_blank" style="text-decoration: none;">
                <img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
            </a>
        </div>
        <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio (now supports up to 1 minute!)</p>
    </div>
    """)

demo.launch(share=True)