import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch
# Download the model checkpoints
cmd = (
    'python3 -m pip install "huggingface_hub[cli]"; '
    'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
    'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
)
os.system(cmd)
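# A Python-native equivalent (assuming huggingface_hub is installed) would
# avoid the subshell, e.g.:
#   from huggingface_hub import snapshot_download
#   snapshot_download(repo_id="LeonJoe13/Sonic", local_dir="checkpoints")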

# Initialize the Sonic pipeline
pipe = Sonic()
def get_md5(content):
    # Hash raw bytes so numpy arrays (image pixels, audio samples) can be
    # used as stable cache keys
    md5hash = hashlib.md5(np.ascontiguousarray(content).tobytes())
    return md5hash.hexdigest()
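
# Sketch (hypothetical input): identical content always yields the same
# digest, which is what makes the file-based caching below work:
#   get_md5(np.zeros((2, 2), dtype=np.uint8))  # deterministic 32-char hex string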
# Generate a talking-head video from a portrait image and an audio track
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    expand_ratio = 0.5
    min_resolution = 512
    inference_steps = 25

    # Get the audio duration (for logging only)
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # convert milliseconds to seconds

    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")
    print(f"Audio duration: {duration} seconds")
    if face_info['face_num'] > 0:
        # Crop to the detected face region before animating
        crop_image_path = img_path + '.crop.png'
        pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
        img_path = crop_image_path

        os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
        # NOTE: Sonic.process() no longer accepts a duration argument, so it is omitted here
        pipe.process(
            img_path,
            audio_path,
            res_video_path,
            min_resolution=min_resolution,
            inference_steps=inference_steps,
            dynamic_scale=dynamic_scale
        )
        return res_video_path
    else:
        return -1
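
# Usage sketch (hypothetical local files):
#   out = get_video_res('face.png', 'speech.wav', './res_path/out.mp4', dynamic_scale=1.2)
#   # returns the rendered video path on success, or -1 if no face was detected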
tmp_path = './tmp_path/'
res_path = './res_path/'
os.makedirs(tmp_path, exist_ok=True)
os.makedirs(res_path, exist_ok=True)
# On ZeroGPU Spaces, GPU-bound work must run inside a @spaces.GPU-decorated
# function; a longer window is requested here for video generation
@spaces.GPU(duration=120)
def process_sonic(image, audio, dynamic_scale):
    # Validate inputs
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    img_md5 = get_md5(np.array(image))
    audio_md5 = get_md5(audio[1])
    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")
    sampling_rate, arr = audio[:2]
    if len(arr.shape) == 1:
        arr = arr[:, None]

    # Build a pydub AudioSegment from the raw numpy samples
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1]
    )
    audio_segment = audio_segment.set_frame_rate(sampling_rate)  # no-op at the same rate; kept defensively
    # Derive content-addressed file paths
    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))

    # Save the inputs if they are not already on disk
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    # Return the cached result if it exists; otherwise generate a new video
    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path
    else:
        print(f"Generating new video with dynamic scale: {dynamic_scale}")
        return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
# Dummy function for example data (replace with real example data as needed)
def get_example():
    return []
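
# Each example row must match the inputs wired to gr.Examples below, i.e.
# [image, audio, dynamic_scale]; a populated version (hypothetical paths)
# might be: return [['examples/portrait.png', 'examples/speech.wav', 1.0]]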
css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.main-header {
    text-align: center;
    color: #2a2a2a;
    margin-bottom: 2em;
}
.parameter-section {
    background-color: #f5f5f5;
    padding: 1em;
    border-radius: 8px;
    margin: 1em 0;
}
.example-section {
    margin-top: 2em;
}
"""
with gr.Blocks(css=css) as demo:
    gr.HTML("""
    <div class="main-header">
        <h1>🎭 Sonic: Advanced Portrait Animation</h1>
        <p>Transform still images into dynamic videos synchronized with audio</p>
    </div>
    """)
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                type='pil',
                label="Portrait Image",
                elem_id="image_input"
            )
            audio_input = gr.Audio(
                label="Voice/Audio Input",
                elem_id="audio_input",
                type="numpy"
            )

        with gr.Column():
            dynamic_scale = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Animation Intensity",
                info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
            )
            process_btn = gr.Button(
                "Generate Animation",
                variant="primary",
                elem_id="process_btn"
            )

        with gr.Column():
            video_output = gr.Video(
                label="Generated Animation",
                elem_id="video_output"
            )
    # Generate the animation when the button is clicked
    process_btn.click(
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
        api_name="animate"
    )
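
    # Because api_name="animate" is set, the endpoint is also callable
    # programmatically, e.g. with gradio_client (hypothetical Space id):
    #   from gradio_client import Client, handle_file
    #   client = Client("user/sonic-demo")
    #   client.predict(handle_file("face.png"), handle_file("speech.wav"), 1.0, api_name="/animate")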
    # Examples section
    gr.Examples(
        examples=get_example(),
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
        cache_examples=False
    )
    # Footer: attribution and links
    gr.HTML("""
    <div style="text-align: center; margin-top: 2em;">
        <div style="margin-bottom: 1em;">
            <a href="https://github.com/jixiaozhong/Sonic" target="_blank" style="text-decoration: none;">
                <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
            </a>
            <a href="https://arxiv.org/pdf/2411.16331" target="_blank" style="text-decoration: none;">
                <img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
            </a>
        </div>
        <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio</p>
    </div>
    """)
# share=True creates a public share link
demo.launch(share=True)
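
# For long generations, Gradio's request queue avoids HTTP timeouts; an
# optional alternative launch would be:
#   demo.queue().launch(share=True)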