import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch  # used where needed
# ------------------------------------------------------------------
# Model initialization
# ------------------------------------------------------------------
# Download the Sonic checkpoints, SVD backbone, and Whisper-tiny at startup
cmd = (
    'python3 -m pip install "huggingface_hub[cli]"; '
    'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
    'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
)
os.system(cmd)

pipe = Sonic()
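# Equivalent programmatic download (a sketch, shown for reference only:
# huggingface_hub's snapshot_download does what the CLI calls above do):
#   from huggingface_hub import snapshot_download
#   snapshot_download("LeonJoe13/Sonic", local_dir="checkpoints")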
# ------------------------------------------------------------------
# Utilities
# ------------------------------------------------------------------
def get_md5(content):
    """Return the md5 hex digest of a bytes-like object (e.g. a numpy array)."""
    md5hash = hashlib.md5(content)
    return md5hash.hexdigest()
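# Quick sanity example for get_md5 (a well-known md5 test vector):
#   get_md5(b"hello") == "5d41402abc4b2a76b9719d911017c592"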
# ------------------------------------------------------------------
# Video generation
# ------------------------------------------------------------------
@spaces.GPU(duration=300)  # keep the ZeroGPU session alive for up to 5 minutes
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    expand_ratio = 0.0  # ★ avoid cropping the face
    min_resolution = 512

    # Audio length → frame count (fps=25, capped at 60 s = 1500 frames)
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # seconds
    fps = 25
    max_steps = fps * 60  # 1500
    inference_steps = max(1, min(int(duration * fps), max_steps))
    print(f"Audio duration: {duration:.2f}s → inference_steps: {inference_steps}")
    # Face info is logged for reference only
    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")
    if face_info["face_num"] == 0:
        print("Warning: face not detected – proceeding with full image.")

    # Ensure the output directory exists
    os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
    # Generate the video
    pipe.process(
        img_path,
        audio_path,
        res_video_path,
        min_resolution=min_resolution,
        inference_steps=inference_steps,
        dynamic_scale=dynamic_scale,
    )
    return res_video_path
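# Local smoke test (a sketch; the example file paths are hypothetical):
#   get_video_res("examples/face.png", "examples/voice.wav",
#                 "./res_path/demo.mp4", dynamic_scale=1.0)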
# ------------------------------------------------------------------
# Cache / path setup
# ------------------------------------------------------------------
tmp_path = "./tmp_path/"
res_path = "./res_path/"
os.makedirs(tmp_path, exist_ok=True)
os.makedirs(res_path, exist_ok=True)
# ------------------------------------------------------------------
# Gradio callback
# ------------------------------------------------------------------
def process_sonic(image, audio, dynamic_scale):
    # Input validation
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    img_md5 = get_md5(np.array(image))
    audio_md5 = get_md5(audio[1])
    print(f"Processing (img={img_md5}, audio={audio_md5})")
    # numpy audio → AudioSegment
    # (gr.Audio(type="numpy") yields a (sample_rate, int16 array) tuple)
    sampling_rate, arr = audio
    if arr.ndim == 1:
        arr = arr[:, None]
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1],
    )
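    # Round-trip sanity sketch for the raw-PCM construction above
    # (assumes int16 input, as gr.Audio(type="numpy") normally provides):
    #   sr, a = 16000, np.zeros((16000, 1), dtype=np.int16)  # 1 s of silence
    #   seg = AudioSegment(a.tobytes(), frame_rate=sr,
    #                      sample_width=a.dtype.itemsize, channels=a.shape[1])
    #   assert len(seg) == 1000  # pydub reports length in milliseconds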
    # Paths
    image_path = os.path.abspath(os.path.join(tmp_path, f"{img_md5}.png"))
    audio_path = os.path.abspath(os.path.join(tmp_path, f"{audio_md5}.wav"))
    res_video_path = os.path.abspath(
        os.path.join(res_path, f"{img_md5}_{audio_md5}_{dynamic_scale}.mp4")
    )

    # Save inputs / reuse cached result
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path

    print(f"Generating new video (dynamic_scale={dynamic_scale})")
    return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------
def get_example():
    """Example data (add entries as needed)."""
    return []
css = """
.gradio-container { font-family: 'Arial', sans-serif; }
.main-header { text-align: center; color: #2a2a2a; margin-bottom: 2em; }
.parameter-section { background-color: #f5f5f5; padding: 1em; border-radius: 8px; margin: 1em 0; }
.example-section { margin-top: 2em; }
"""
with gr.Blocks(css=css, theme="apriel") as demo:
    gr.HTML(
        """
        <div class="main-header">
            <h1>🎭 Longer Sonic: Advanced Portrait Animation</h1>
            <p>Transform still images into dynamic videos synchronized with audio (demo max 60 s)</p>
        </div>
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="Portrait Image", elem_id="image_input")
            audio_input = gr.Audio(label="Voice/Audio Input", elem_id="audio_input", type="numpy")
            dynamic_scale = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Animation Intensity",
                info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)",
            )
            process_btn = gr.Button("Generate Animation", variant="primary", elem_id="process_btn")
        with gr.Column():
            video_output = gr.Video(label="Generated Animation", elem_id="video_output")

    process_btn.click(
        fn=process_sonic,
        inputs=[image_input, audio_input, dynamic_scale],
        outputs=video_output,
        api_name="animate",
    )
    examples = get_example()
    if examples:  # skip the block when empty; gr.Examples can error on an empty list
        gr.Examples(
            examples=examples,
            fn=process_sonic,
            inputs=[image_input, audio_input, dynamic_scale],
            outputs=video_output,
            cache_examples=False,
        )
# ------------------------------------------------------------------
# Launch
# ------------------------------------------------------------------
demo.launch(share=True)
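# Example: calling the "animate" endpoint remotely with gradio_client
# (a minimal sketch; the Space id and file paths are hypothetical):
#   from gradio_client import Client, handle_file
#   client = Client("user/longer-sonic")  # hypothetical Space id
#   result = client.predict(
#       handle_file("portrait.png"),  # image_input
#       handle_file("speech.wav"),    # audio_input
#       1.0,                          # dynamic_scale
#       api_name="/animate",
#   )
#   print(result)  # path to the generated .mp4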