openfree's picture
Update app.py
406d112 verified
raw
history blame
6.84 kB
import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch
# Initialize the model.
# A single shell command line installs the Hugging Face CLI and downloads the
# three checkpoint repositories into ./checkpoints before the pipeline object
# is constructed.
cmd = (
    'python3 -m pip install "huggingface_hub[cli]"; '
    'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
    'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
)
# The commands are chained with ';', so the reported status is that of the
# last command only. Report a failure instead of silently discarding it, but
# keep going (best effort) so start-up behaves as before.
_download_status = os.system(cmd)
if _download_status != 0:
    print(f"Warning: checkpoint download command exited with status {_download_status}")
pipe = Sonic()
def get_md5(content):
    """Return the hexadecimal MD5 digest of *content*.

    *content* is handed directly to ``hashlib.md5``, so it must be a
    bytes-like object.
    """
    return hashlib.md5(content).hexdigest()
@spaces.GPU(duration=300)  # Increased duration to handle longer videos
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    """Run the Sonic pipeline on one image/audio pair and write the video.

    Args:
        img_path: Path of the input image.
        audio_path: Path of the input audio file.
        res_video_path: Path the generated video is written to.
        dynamic_scale: Forwarded to ``pipe.process`` as ``dynamic_scale``.

    Returns:
        ``res_video_path`` when ``pipe.preprocess`` reports at least one face,
        otherwise ``-1``.
    """
    expand_ratio = 0.5
    min_resolution = 512
    inference_steps = 25

    # Get audio duration (for logging)
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # Convert ms to seconds

    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")
    print(f"Audio duration: {duration} seconds")

    # No face found: signal the caller with the -1 sentinel.
    if not face_info['face_num'] > 0:
        return -1

    # Crop using the bounding box from preprocessing and continue with the
    # cropped image, stored next to the original as "<img_path>.crop.png".
    crop_image_path = img_path + '.crop.png'
    pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
    img_path = crop_image_path

    # os.path.dirname() returns '' for a bare file name, and os.makedirs('')
    # raises FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(res_video_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Process the video (duration parameter removed)
    pipe.process(
        img_path,
        audio_path,
        res_video_path,
        min_resolution=min_resolution,
        inference_steps=inference_steps,
        dynamic_scale=dynamic_scale,
    )
    # Fix: return the path of the generated video file.
    return res_video_path
# Working directories, created up front if they do not exist yet.
tmp_path = './tmp_path/'
res_path = './res_path/'
for _work_dir in (tmp_path, res_path):
    os.makedirs(_work_dir, exist_ok=True)
def process_sonic(image, audio, dynamic_scale):
    """Gradio handler: turn an uploaded image and audio clip into a video.

    Inputs are stored under ``tmp_path`` and results under ``res_path``, both
    named after the MD5 of the input data, so an identical request returns
    the already generated video instead of running the pipeline again.

    Args:
        image: PIL image from the image component, or ``None``.
        audio: ``(sampling_rate, samples)`` tuple from the audio component
            (``samples`` is a numpy array), or ``None``.
        dynamic_scale: Animation intensity forwarded to ``get_video_res``.

    Returns:
        Absolute path of the resulting video file.

    Raises:
        gr.Error: If the image or audio is missing, or if ``get_video_res``
            reports that no face was found in the image.
    """
    # Input validation
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    sampling_rate, arr = audio[:2]

    img_md5 = get_md5(np.array(image))
    audio_md5 = get_md5(arr)
    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")

    # Mono input arrives as a 1-D array; give it an explicit channel axis.
    if arr.ndim == 1:
        arr = arr[:, None]

    # Create an audio segment from numpy array. The segment is built with the
    # source sampling rate, so no further frame-rate conversion is needed.
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1],
    )

    # Generate file paths
    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))

    # Save input files if they don't exist
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    # If cached video exists, return it; otherwise, generate a new one
    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path

    print(f"Generating new video with dynamic scale: {dynamic_scale}")
    result = get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
    # get_video_res returns -1 when no face was found; that sentinel is not a
    # valid value for the video output, so report it to the user instead.
    if isinstance(result, int) and result == -1:
        raise gr.Error("No face detected in the image. Please upload a clear portrait image")
    return result
# Placeholder example provider so the examples section has something to call
def get_example():
    """Return the list of example rows; there are currently none."""
    no_examples = []
    return no_examples
# Custom stylesheet handed to gr.Blocks(css=...) when the interface is built.
css = """
.gradio-container {
font-family: 'Arial', sans-serif;
}
.main-header {
text-align: center;
color: #2a2a2a;
margin-bottom: 2em;
}
.parameter-section {
background-color: #f5f5f5;
padding: 1em;
border-radius: 8px;
margin: 1em 0;
}
.example-section {
margin-top: 2em;
}
"""
# Build the Gradio interface: page header, input / parameter / output
# components, the click handler, an examples section, and a footer.
with gr.Blocks(css=css) as demo:
# Page header
gr.HTML("""
<div class="main-header">
<h1>🎭 Sonic: Advanced Portrait Animation</h1>
<p>Transform still images into dynamic videos synchronized with audio</p>
</div>
""")
with gr.Row():
with gr.Column():
# type='pil': the handler receives the image as a PIL image
image_input = gr.Image(
type='pil',
label="Portrait Image",
elem_id="image_input"
)
# type="numpy": the handler unpacks this value as (sampling_rate, array)
audio_input = gr.Audio(
label="Voice/Audio Input",
elem_id="audio_input",
type="numpy"
)
with gr.Column():
# Slider value is passed to process_sonic as dynamic_scale
dynamic_scale = gr.Slider(
minimum=0.5,
maximum=2.0,
value=1.0,
step=0.1,
label="Animation Intensity",
info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
)
process_btn = gr.Button(
"Generate Animation",
variant="primary",
elem_id="process_btn"
)
with gr.Column():
video_output = gr.Video(
label="Generated Animation",
elem_id="video_output"
)
# Process button click: process_sonic() is called with the three inputs and its
# return value is sent to video_output.
process_btn.click(
fn=process_sonic,
inputs=[image_input, audio_input, dynamic_scale],
outputs=video_output,
api_name="animate"
)
# Examples section (get_example() currently returns an empty list)
gr.Examples(
examples=get_example(),
fn=process_sonic,
inputs=[image_input, audio_input, dynamic_scale],
outputs=video_output,
cache_examples=False
)
# Footer with attribution and links
gr.HTML("""
<div style="text-align: center; margin-top: 2em;">
<div style="margin-bottom: 1em;">
<a href="https://github.com/jixiaozhong/Sonic" target="_blank" style="text-decoration: none;">
<img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
</a>
<a href="https://arxiv.org/pdf/2411.16331" target="_blank" style="text-decoration: none;">
<img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
</a>
</div>
<p>🔔 Note: For optimal results, use clear portrait images and high-quality audio</p>
</div>
""")
# share=True asks Gradio to create a public link when the app is launched.
demo.launch(share=True)