Spaces:

VIDraft
/

Portrait-Animation

Running on Zero

App Files Files Community

openfree committed on Feb 10

Commit

137ab16

verified ·

1 Parent(s): 612b064

Update app.py

Browse files

Files changed (1) hide show

app.py +148 -69

app.py CHANGED Viewed

@@ -5,7 +5,10 @@ import numpy as np
 from pydub import AudioSegment
 import hashlib
 from sonic import Sonic
 cmd = 'python3 -m pip install "huggingface_hub[cli]"; \
 huggingface-cli download LeonJoe13/Sonic --local-dir  checkpoints; \
 huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir  checkpoints/stable-video-diffusion-img2vid-xt; \
@@ -19,108 +22,184 @@ def get_md5(content):
     md5 = md5hash.hexdigest()
     return md5
-@spaces.GPU(duration=120)
 def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
     expand_ratio = 0.5
     min_resolution = 512
     inference_steps = 25
     face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
-    print(face_info)
     if face_info['face_num'] > 0:
         crop_image_path = img_path + '.crop.png'
         pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
         img_path = crop_image_path
         os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
-        pipe.process(img_path, audio_path, res_video_path, min_resolution=min_resolution, inference_steps=inference_steps, dynamic_scale=dynamic_scale)
     else:
         return -1
 tmp_path = './tmp_path/'
 res_path = './res_path/'
-os.makedirs(tmp_path,exist_ok=1)
-os.makedirs(res_path,exist_ok=1)
-def process_sonic(image,audio,s0):
-    img_md5= get_md5(np.array(image))
     audio_md5 = get_md5(audio[1])
-    print(img_md5,audio_md5)
     sampling_rate, arr = audio[:2]
-    if len(arr.shape)==1:
-        arr = arr[:,None]
-    audio = AudioSegment(
         arr.tobytes(),
         frame_rate=sampling_rate,
         sample_width=arr.dtype.itemsize,
         channels=arr.shape[1]
     )
-    audio = audio.set_frame_rate(sampling_rate)
-    image_path = os.path.abspath(tmp_path+'{0}.png'.format(img_md5))
-    audio_path = os.path.abspath(tmp_path+'{0}.wav'.format(audio_md5))
     if not os.path.exists(image_path):
         image.save(image_path)
     if not os.path.exists(audio_path):
-        audio.export(audio_path, format="wav")
-    res_video_path = os.path.abspath(res_path+f'{img_md5}_{audio_md5}_{s0}.mp4')
     if os.path.exists(res_video_path):
         return res_video_path
     else:
-        get_video_res(image_path, audio_path, res_video_path,s0)
-    return res_video_path
-inputs = [
-    gr.Image(type='pil',label="Upload Image"),
-    gr.Audio(label="Upload Audio"),
-    gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Dynamic scale", info="Increase/decrease to obtain more/less movements"),
-]
-outputs = gr.Video(label="output.mp4")
-html_description = """
-<div style="display: flex; justify-content: center; align-items: center;">
-  <a href="https://github.com/jixiaozhong/Sonic.git" style="margin: 0 2px;">
-    <img src='https://img.shields.io/badge/GitHub-Repo-blue?style=flat&logo=GitHub' alt='GitHub'>
-  </a>
-  <a href="https://arxiv.org/pdf/2411.16331" style="margin: 0 2px;">
-    <img src='https://img.shields.io/badge/arXiv-2411.16331-red?style=flat&logo=arXiv&logoColor=red' alt='arxiv'>
-  </a>
-  <a href='https://jixiaozhong.github.io/Sonic/' style="margin: 0 2px;">
-    <img src='https://img.shields.io/badge/Webpage-Project-silver?style=flat&logo=&logoColor=orange' alt='webpage'>
-  </a>
-  <a href="https://github.com/jixiaozhong/Sonic/blob/main/LICENSE" style="margin: 0 2px;">
-    <img src='https://img.shields.io/badge/License-CC BY--NC--SA--4.0-lightgreen?style=flat&logo=Lisence' alt='License'>
-  </a>
-</div>
-The demo can only be used for <b>Non-commercial Use</b>.
-<br>If you like our work, please star <a href='https://jixiaozhong.github.io/Sonic/' style="margin: 0 2px;">Sonic</a>.
-<br>Note: Audio longer than 4s will be truncated due to computing resources.
 """
-TAIL = """
-<div style="display: flex; justify-content: center; align-items: center;">
-<a href="https://clustrmaps.com/site/1c38t"  title="ClustrMaps"><img src="//www.clustrmaps.com/map_v2.png?d=BI2nzSldyixPC88l8Kev4wjjqsU4IOk7gcvpOijolGI&cl=ffffff" /></a>
-</div>
-"""
-def get_example():
-    return [
-        ["examples/image/female_diaosu.png", "examples/wav/sing_female_rap_10s.MP3", 1.0],
-        ["examples/image/hair.png", "examples/wav/sing_female_10s.wav", 1.0],
-        ["examples/image/anime1.png", "examples/wav/talk_female_english_10s.MP3", 1.0],
-        ["examples/image/leonnado.jpg", "examples/wav/talk_male_law_10s.wav", 1.0],
-    ]
-with gr.Blocks(title="Sonic") as demo:
-    gr.Interface(fn=process_sonic, inputs=inputs, outputs=outputs, title="Sonic: Shifting Focus to Global Audio Perception in Portrait Animation", description=html_description)
     gr.Examples(
         examples=get_example(),
         fn=process_sonic,
-        inputs=inputs,
-        outputs=outputs,
-        cache_examples=False,)
-    gr.Markdown(TAIL)
-demo.launch()

 from pydub import AudioSegment
 import hashlib
 from sonic import Sonic
+from PIL import Image
+import torch
+# Initialize the model
 cmd = 'python3 -m pip install "huggingface_hub[cli]"; \
 huggingface-cli download LeonJoe13/Sonic --local-dir  checkpoints; \
 huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir  checkpoints/stable-video-diffusion-img2vid-xt; \
     md5 = md5hash.hexdigest()
     return md5
+@spaces.GPU(duration=300)  # Increased duration to handle longer videos
 def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
     expand_ratio = 0.5
     min_resolution = 512
     inference_steps = 25
+    # Get audio duration
+    audio = AudioSegment.from_file(audio_path)
+    duration = len(audio) / 1000.0  # Convert to seconds
     face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
+    print(f"Face detection info: {face_info}")
+    print(f"Audio duration: {duration} seconds")
     if face_info['face_num'] > 0:
         crop_image_path = img_path + '.crop.png'
         pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
         img_path = crop_image_path
         os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
+        # Process with full audio duration
+        pipe.process(
+            img_path,
+            audio_path,
+            res_video_path,
+            min_resolution=min_resolution,
+            inference_steps=inference_steps,
+            dynamic_scale=dynamic_scale,
+            duration=duration  # Pass the actual duration
+        )
     else:
         return -1
 tmp_path = './tmp_path/'
 res_path = './res_path/'
+os.makedirs(tmp_path, exist_ok=1)
+os.makedirs(res_path, exist_ok=1)
+def process_sonic(image, audio, dynamic_scale):
+    # Input validation
+    if image is None:
+        raise gr.Error("Please upload an image")
+    if audio is None:
+        raise gr.Error("Please upload an audio file")
+    img_md5 = get_md5(np.array(image))
     audio_md5 = get_md5(audio[1])
+    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")
     sampling_rate, arr = audio[:2]
+    if len(arr.shape) == 1:
+        arr = arr[:, None]
+    # Create audio segment
+    audio_segment = AudioSegment(
         arr.tobytes(),
         frame_rate=sampling_rate,
         sample_width=arr.dtype.itemsize,
         channels=arr.shape[1]
     )
+    audio_segment = audio_segment.set_frame_rate(sampling_rate)
+    # Generate paths
+    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
+    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
+    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))
+    # Save inputs if they don't exist
     if not os.path.exists(image_path):
         image.save(image_path)
     if not os.path.exists(audio_path):
+        audio_segment.export(audio_path, format="wav")
+    # Process or return cached result
     if os.path.exists(res_video_path):
+        print(f"Using cached result: {res_video_path}")
         return res_video_path
     else:
+        print(f"Generating new video with dynamic scale: {dynamic_scale}")
+        return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
+# Enhanced UI
+css = """
+.gradio-container {
+    font-family: 'Arial', sans-serif;
+}
+.main-header {
+    text-align: center;
+    color: #2a2a2a;
+    margin-bottom: 2em;
+}
+.parameter-section {
+    background-color: #f5f5f5;
+    padding: 1em;
+    border-radius: 8px;
+    margin: 1em 0;
+}
+.example-section {
+    margin-top: 2em;
+}
 """
+with gr.Blocks(css=css) as demo:
+    gr.HTML("""
+        <div class="main-header">
+            <h1>🎭 Sonic: Advanced Portrait Animation</h1>
+            <p>Transform still images into dynamic videos synchronized with audio</p>
+        </div>
+    """)
+    with gr.Row():
+        with gr.Column():
+            image_input = gr.Image(
+                type='pil',
+                label="Portrait Image",
+                elem_id="image_input",
+                tool="select"
+            )
+            audio_input = gr.Audio(
+                label="Voice/Audio Input",
+                elem_id="audio_input",
+                type="numpy"
+            )
+            with gr.Box(elem_classes="parameter-section"):
+                dynamic_scale = gr.Slider(
+                    minimum=0.5,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    label="Animation Intensity",
+                    info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
+                )
+            process_btn = gr.Button(
+                "Generate Animation",
+                variant="primary",
+                elem_id="process_btn"
+            )
+        with gr.Column():
+            video_output = gr.Video(
+                label="Generated Animation",
+                elem_id="video_output"
+            )
+    # Process button click
+    process_btn.click(
+        fn=process_sonic,
+        inputs=[image_input, audio_input, dynamic_scale],
+        outputs=video_output,
+        api_name="animate"
+    )
+    # Examples section
     gr.Examples(
         examples=get_example(),
         fn=process_sonic,
+        inputs=[image_input, audio_input, dynamic_scale],
+        outputs=video_output,
+        cache_examples=False,
+        elem_classes="example-section"
+    )
+    # Footer with attribution and links
+    gr.HTML("""
+        <div style="text-align: center; margin-top: 2em;">
+            <div style="margin-bottom: 1em;">
+                <a href="https://github.com/jixiaozhong/Sonic" target="_blank" style="text-decoration: none;">
+                    <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
+                </a>
+                <a href="https://arxiv.org/pdf/2411.16331" target="_blank" style="text-decoration: none;">
+                    <img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
+                </a>
+            </div>
+            <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio</p>
+        </div>
+    """)
+demo.launch()