openfree committed
Commit 137ab16 · verified · 1 Parent(s): 612b064

Update app.py

Files changed (1):
  1. app.py +148 -69
app.py CHANGED
@@ -5,7 +5,10 @@ import numpy as np
 from pydub import AudioSegment
 import hashlib
 from sonic import Sonic
+from PIL import Image
+import torch
 
+# Initialize the model
 cmd = 'python3 -m pip install "huggingface_hub[cli]"; \
 huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; \
 huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; \
@@ -19,108 +22,184 @@ def get_md5(content):
     md5 = md5hash.hexdigest()
     return md5
 
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=300)  # Increased duration to handle longer videos
 def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
-
     expand_ratio = 0.5
     min_resolution = 512
     inference_steps = 25
-
+
+    # Get audio duration
+    audio = AudioSegment.from_file(audio_path)
+    duration = len(audio) / 1000.0  # Convert to seconds
+
     face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
-    print(face_info)
+    print(f"Face detection info: {face_info}")
+    print(f"Audio duration: {duration} seconds")
+
     if face_info['face_num'] > 0:
         crop_image_path = img_path + '.crop.png'
         pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
         img_path = crop_image_path
         os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
-        pipe.process(img_path, audio_path, res_video_path, min_resolution=min_resolution, inference_steps=inference_steps, dynamic_scale=dynamic_scale)
+
+        # Process with full audio duration
+        pipe.process(
+            img_path,
+            audio_path,
+            res_video_path,
+            min_resolution=min_resolution,
+            inference_steps=inference_steps,
+            dynamic_scale=dynamic_scale,
+            duration=duration  # Pass the actual duration
+        )
     else:
         return -1
+
 tmp_path = './tmp_path/'
 res_path = './res_path/'
-os.makedirs(tmp_path,exist_ok=1)
-os.makedirs(res_path,exist_ok=1)
+os.makedirs(tmp_path, exist_ok=1)
+os.makedirs(res_path, exist_ok=1)
 
-def process_sonic(image,audio,s0):
-    img_md5= get_md5(np.array(image))
+def process_sonic(image, audio, dynamic_scale):
+    # Input validation
+    if image is None:
+        raise gr.Error("Please upload an image")
+    if audio is None:
+        raise gr.Error("Please upload an audio file")
+
+    img_md5 = get_md5(np.array(image))
     audio_md5 = get_md5(audio[1])
-    print(img_md5,audio_md5)
+    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")
+
     sampling_rate, arr = audio[:2]
-    if len(arr.shape)==1:
-        arr = arr[:,None]
-    audio = AudioSegment(
+    if len(arr.shape) == 1:
+        arr = arr[:, None]
+
+    # Create audio segment
+    audio_segment = AudioSegment(
         arr.tobytes(),
         frame_rate=sampling_rate,
         sample_width=arr.dtype.itemsize,
         channels=arr.shape[1]
     )
-    audio = audio.set_frame_rate(sampling_rate)
-    image_path = os.path.abspath(tmp_path+'{0}.png'.format(img_md5))
-    audio_path = os.path.abspath(tmp_path+'{0}.wav'.format(audio_md5))
+    audio_segment = audio_segment.set_frame_rate(sampling_rate)
+
+    # Generate paths
+    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
+    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
+    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))
+
+    # Save inputs if they don't exist
     if not os.path.exists(image_path):
         image.save(image_path)
     if not os.path.exists(audio_path):
-        audio.export(audio_path, format="wav")
-    res_video_path = os.path.abspath(res_path+f'{img_md5}_{audio_md5}_{s0}.mp4')
+        audio_segment.export(audio_path, format="wav")
+
+    # Process or return cached result
     if os.path.exists(res_video_path):
+        print(f"Using cached result: {res_video_path}")
         return res_video_path
     else:
-        get_video_res(image_path, audio_path, res_video_path,s0)
-        return res_video_path
-
-inputs = [
-    gr.Image(type='pil',label="Upload Image"),
-    gr.Audio(label="Upload Audio"),
-    gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Dynamic scale", info="Increase/decrease to obtain more/less movements"),
-]
-outputs = gr.Video(label="output.mp4")
-
-
-html_description = """
-<div style="display: flex; justify-content: center; align-items: center;">
-    <a href="https://github.com/jixiaozhong/Sonic.git" style="margin: 0 2px;">
-        <img src='https://img.shields.io/badge/GitHub-Repo-blue?style=flat&logo=GitHub' alt='GitHub'>
-    </a>
-    <a href="https://arxiv.org/pdf/2411.16331" style="margin: 0 2px;">
-        <img src='https://img.shields.io/badge/arXiv-2411.16331-red?style=flat&logo=arXiv&logoColor=red' alt='arxiv'>
-    </a>
-    <a href='https://jixiaozhong.github.io/Sonic/' style="margin: 0 2px;">
-        <img src='https://img.shields.io/badge/Webpage-Project-silver?style=flat&logo=&logoColor=orange' alt='webpage'>
-    </a>
-    <a href="https://github.com/jixiaozhong/Sonic/blob/main/LICENSE" style="margin: 0 2px;">
-        <img src='https://img.shields.io/badge/License-CC BY--NC--SA--4.0-lightgreen?style=flat&logo=Lisence' alt='License'>
-    </a>
-</div>
+        print(f"Generating new video with dynamic scale: {dynamic_scale}")
+        return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
 
-The demo can only be used for <b>Non-commercial Use</b>.
-<br>If you like our work, please star <a href='https://jixiaozhong.github.io/Sonic/' style="margin: 0 2px;">Sonic</a>.
-<br>Note: Audio longer than 4s will be truncated due to computing resources.
+# Enhanced UI
+css = """
+.gradio-container {
+    font-family: 'Arial', sans-serif;
+}
+.main-header {
+    text-align: center;
+    color: #2a2a2a;
+    margin-bottom: 2em;
+}
+.parameter-section {
+    background-color: #f5f5f5;
+    padding: 1em;
+    border-radius: 8px;
+    margin: 1em 0;
+}
+.example-section {
+    margin-top: 2em;
+}
 """
-TAIL = """
-<div style="display: flex; justify-content: center; align-items: center;">
-    <a href="https://clustrmaps.com/site/1c38t" title="ClustrMaps"><img src="//www.clustrmaps.com/map_v2.png?d=BI2nzSldyixPC88l8Kev4wjjqsU4IOk7gcvpOijolGI&cl=ffffff" /></a>
-</div>
-"""
-
-def get_example():
-    return [
-        ["examples/image/female_diaosu.png", "examples/wav/sing_female_rap_10s.MP3", 1.0],
-        ["examples/image/hair.png", "examples/wav/sing_female_10s.wav", 1.0],
-        ["examples/image/anime1.png", "examples/wav/talk_female_english_10s.MP3", 1.0],
-        ["examples/image/leonnado.jpg", "examples/wav/talk_male_law_10s.wav", 1.0],
-
-    ]
 
-with gr.Blocks(title="Sonic") as demo:
-    gr.Interface(fn=process_sonic, inputs=inputs, outputs=outputs, title="Sonic: Shifting Focus to Global Audio Perception in Portrait Animation", description=html_description)
+with gr.Blocks(css=css) as demo:
+    gr.HTML("""
+    <div class="main-header">
+        <h1>🎭 Sonic: Advanced Portrait Animation</h1>
+        <p>Transform still images into dynamic videos synchronized with audio</p>
+    </div>
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            image_input = gr.Image(
+                type='pil',
+                label="Portrait Image",
+                elem_id="image_input",
+                tool="select"
+            )
+
+            audio_input = gr.Audio(
+                label="Voice/Audio Input",
+                elem_id="audio_input",
+                type="numpy"
+            )
+
+            with gr.Box(elem_classes="parameter-section"):
+                dynamic_scale = gr.Slider(
+                    minimum=0.5,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    label="Animation Intensity",
+                    info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
+                )
+
+            process_btn = gr.Button(
+                "Generate Animation",
+                variant="primary",
+                elem_id="process_btn"
+            )
+
+        with gr.Column():
+            video_output = gr.Video(
+                label="Generated Animation",
+                elem_id="video_output"
+            )
+
+    # Process button click
+    process_btn.click(
+        fn=process_sonic,
+        inputs=[image_input, audio_input, dynamic_scale],
+        outputs=video_output,
+        api_name="animate"
+    )
+
+    # Examples section
     gr.Examples(
         examples=get_example(),
         fn=process_sonic,
-        inputs=inputs,
-        outputs=outputs,
-        cache_examples=False,)
-    gr.Markdown(TAIL)
+        inputs=[image_input, audio_input, dynamic_scale],
+        outputs=video_output,
+        cache_examples=False,
+        elem_classes="example-section"
+    )
 
-demo.launch()
-
+    # Footer with attribution and links
+    gr.HTML("""
+    <div style="text-align: center; margin-top: 2em;">
+        <div style="margin-bottom: 1em;">
+            <a href="https://github.com/jixiaozhong/Sonic" target="_blank" style="text-decoration: none;">
+                <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
+            </a>
+            <a href="https://arxiv.org/pdf/2411.16331" target="_blank" style="text-decoration: none;">
+                <img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
+            </a>
+        </div>
+        <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio</p>
+    </div>
+    """)
 
+demo.launch()
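
A note on the audio handling in this commit: Gradio's numpy-type Audio component hands the app a (sampling_rate, samples) tuple, and process_sonic rebuilds a pydub AudioSegment from the raw samples before exporting a WAV, while get_video_res re-reads that WAV to compute the clip duration it now forwards to pipe.process (the duration= keyword assumes the Sonic pipeline accepts it). A minimal standalone sketch of the same conversion; the helper name tuple_to_wav is hypothetical:

import numpy as np
from pydub import AudioSegment

def tuple_to_wav(audio, wav_path):
    """Convert a Gradio (sampling_rate, samples) tuple to a WAV file and
    return the clip duration in seconds. Assumes integer PCM samples
    (e.g. int16); float arrays would need converting first."""
    sampling_rate, arr = audio
    if arr.ndim == 1:                         # mono: add a channel axis -> (n, 1)
        arr = arr[:, None]
    segment = AudioSegment(
        arr.tobytes(),                        # interleaved raw PCM bytes
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,      # bytes per sample
        channels=arr.shape[1],
    )
    segment.export(wav_path, format="wav")
    return len(segment) / 1000.0              # pydub lengths are in milliseconds

# Example: one second of int16 silence at 16 kHz -> duration == 1.0
duration = tuple_to_wav((16000, np.zeros(16000, dtype=np.int16)), "out.wav")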
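The caching scheme in process_sonic is also worth calling out: each result is keyed on the MD5 of the image pixels, the MD5 of the raw audio samples, and the dynamic-scale value, so identical requests return the saved MP4 instead of re-entering the GPU queue. A minimal sketch of the same pattern, with render standing in (hypothetically) for get_video_res:

import hashlib
import os
import numpy as np

def get_md5(content):
    # Hash the raw bytes so identical uploads map to the same key
    return hashlib.md5(np.asarray(content).tobytes()).hexdigest()

def cached_render(image_arr, audio_arr, scale, render, res_dir="./res_path"):
    # Cache key combines both content hashes and the generation parameter;
    # changing any of the three forces a fresh render
    key = f"{get_md5(image_arr)}_{get_md5(audio_arr)}_{scale}"
    out_path = os.path.abspath(os.path.join(res_dir, f"{key}.mp4"))
    if os.path.exists(out_path):
        return out_path            # cache hit: skip generation entirely
    os.makedirs(res_dir, exist_ok=True)
    render(out_path)               # cache miss: generate and persist
    return out_path

Because dynamic_scale is part of the key, moving the slider produces a new file rather than overwriting a cached one.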