openfree committed · verified
Commit 0c85fa1 · 1 Parent(s): c3eebaf

Update app.py

Files changed (1): app.py (+3 / -247)

app.py CHANGED
@@ -11,7 +11,7 @@ import torch
 
 # Download the models required on first run
 cmd = (
-    'python3 -m pip install "huggingface_hub[cli]"; '
+    'python3 -m pip install "huggingface_hub[cli]" accelerate; '  # installing accelerate as well is recommended
     'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
     'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
     'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
@@ -20,7 +20,7 @@ os.system(cmd)
 
 pipe = Sonic()
 
-def get_md5(content_bytes):
+def get_md5(content_bytes: bytes):
     """Compute the MD5 hash and return it as a 32-character hex string."""
     return hashlib.md5(content_bytes).hexdigest()
 
@@ -29,7 +29,6 @@ res_path = './res_path/'
 os.makedirs(tmp_path, exist_ok=True)
 os.makedirs(res_path, exist_ok=True)
 
-
 @spaces.GPU(duration=600)  # duration set to 600 s (10 min) to handle long videos
 def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
     """
@@ -48,7 +47,7 @@ def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
     # at least 25 frames, at most 750 frames (60 s => 60 * 12.5 = 750)
     inference_steps = min(max(int(duration * 12.5), 25), 750)
 
-    print(f"[INFO] Audio duration: {duration:.2f} seconds, using inference_steps: {inference_steps}")
+    print(f"[INFO] Audio duration: {duration:.2f} seconds, using inference_steps={inference_steps}")
 
     # face detection (face_info is for reference)
     face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
@@ -71,7 +70,6 @@ def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
     else:
         return -1
 
-
 def process_sonic(image, audio, dynamic_scale):
     """
     Function called from the Gradio interface.
@@ -138,252 +136,10 @@ def process_sonic(image, audio, dynamic_scale):
     video_result = get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
     return video_result
 
-
 def get_example():
     """Dummy function for loading example data (currently an empty list)."""
     return []
 
-
-css = """
-.gradio-container {
-    font-family: 'Arial', sans-serif;
-}
-.main-header {
-    text-align: center;
-    color: #2a2a2a;
-    margin-bottom: 2em;
-}
-.parameter-section {
-    background-color: #f5f5f5;
-    padding: 1em;
-    border-radius: 8px;
-    margin: 1em 0;
-}
-.example-section {
-    margin-top: 2em;
-}
-"""
-
-with gr.Blocks(css=css) as demo:
-    gr.HTML("""
-    <div class="main-header">
-        <h1>🎭 Sonic: Advanced Portrait Animation</h1>
-        <p>Transform still images into dynamic videos synchronized with audio (up to 1 minute)</p>
-    </div>
-    """)
-
-    with gr.Row():
-        with gr.Column():
-            image_input = gr.Image(
-                type='pil',
-                label="Portrait Image",
-                elem_id="image_input"
-            )
-
-            audio_input = gr.Audio(
-                label="Voice/Audio Input (up to 1 minute)",
-                elem_id="audio_input",
-                type="numpy"
-            )
-
-        with gr.Column():
-            dynamic_scale = gr.Slider(
-                minimum=0.5,
-                maximum=2.0,
-                value=1.0,
-                step=0.1,
-                label="Animation Intensity",
-                info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
-            )
-
-            process_btn = gr.Button(
-                "Generate Animation",
-                variant="primary",
-                elem_id="process_btn"
-            )
-
-        with gr.Column():
-            video_output = gr.Video(
-                label="Generated Animation",
-                elem_id="video_output"
-            )
-
-    process_btn.click(
-        fn=process_sonic,
-        inputs=[image_input, audio_input, dynamic_scale],
-        outputs=video_output,
-    )
-
-    gr.Examples(
-        examples=get_example(),
-        fn=process_sonic,
-        inputs=[image_input, audio_input, dynamic_scale],
-        outputs=video_output,
-        cache_examples=False
-    )
-
-    gr.HTML("""
-    <div style="text-align: center; margin-top: 2em;">
-        <div style="margin-bottom: 1em;">
-            <a href="https://github.com/jixiaozhong/Sonic" target="_blank" style="text-decoration: none;">
-                <img src="https://img.shields.io/badge/GitHub-Repo-blue?style=for-the-badge&logo=github" alt="GitHub Repo">
-            </a>
-            <a href="https://arxiv.org/pdf/2411.16331" target="_blank" style="text-decoration: none;">
-                <img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
-            </a>
-        </div>
-        <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio (now supports up to 1 minute!)</p>
-    </div>
-    """)
-
-# create a public share link
-demo.launch(share=True)
-import spaces
-import gradio as gr
-import os
-import numpy as np
-from pydub import AudioSegment
-import hashlib
-import io
-from sonic import Sonic
-from PIL import Image
-import torch
-
-# Download the models required on first run
-cmd = (
-    'python3 -m pip install "huggingface_hub[cli]"; '
-    'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
-    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
-    'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
-)
-os.system(cmd)
-
-pipe = Sonic()
-
-def get_md5(content_bytes):
-    """Compute the MD5 hash and return it as a 32-character hex string."""
-    return hashlib.md5(content_bytes).hexdigest()
-
-tmp_path = './tmp_path/'
-res_path = './res_path/'
-os.makedirs(tmp_path, exist_ok=True)
-os.makedirs(res_path, exist_ok=True)
-
-
-@spaces.GPU(duration=600)  # duration set to 600 s (10 min) to handle long videos
-def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
-    """
-    Generate the actual video with the Sonic pipeline.
-    Determines inference_steps for audio up to 60 s long,
-    then runs face detection and video generation.
-    """
-    expand_ratio = 0.0
-    min_resolution = 512
-
-    # audio length
-    audio = AudioSegment.from_file(audio_path)
-    duration = len(audio) / 1000.0  # in seconds
-
-    # compute inference_steps from the audio length (about 12.5 frames per second)
-    # at least 25 frames, at most 750 frames (60 s => 60 * 12.5 = 750)
-    inference_steps = min(max(int(duration * 12.5), 25), 750)
-
-    print(f"[INFO] Audio duration: {duration:.2f} seconds, using inference_steps: {inference_steps}")
-
-    # face detection (face_info is for reference)
-    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
-    print(f"[INFO] Face detection info: {face_info}")
-
-    # if at least one face is detected (> 0), keep the original image aspect ratio
-    if face_info['face_num'] > 0:
-        os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
-
-        # generate the video with the Sonic pipeline
-        pipe.process(
-            img_path,
-            audio_path,
-            res_video_path,
-            min_resolution=min_resolution,
-            inference_steps=inference_steps,
-            dynamic_scale=dynamic_scale
-        )
-        return res_video_path
-    else:
-        return -1
-
-
-def process_sonic(image, audio, dynamic_scale):
-    """
-    Function called from the Gradio interface.
-    1. Validate the image/audio inputs
-    2. Build file names from MD5 hashes and cache the files
-    3. Reuse an existing result file if present, otherwise generate a new video
-    """
-    if image is None:
-        raise gr.Error("Please upload an image")
-    if audio is None:
-        raise gr.Error("Please upload an audio file")
-
-    # compute the MD5 hash of the image
-    buf_img = io.BytesIO()
-    image.save(buf_img, format="PNG")
-    img_bytes = buf_img.getvalue()
-    img_md5 = get_md5(img_bytes)
-
-    # compute the MD5 hash of the audio
-    sampling_rate, arr = audio[:2]
-    if len(arr.shape) == 1:
-        arr = arr[:, None]
-
-    audio_segment = AudioSegment(
-        arr.tobytes(),
-        frame_rate=sampling_rate,
-        sample_width=arr.dtype.itemsize,
-        channels=arr.shape[1]
-    )
-
-    # (important) convert to mono/16 kHz for Whisper compatibility
-    audio_segment = audio_segment.set_channels(1)
-    audio_segment = audio_segment.set_frame_rate(16000)
-
-    # limit to 60 seconds at most
-    MAX_DURATION_MS = 60000
-    if len(audio_segment) > MAX_DURATION_MS:
-        audio_segment = audio_segment[:MAX_DURATION_MS]
-
-    buf_audio = io.BytesIO()
-    audio_segment.export(buf_audio, format="wav")
-    audio_bytes = buf_audio.getvalue()
-    audio_md5 = get_md5(audio_bytes)
-
-    # build the file paths
-    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
-    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
-    res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))
-
-    # cache the image/audio files
-    if not os.path.exists(image_path):
-        with open(image_path, "wb") as f:
-            f.write(img_bytes)
-    if not os.path.exists(audio_path):
-        with open(audio_path, "wb") as f:
-            f.write(audio_bytes)
-
-    # use the cached result if it already exists
-    if os.path.exists(res_video_path):
-        print(f"[INFO] Using cached result: {res_video_path}")
-        return res_video_path
-    else:
-        print(f"[INFO] Generating new video with dynamic_scale={dynamic_scale}")
-        video_result = get_video_res(image_path, audio_path, res_video_path, dynamic_scale)
-        return video_result
-
-
-def get_example():
-    """Dummy function for loading example data (currently an empty list)."""
-    return []
-
-
 css = """
 .gradio-container {
     font-family: 'Arial', sans-serif;
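
Note: the log line touched in the `@@ -48,7 +47,7 @@` hunk reports a step count obtained by clamping `duration * 12.5` (about 12.5 frames per second of audio) to the range [25, 750]. A minimal sketch of that clamping with a few worked values; the helper name is illustrative, not part of app.py:

```python
# ~12.5 frames per second of audio, floored at 25 frames and capped at
# 750 frames (60 s * 12.5), exactly as in get_video_res.
def clamp_inference_steps(duration_s: float) -> int:
    return min(max(int(duration_s * 12.5), 25), 750)

assert clamp_inference_steps(1.0) == 25    # short clips hit the 25-frame floor
assert clamp_inference_steps(10.0) == 125  # 10 s * 12.5 = 125 frames
assert clamp_inference_steps(90.0) == 750  # anything past 60 s hits the cap
```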
 
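Note: the startup block shells out to `huggingface-cli` through `os.system`. Since `huggingface_hub` is installed anyway, an in-process alternative (a sketch, not what this commit does) could call `snapshot_download` with the same repo ids and target directories:

```python
from huggingface_hub import snapshot_download

# In-process equivalent of the three huggingface-cli download calls.
snapshot_download("LeonJoe13/Sonic", local_dir="checkpoints")
snapshot_download(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    local_dir="checkpoints/stable-video-diffusion-img2vid-xt",
)
snapshot_download("openai/whisper-tiny", local_dir="checkpoints/whisper-tiny")
```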
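Note: `process_sonic` (shown as context in the final hunk) converts Gradio's `(sampling_rate, array)` audio tuple to mono 16 kHz WAV bytes before hashing, since the bundled Whisper model expects 16 kHz mono input. A self-contained sketch of that conversion; `to_whisper_wav` is a hypothetical helper, and int16 samples (Gradio's usual output) are assumed:

```python
import io

import numpy as np
from pydub import AudioSegment

def to_whisper_wav(sampling_rate: int, arr: np.ndarray, max_ms: int = 60_000) -> bytes:
    """Mirror process_sonic: mono, 16 kHz, truncated to 60 s, exported as WAV bytes."""
    if arr.ndim == 1:
        arr = arr[:, None]  # promote mono samples to an (n, 1) channel layout
    seg = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1],
    )
    seg = seg.set_channels(1).set_frame_rate(16000)[:max_ms]
    buf = io.BytesIO()
    seg.export(buf, format="wav")
    return buf.getvalue()
```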
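Note: the same hunk shows the caching scheme: the image, the normalized audio, and the rendered MP4 are all stored under content-addressed names, so resubmitting identical inputs returns the cached video without re-rendering. A minimal sketch of the path derivation, with illustrative helper names:

```python
import hashlib
import os

def md5_hex(data: bytes) -> str:
    # Same as app.py's get_md5: a 32-character hex digest.
    return hashlib.md5(data).hexdigest()

def cached_video_path(img_bytes: bytes, audio_bytes: bytes, dynamic_scale: float,
                      res_path: str = "./res_path/") -> str:
    # Identical (image, audio, scale) triples map to the same file name,
    # which is what lets process_sonic short-circuit when the file exists.
    name = f"{md5_hex(img_bytes)}_{md5_hex(audio_bytes)}_{dynamic_scale}.mp4"
    return os.path.abspath(os.path.join(res_path, name))
```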