openfree committed on
Commit ff37225 · verified · 1 Parent(s): 6db7147

Update app.py

Files changed (1)
  1. app.py +115 -103
app.py CHANGED
@@ -1,126 +1,138 @@
- import os, io, hashlib, gradio as gr, spaces
  from pydub import AudioSegment
- from PIL import Image
- from sonic import Sonic
-
- # --------------------------------------------------------------
- # 1) Download only the required checkpoints
- # --------------------------------------------------------------
- DL_CMDS = [
-     # Sonic (weights, UNet, etc.)
-     "huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints/LeonJoe13-Sonic --local-dir-use-symlinks False -q",
-     # stable-video-diffusion-img2vid-xt (VAE/UNet/CLIP)
-     "huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt "
-     "--local-dir checkpoints/stable-video-diffusion-img2vid-xt --local-dir-use-symlinks False -q",
-     # whisper-tiny
-     "huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny --local-dir-use-symlinks False -q",
- ]
- for cmd in DL_CMDS:
-     os.system(cmd)
-
- pipe = Sonic()  # model paths are resolved automatically from the downloads above

  # ------------------------------------------------------------------
- def md5(b: bytes) -> str:  # fast 32-char hex hash
      return hashlib.md5(b).hexdigest()

- TMP_DIR, RES_DIR = "./tmp_path", "./res_path"
- os.makedirs(TMP_DIR, exist_ok=True)
- os.makedirs(RES_DIR, exist_ok=True)

  # ------------------------------------------------------------------
- @spaces.GPU(duration=600)  # up to 10 minutes of GPU time
- def get_video_res(img_p: str, wav_p: str, out_p: str, scale: float):
-     """Run the actual Sonic pipeline (face check, frame generation, interpolation)."""
-     audio = AudioSegment.from_file(wav_p)
-     dur = len(audio) / 1000.0
-     steps = max(25, min(int(dur * 12.5), 750))  # assuming 12.5 fps
-
-     print(f"[INFO] Audio duration {dur:.2f}s ➜ steps {steps}")
-
-     face = pipe.preprocess(img_p)
-     print("[INFO] Face detection:", face)
-     if face["face_num"] == 0:
-         return -1  # no face detected
-
-     pipe.process(img_p, wav_p, out_p,
-                  min_resolution=512,
-                  inference_steps=steps,
-                  dynamic_scale=scale)
-     return out_p

  # ------------------------------------------------------------------
- def run_sonic(image, audio, scale):
-     """Gradio button handler (caching & preprocessing)."""
-     if image is None: raise gr.Error("Please upload an image.")
-     if audio is None: raise gr.Error("Please upload an audio file.")
-
-     # ---- Save image & hash -------------------------------------------------
-     buf_img = io.BytesIO(); image.save(buf_img, "PNG")
-     img_key = md5(buf_img.getvalue())
-     img_path = os.path.join(TMP_DIR, f"{img_key}.png")
      if not os.path.exists(img_path):
-         with open(img_path, "wb") as f: f.write(buf_img.getvalue())
-
-     # ---- Audio to mono/16 kHz WAV (≤60 s) ----------------------------------
-     sr, arr = audio[:2]
-     arr = arr if arr.ndim == 2 else arr[:, None]
-     seg = AudioSegment(arr.tobytes(), frame_rate=sr,
-                        sample_width=arr.dtype.itemsize,
-                        channels=arr.shape[1]
-                        ).set_channels(1).set_frame_rate(16_000)[:60_000]
-     buf_wav = io.BytesIO(); seg.export(buf_wav, format="wav")
-     wav_key = md5(buf_wav.getvalue())
-     wav_path = os.path.join(TMP_DIR, f"{wav_key}.wav")
-     if not os.path.exists(wav_path):
-         with open(wav_path, "wb") as f: f.write(buf_wav.getvalue())
-
-     # ---- Output file path ---------------------------------------------------
-     out_path = os.path.join(RES_DIR, f"{img_key}_{wav_key}_{scale}.mp4")
-
-     # ---- Cache check ---------------------------------------------------------
      if os.path.exists(out_path):
-         print("[INFO] Cached video used.")
          return out_path

-     print(f"[INFO] Generating video (scale={scale}) …")
-     res = get_video_res(img_path, wav_path, out_path, scale)
-     if res == -1:
-         raise gr.Error("No face detected in the image.")
-     return res

  # ------------------------------------------------------------------
- # Gradio UI
  # ------------------------------------------------------------------
- css = """
- .gradio-container {font-family: Arial, sans-serif;}
- .main-header {text-align:center;color:#2a2a2a;margin-bottom:2em;}
  """

- with gr.Blocks(css=css) as demo:
-     gr.HTML(
-         "<div class='main-header'>"
-         "<h1>🎭 Sonic: Portrait-to-Video Animator</h1>"
-         "<p>Create talking-head videos (≤60 s audio)</p>"
-         "</div>"
-     )

      with gr.Row():
          with gr.Column():
-             img_in = gr.Image(type="pil", label="Portrait Image")
-             aud_in = gr.Audio(type="numpy", label="Voice/Audio (≤60 s)")
-             scale = gr.Slider(0.5, 2.0, 1.0, 0.1,
-                               label="Animation Intensity")
-             btn = gr.Button("Generate", variant="primary")
-         vid_out = gr.Video(label="Generated Animation")
-
-     btn.click(run_sonic, inputs=[img_in, aud_in, scale], outputs=vid_out)
-
-     gr.HTML(
-         "<div style='text-align:center;margin-top:1.5em'>"
-         "<a href='https://github.com/jixiaozhong/Sonic' target='_blank'>GitHub</a> | "
-         "<a href='https://arxiv.org/pdf/2411.16331' target='_blank'>Paper</a>"
-         "</div>"
-     )

  demo.launch(share=True)
+ # app.py
+ import os, io, hashlib, spaces, gradio as gr
  from pydub import AudioSegment
+ from PIL import Image
+ import numpy as np
+ from sonic import Sonic  # <-- uses the modified sonic.py

  # ------------------------------------------------------------------
+ # 1. Install required libraries & download checkpoints
+ # ------------------------------------------------------------------
+ SETUP_CMD = (
+     # required libraries
+     'python -m pip install --quiet "huggingface_hub[cli]" accelerate pydub Pillow '
+     # Sonic checkpoints → checkpoints/Sonic/*
+     '&& huggingface-cli download LeonJoe13/Sonic '
+     '--local-dir checkpoints/Sonic --local-dir-use-symlinks False '
+     # Whisper-tiny (audio encoder)
+     '&& huggingface-cli download openai/whisper-tiny '
+     '--local-dir checkpoints/whisper-tiny --local-dir-use-symlinks False '
+ )
+ os.system(SETUP_CMD)
+
+ # ------------------------------------------------------------------
+ # 2. Initialize the pipeline (once only)
+ # ------------------------------------------------------------------
+ pipe = Sonic()  # now initializes without the earlier error
+
+ # ------------------------------------------------------------------
+ # 3. Utilities
+ # ------------------------------------------------------------------
+ def _md5(b: bytes) -> str:
      return hashlib.md5(b).hexdigest()

+ TMP_DIR = "tmp_path"
+ RES_DIR = "res_path"
+ os.makedirs(TMP_DIR, exist_ok=True)
+ os.makedirs(RES_DIR, exist_ok=True)

  # ------------------------------------------------------------------
+ # 4. Actual video generation (GPU task)
+ # ------------------------------------------------------------------
+ @spaces.GPU(duration=600)  # up to 10 minutes
+ def _render_video(img_path: str,
+                   audio_path: str,
+                   out_path: str,
+                   dynamic_scale: float = 1.0) -> str | int:
+
+     min_resolution = 512
+     audio = AudioSegment.from_file(audio_path)
+     duration_sec = len(audio) / 1000.0
+     steps = int(np.clip(duration_sec * 12.5, 25, 750))  # ≈12.5 steps per second, clamped to [25, 750]
+
+     print(f"[INFO] Audio duration={duration_sec:.2f}s → inference_steps={steps}")
+
+     face_info = pipe.preprocess(img_path)
+     print(f"[INFO] Face detection info: {face_info}")
+
+     if face_info["face_num"] == 0:
+         return -1  # no face detected
+
+     os.makedirs(os.path.dirname(out_path), exist_ok=True)
+     pipe.process(
+         img_path, audio_path, out_path,
+         min_resolution=min_resolution,
+         inference_steps=steps,
+         dynamic_scale=dynamic_scale,
+     )
+     return out_path

  # ------------------------------------------------------------------
+ # 5. Gradio wrapper
+ # ------------------------------------------------------------------
+ def run_sonic(image, audio, dynamic_scale):
+
+     if image is None:
+         raise gr.Error("Please upload an image.")
+     if audio is None:
+         raise gr.Error("Please upload an audio file.")
+
+     # ── Image cache ───────────────────────────────────────────────
+     buf_i = io.BytesIO(); image.save(buf_i, format="PNG")
+     img_hash = _md5(buf_i.getvalue())
+     img_path = os.path.join(TMP_DIR, f"{img_hash}.png")
      if not os.path.exists(img_path):
+         with open(img_path, "wb") as f: f.write(buf_i.getvalue())
+
+     # ── Audio cache (mono/16 kHz, ≤60 s) ─────────────────────────
+     rate, arr = audio[:2]
+     if arr.ndim == 1: arr = arr[:, None]
+     seg = AudioSegment(arr.tobytes(), frame_rate=rate,
+                        sample_width=arr.dtype.itemsize, channels=arr.shape[1])
+     seg = seg.set_channels(1).set_frame_rate(16000)[:60_000]
+     buf_a = io.BytesIO(); seg.export(buf_a, format="wav")
+     aud_hash = _md5(buf_a.getvalue())
+     aud_path = os.path.join(TMP_DIR, f"{aud_hash}.wav")
+     if not os.path.exists(aud_path):
+         with open(aud_path, "wb") as f: f.write(buf_a.getvalue())
+
+     # ── Output path ──────────────────────────────────────────────
+     out_path = os.path.join(
+         RES_DIR, f"{img_hash}_{aud_hash}_{dynamic_scale:.1f}.mp4"
+     )
+
      if os.path.exists(out_path):
+         print(f"[INFO] Cache hit → {out_path}")
          return out_path

+     print(f"[INFO] Generating video (dynamic_scale={dynamic_scale}) …")
+     return _render_video(img_path, aud_path, out_path, dynamic_scale)

  # ------------------------------------------------------------------
+ # 6. Gradio UI
  # ------------------------------------------------------------------
+ CSS = """
+ .gradio-container{font-family:Arial, sans-serif}
+ .main-header{text-align:center;color:#2a2a2a;margin-bottom:2em}
  """

+ with gr.Blocks(css=CSS) as demo:
+     gr.HTML("""
+     <div class="main-header">
+         <h1>🎭 Sonic - Portrait Animation</h1>
+         <p>Turn a single photo into a talking-head video (≤1 min audio)</p>
+     </div>""")

      with gr.Row():
          with gr.Column():
+             img_in = gr.Image(type="pil", label="Portrait Image")
+             aud_in = gr.Audio(label="Voice / Audio (≤60 s)", type="numpy")
+             scale = gr.Slider(0.5, 2.0, 1.0, step=0.1,
+                               label="Animation Intensity")
+             btn = gr.Button("Generate Animation", variant="primary")
+         with gr.Column():
+             vid_out = gr.Video(label="Result")
+
+     btn.click(run_sonic, [img_in, aud_in, scale], vid_out)

  demo.launch(share=True)
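
Note: the updated run_sonic keys every cached artifact on an MD5 hash of its bytes, so identical inputs always resolve to the same temp and result paths, and preprocessed audio is normalized to mono/16 kHz and capped at 60 s before hashing. The snippet below is a minimal standalone sketch of that idea outside Gradio, using only hashlib and pydub as in the diff; the function names and the tmp_path default are illustrative, not part of the committed app.

    import hashlib, io, os
    from pydub import AudioSegment

    def md5_key(data: bytes) -> str:
        # Content-addressed key: identical bytes always yield the same file name.
        return hashlib.md5(data).hexdigest()

    def normalize_audio(path: str, tmp_dir: str = "tmp_path") -> str:
        """Convert any audio file to mono/16 kHz WAV, cap at 60 s, and cache by hash."""
        seg = (AudioSegment.from_file(path)
               .set_channels(1)
               .set_frame_rate(16_000)[:60_000])   # mono, 16 kHz, first 60 seconds
        buf = io.BytesIO()
        seg.export(buf, format="wav")
        os.makedirs(tmp_dir, exist_ok=True)
        out = os.path.join(tmp_dir, f"{md5_key(buf.getvalue())}.wav")
        if not os.path.exists(out):                # cache hit → skip the write
            with open(out, "wb") as f:
                f.write(buf.getvalue())
        return out

Calling normalize_audio("voice.mp3") twice returns the same tmp_path/<md5>.wav and writes it only once, which is the same caching behavior the Gradio wrapper relies on for its result videos.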