openfree committed on
Commit
cca593e
·
verified ·
1 Parent(s): 723bc72

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -29
app.py CHANGED
@@ -11,7 +11,7 @@ import torch
11
 
12
  # 초기 실행 시 필요한 모델들을 다운로드
13
  cmd = (
14
- 'python3 -m pip install "huggingface_hub[cli]" accelerate; ' # accelerate도 같이 설치 권장
15
  'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
16
  'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
17
  'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
@@ -39,25 +39,21 @@ def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
39
  expand_ratio = 0.0
40
  min_resolution = 512
41
 
42
- # 오디오 길이
43
  audio = AudioSegment.from_file(audio_path)
44
  duration = len(audio) / 1000.0 # 초 단위
45
 
46
- # 오디오 길이에 따라 inference_steps 계산 (초당 12.5 프레임)
47
- # 최소 25 프레임, 최대 750 프레임 (60초 => 60*12.5=750)
48
  inference_steps = min(max(int(duration * 12.5), 25), 750)
49
-
50
  print(f"[INFO] Audio duration: {duration:.2f} seconds, using inference_steps={inference_steps}")
51
 
52
- # 얼굴 인식 (face_info는 참고용)
53
  face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
54
  print(f"[INFO] Face detection info: {face_info}")
55
 
56
- # 얼굴이 하나라도 검출되면(>0), 원본 이미지 비율 유지
57
  if face_info['face_num'] > 0:
58
  os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
59
-
60
- # Sonic pipeline으로 비디오 생성
61
  pipe.process(
62
  img_path,
63
  audio_path,
@@ -68,43 +64,40 @@ def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
68
  )
69
  return res_video_path
70
  else:
 
71
  return -1
72
 
73
  def process_sonic(image, audio, dynamic_scale):
74
  """
75
- Gradio 인터페이스 상에서 호출되는 함수.
76
- 1. 이미지/오디오 입력 검증
77
- 2. MD5 해시 통해 파일명 생성 후 캐싱
78
- 3. 이미 결과 파일이 있으면 재활용, 없으면 새로 비디오 생성
79
  """
80
  if image is None:
81
  raise gr.Error("Please upload an image")
82
  if audio is None:
83
  raise gr.Error("Please upload an audio file")
84
 
85
- # 이미지 MD5 해시 계산
86
  buf_img = io.BytesIO()
87
  image.save(buf_img, format="PNG")
88
  img_bytes = buf_img.getvalue()
89
  img_md5 = get_md5(img_bytes)
90
 
91
- # 오디오 MD5 해시 계산
92
  sampling_rate, arr = audio[:2]
93
  if len(arr.shape) == 1:
94
  arr = arr[:, None]
95
-
96
  audio_segment = AudioSegment(
97
  arr.tobytes(),
98
  frame_rate=sampling_rate,
99
  sample_width=arr.dtype.itemsize,
100
  channels=arr.shape[1]
101
  )
 
 
102
 
103
- # (중요) Whisper 호환을 위해 mono/16kHz 변환
104
- audio_segment = audio_segment.set_channels(1)
105
- audio_segment = audio_segment.set_frame_rate(16000)
106
-
107
- # 최대 60초 제한
108
  MAX_DURATION_MS = 60000
109
  if len(audio_segment) > MAX_DURATION_MS:
110
  audio_segment = audio_segment[:MAX_DURATION_MS]
@@ -114,12 +107,11 @@ def process_sonic(image, audio, dynamic_scale):
114
  audio_bytes = buf_audio.getvalue()
115
  audio_md5 = get_md5(audio_bytes)
116
 
117
- # 파일 경로 생성
118
  image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
119
  audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
120
  res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))
121
 
122
- # 이미지/오디오 파일 캐싱
123
  if not os.path.exists(image_path):
124
  with open(image_path, "wb") as f:
125
  f.write(img_bytes)
@@ -127,7 +119,7 @@ def process_sonic(image, audio, dynamic_scale):
127
  with open(audio_path, "wb") as f:
128
  f.write(audio_bytes)
129
 
130
- # 이미 결과가 존재하면 캐시된 결과 사용
131
  if os.path.exists(res_video_path):
132
  print(f"[INFO] Using cached result: {res_video_path}")
133
  return res_video_path
@@ -137,7 +129,6 @@ def process_sonic(image, audio, dynamic_scale):
137
  return video_result
138
 
139
  def get_example():
140
- """예시 데이터를 로딩하는 더미 함수 (현재는 빈 리스트)."""
141
  return []
142
 
143
  css = """
@@ -175,13 +166,11 @@ with gr.Blocks(css=css) as demo:
175
  label="Portrait Image",
176
  elem_id="image_input"
177
  )
178
-
179
  audio_input = gr.Audio(
180
  label="Voice/Audio Input (up to 1 minute)",
181
  elem_id="audio_input",
182
  type="numpy"
183
  )
184
-
185
  with gr.Column():
186
  dynamic_scale = gr.Slider(
187
  minimum=0.5,
@@ -191,7 +180,6 @@ with gr.Blocks(css=css) as demo:
191
  label="Animation Intensity",
192
  info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
193
  )
194
-
195
  process_btn = gr.Button(
196
  "Generate Animation",
197
  variant="primary",
@@ -232,5 +220,4 @@ with gr.Blocks(css=css) as demo:
232
  </div>
233
  """)
234
 
235
- # 공개 링크 생성
236
  demo.launch(share=True)
 
11
 
12
  # 초기 실행 시 필요한 모델들을 다운로드
13
  cmd = (
14
+ 'python3 -m pip install "huggingface_hub[cli]" accelerate; '
15
  'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints; '
16
  'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt; '
17
  'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny;'
 
39
  expand_ratio = 0.0
40
  min_resolution = 512
41
 
42
+ # 오디오 길이 계산
43
  audio = AudioSegment.from_file(audio_path)
44
  duration = len(audio) / 1000.0 # 초 단위
45
 
46
+ # 오디오 길이에 따라 inference_steps 결정 (최소 25프레임 ~ 최대 750프레임)
 
47
  inference_steps = min(max(int(duration * 12.5), 25), 750)
 
48
  print(f"[INFO] Audio duration: {duration:.2f} seconds, using inference_steps={inference_steps}")
49
 
50
+ # 얼굴 인식
51
  face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
52
  print(f"[INFO] Face detection info: {face_info}")
53
 
54
+ # 얼굴이 하나라도 검출되면 -> pipeline 진행
55
  if face_info['face_num'] > 0:
56
  os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
 
 
57
  pipe.process(
58
  img_path,
59
  audio_path,
 
64
  )
65
  return res_video_path
66
  else:
67
+ # 얼굴이 전혀 없으면 -1 리턴
68
  return -1
69
 
70
  def process_sonic(image, audio, dynamic_scale):
71
  """
72
+ Gradio 인터페이스에서 호출되는 함수:
73
+ 1. 이미지/오디오 검사
74
+ 2. MD5 해시 -> 파일명
75
+ 3. 캐시 검사 -> 없으면 영상 생성
76
  """
77
  if image is None:
78
  raise gr.Error("Please upload an image")
79
  if audio is None:
80
  raise gr.Error("Please upload an audio file")
81
 
82
+ # (1) 이미지 MD5
83
  buf_img = io.BytesIO()
84
  image.save(buf_img, format="PNG")
85
  img_bytes = buf_img.getvalue()
86
  img_md5 = get_md5(img_bytes)
87
 
88
+ # (2) 오디오 MD5
89
  sampling_rate, arr = audio[:2]
90
  if len(arr.shape) == 1:
91
  arr = arr[:, None]
 
92
  audio_segment = AudioSegment(
93
  arr.tobytes(),
94
  frame_rate=sampling_rate,
95
  sample_width=arr.dtype.itemsize,
96
  channels=arr.shape[1]
97
  )
98
+ # Whisper 호환을 위해 mono/16kHz로 변환
99
+ audio_segment = audio_segment.set_channels(1).set_frame_rate(16000)
100
 
 
 
 
 
 
101
  MAX_DURATION_MS = 60000
102
  if len(audio_segment) > MAX_DURATION_MS:
103
  audio_segment = audio_segment[:MAX_DURATION_MS]
 
107
  audio_bytes = buf_audio.getvalue()
108
  audio_md5 = get_md5(audio_bytes)
109
 
110
+ # (3) 파일 경로
111
  image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
112
  audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
113
  res_video_path = os.path.abspath(os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4'))
114
 
 
115
  if not os.path.exists(image_path):
116
  with open(image_path, "wb") as f:
117
  f.write(img_bytes)
 
119
  with open(audio_path, "wb") as f:
120
  f.write(audio_bytes)
121
 
122
+ # (4) 캐싱된 결과가 있으면 재사용
123
  if os.path.exists(res_video_path):
124
  print(f"[INFO] Using cached result: {res_video_path}")
125
  return res_video_path
 
129
  return video_result
130
 
131
  def get_example():
 
132
  return []
133
 
134
  css = """
 
166
  label="Portrait Image",
167
  elem_id="image_input"
168
  )
 
169
  audio_input = gr.Audio(
170
  label="Voice/Audio Input (up to 1 minute)",
171
  elem_id="audio_input",
172
  type="numpy"
173
  )
 
174
  with gr.Column():
175
  dynamic_scale = gr.Slider(
176
  minimum=0.5,
 
180
  label="Animation Intensity",
181
  info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)"
182
  )
 
183
  process_btn = gr.Button(
184
  "Generate Animation",
185
  variant="primary",
 
220
  </div>
221
  """)
222
 
 
223
  demo.launch(share=True)