Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -28,10 +28,10 @@ res_path = './res_path/'
|
|
28 |
os.makedirs(tmp_path, exist_ok=True)
|
29 |
os.makedirs(res_path, exist_ok=True)
|
30 |
|
31 |
-
@spaces.GPU(duration=
|
32 |
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
|
33 |
# ============================
|
34 |
-
# 1)
|
35 |
# 2) 원본 비율 유지(크롭 제거)
|
36 |
# ============================
|
37 |
|
@@ -40,25 +40,22 @@ def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
|
|
40 |
|
41 |
min_resolution = 512
|
42 |
|
43 |
-
## 수정됨:
|
44 |
-
inference_steps = 100 # 기존 25 -> 100
|
45 |
-
|
46 |
audio = AudioSegment.from_file(audio_path)
|
47 |
duration = len(audio) / 1000.0 # 초 단위
|
|
|
|
|
|
|
|
|
|
|
48 |
print(f"Audio duration: {duration} seconds, using inference_steps: {inference_steps}")
|
49 |
|
50 |
# 얼굴 인식 (face_info는 참고용)
|
51 |
face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
|
52 |
print(f"Face detection info: {face_info}")
|
53 |
|
54 |
-
# 얼굴이 하나라도 검출되면(>0),
|
55 |
-
# 원본 이미지 비율 유지를 위해 크롭 부분 제거
|
56 |
if face_info['face_num'] > 0:
|
57 |
-
## 수정됨: 아래 3줄 크롭 코드 제거
|
58 |
-
# crop_image_path = img_path + '.crop.png'
|
59 |
-
# pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
|
60 |
-
# img_path = crop_image_path
|
61 |
-
|
62 |
os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
|
63 |
|
64 |
# 원본 이미지를 그대로 전달
|
@@ -98,6 +95,12 @@ def process_sonic(image, audio, dynamic_scale):
|
|
98 |
)
|
99 |
audio_segment = audio_segment.set_frame_rate(sampling_rate)
|
100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
# 파일 경로 생성
|
102 |
image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
|
103 |
audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
|
@@ -145,7 +148,7 @@ with gr.Blocks(css=css) as demo:
|
|
145 |
gr.HTML("""
|
146 |
<div class="main-header">
|
147 |
<h1>🎭 Sonic: Advanced Portrait Animation</h1>
|
148 |
-
<p>Transform still images into dynamic videos synchronized with audio</p>
|
149 |
</div>
|
150 |
""")
|
151 |
|
@@ -158,7 +161,7 @@ with gr.Blocks(css=css) as demo:
|
|
158 |
)
|
159 |
|
160 |
audio_input = gr.Audio(
|
161 |
-
label="Voice/Audio Input",
|
162 |
elem_id="audio_input",
|
163 |
type="numpy"
|
164 |
)
|
@@ -209,9 +212,9 @@ with gr.Blocks(css=css) as demo:
|
|
209 |
<img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
|
210 |
</a>
|
211 |
</div>
|
212 |
-
<p>🔔 Note: For optimal results, use clear portrait images and high-quality audio</p>
|
213 |
</div>
|
214 |
""")
|
215 |
|
216 |
# 공개 링크 생성
|
217 |
-
demo.launch(share=True)
|
|
|
28 |
os.makedirs(tmp_path, exist_ok=True)
|
29 |
os.makedirs(res_path, exist_ok=True)
|
30 |
|
31 |
+
@spaces.GPU(duration=600) # 긴 비디오 처리를 위해 duration 600초로 설정 (10분)
|
32 |
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
|
33 |
# ============================
|
34 |
+
# 1) 오디오 길이에 따라 inference_steps 계산 (최대 1분까지)
|
35 |
# 2) 원본 비율 유지(크롭 제거)
|
36 |
# ============================
|
37 |
|
|
|
40 |
|
41 |
min_resolution = 512
|
42 |
|
43 |
+
## 수정됨: 오디오 길이에 따라 inference_steps 계산
|
|
|
|
|
44 |
audio = AudioSegment.from_file(audio_path)
|
45 |
duration = len(audio) / 1000.0 # 초 단위
|
46 |
+
|
47 |
+
# 오디오 길이에 따라 inference_steps 계산 (오디오 1초당 12.5 스텝, 소수점 이하 버림)
|
48 |
+
# 최소 25 스텝, 최대 750 스텝 (60초 x 12.5)으로 제한
|
49 |
+
inference_steps = min(max(int(duration * 12.5), 25), 750)
|
50 |
+
|
51 |
print(f"Audio duration: {duration} seconds, using inference_steps: {inference_steps}")
|
52 |
|
53 |
# 얼굴 인식 (face_info는 참고용)
|
54 |
face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
|
55 |
print(f"Face detection info: {face_info}")
|
56 |
|
57 |
+
# 얼굴이 하나라도 검출되면(>0), 원본 이미지 비율 유지
|
|
|
58 |
if face_info['face_num'] > 0:
|
|
|
|
|
|
|
|
|
|
|
59 |
os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
|
60 |
|
61 |
# 원본 이미지를 그대로 전달
|
|
|
95 |
)
|
96 |
audio_segment = audio_segment.set_frame_rate(sampling_rate)
|
97 |
|
98 |
+
# 오디오 길이 제한 확인 (최대 60초)
|
99 |
+
MAX_DURATION_MS = 60000 # 60초 (60,000ms)
|
100 |
+
if len(audio_segment) > MAX_DURATION_MS:
|
101 |
+
print(f"Audio longer than 60 seconds ({len(audio_segment)/1000:.2f}s). Truncating to 60 seconds.")
|
102 |
+
audio_segment = audio_segment[:MAX_DURATION_MS]
|
103 |
+
|
104 |
# 파일 경로 생성
|
105 |
image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
|
106 |
audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
|
|
|
148 |
gr.HTML("""
|
149 |
<div class="main-header">
|
150 |
<h1>🎭 Sonic: Advanced Portrait Animation</h1>
|
151 |
+
<p>Transform still images into dynamic videos synchronized with audio (up to 1 minute)</p>
|
152 |
</div>
|
153 |
""")
|
154 |
|
|
|
161 |
)
|
162 |
|
163 |
audio_input = gr.Audio(
|
164 |
+
label="Voice/Audio Input (up to 1 minute)",
|
165 |
elem_id="audio_input",
|
166 |
type="numpy"
|
167 |
)
|
|
|
212 |
<img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
|
213 |
</a>
|
214 |
</div>
|
215 |
+
<p>🔔 Note: For optimal results, use clear portrait images and high-quality audio (now supports up to 1 minute!)</p>
|
216 |
</div>
|
217 |
""")
|
218 |
|
219 |
# 공개 링크 생성
|
220 |
+
demo.launch(share=True)
|