Spaces:

VIDraft
/

Portrait-Animation

Running on Zero

App Files Files Community

openfree committed on May 10

Commit

2a1d7cf

verified ·

1 Parent(s): 68cbd36

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -16

app.py CHANGED Viewed

@@ -28,10 +28,10 @@ res_path = './res_path/'
 os.makedirs(tmp_path, exist_ok=True)
 os.makedirs(res_path, exist_ok=True)
-@spaces.GPU(duration=300)  # 긴 비디오 처리를 위해 duration 300초로 설정
 def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
     # ============================
-    # 1) 4초(프레임 50)로 늘리기
     # 2) 원본 비율 유지(크롭 제거)
     # ============================
@@ -40,25 +40,22 @@ def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
     min_resolution = 512
-    ## 수정됨: 4초 분량 = 50프레임
-    inference_steps = 100  # 기존 25 -> 50
     audio = AudioSegment.from_file(audio_path)
     duration = len(audio) / 1000.0  # 초 단위
     print(f"Audio duration: {duration} seconds, using inference_steps: {inference_steps}")
     # 얼굴 인식 (face_info는 참고용)
     face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
     print(f"Face detection info: {face_info}")
-    # 얼굴이 하나라도 검출되면(>0), 기존에는 크롭 과정을 진행했으나,
-    # 원본 이미지 비율 유지를 위해 크롭 부분 제거
     if face_info['face_num'] > 0:
-        ## 수정됨: 아래 3줄 크롭 코드 제거
-        # crop_image_path = img_path + '.crop.png'
-        # pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
-        # img_path = crop_image_path
         os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
         # 원본 이미지를 그대로 전달
@@ -98,6 +95,12 @@ def process_sonic(image, audio, dynamic_scale):
     )
     audio_segment = audio_segment.set_frame_rate(sampling_rate)
     # 파일 경로 생성
     image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
     audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
@@ -145,7 +148,7 @@ with gr.Blocks(css=css) as demo:
     gr.HTML("""
         <div class="main-header">
             <h1>🎭 Sonic: Advanced Portrait Animation</h1>
-            <p>Transform still images into dynamic videos synchronized with audio</p>
         </div>
     """)
@@ -158,7 +161,7 @@ with gr.Blocks(css=css) as demo:
             )
             audio_input = gr.Audio(
-                label="Voice/Audio Input",
                 elem_id="audio_input",
                 type="numpy"
             )
@@ -209,9 +212,9 @@ with gr.Blocks(css=css) as demo:
                     <img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
                 </a>
             </div>
-            <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio</p>
         </div>
     """)
 # 공개 링크 생성
-demo.launch(share=True)

 os.makedirs(tmp_path, exist_ok=True)
 os.makedirs(res_path, exist_ok=True)
+@spaces.GPU(duration=600)  # 긴 비디오 처리를 위해 duration 600초로 설정 (10분)
 def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
     # ============================
+    # 1) 오디오 길이에 따라 프레임 수 계산 (최대 1분까지)
     # 2) 원본 비율 유지(크롭 제거)
     # ============================
     min_resolution = 512
+    ## 수정됨: 오디오 길이에 따라 inference_steps 계산
     audio = AudioSegment.from_file(audio_path)
     duration = len(audio) / 1000.0  # 초 단위
+    # 오디오 길이에 따라 inference_steps 계산 (초당 약 12.5 프레임)
+    # 최소 25 프레임, 최대 750 프레임 (60초 x 12.5)
+    inference_steps = min(max(int(duration * 12.5), 25), 750)
     print(f"Audio duration: {duration} seconds, using inference_steps: {inference_steps}")
     # 얼굴 인식 (face_info는 참고용)
     face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
     print(f"Face detection info: {face_info}")
+    # 얼굴이 하나라도 검출되면(>0), 원본 이미지 비율 유지
     if face_info['face_num'] > 0:
         os.makedirs(os.path.dirname(res_video_path), exist_ok=True)
         # 원본 이미지를 그대로 전달
     )
     audio_segment = audio_segment.set_frame_rate(sampling_rate)
+    # 오디오 길이 제한 확인 (최대 60초)
+    MAX_DURATION_MS = 60000  # 60초 (60,000ms)
+    if len(audio_segment) > MAX_DURATION_MS:
+        print(f"Audio longer than 60 seconds ({len(audio_segment)/1000:.2f}s). Truncating to 60 seconds.")
+        audio_segment = audio_segment[:MAX_DURATION_MS]
     # 파일 경로 생성
     image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
     audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
     gr.HTML("""
         <div class="main-header">
             <h1>🎭 Sonic: Advanced Portrait Animation</h1>
+            <p>Transform still images into dynamic videos synchronized with audio (up to 1 minute)</p>
         </div>
     """)
             )
             audio_input = gr.Audio(
+                label="Voice/Audio Input (up to 1 minute)",
                 elem_id="audio_input",
                 type="numpy"
             )
                     <img src="https://img.shields.io/badge/Paper-arXiv-red?style=for-the-badge&logo=arxiv" alt="arXiv Paper">
                 </a>
             </div>
+            <p>🔔 Note: For optimal results, use clear portrait images and high-quality audio (now supports up to 1 minute!)</p>
         </div>
     """)
 # 공개 링크 생성
+demo.launch(share=True)