Dokdo-multimodal

Building

App Files Files Community

aiqcamp committed on Dec 22, 2024

Commit

9b8d878

verified ·

1 Parent(s): b581974

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -17

app.py CHANGED Viewed

@@ -68,11 +68,14 @@ net, feature_utils, seq_cfg = get_model()
 @spaces.GPU(duration=60)
 @torch.inference_mode()
 def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
-                   seed: int = -1, num_steps: int = 25,
-                   cfg_strength: float = 4.5, target_duration: float = 8.0):
     try:
         logger.info("Starting audio generation process")
         rng = torch.Generator(device=device)
         if seed >= 0:
             rng.manual_seed(seed)
@@ -81,9 +84,8 @@ def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
         fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
-        # video_info = load_video(video_path, duration) 대신:
-        kwargs = {'static_duration': target_duration}
-        video_info = load_video(video_path, **kwargs)
         if video_info is None:
             logger.error("Failed to load video")
@@ -97,14 +99,13 @@ def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
             logger.error("Failed to extract frames from video")
             return video_path
-        clip_frames = clip_frames.unsqueeze(0).to(device)
-        sync_frames = sync_frames.unsqueeze(0).to(device)
-        # 시퀀스 길이 업데이트
         seq_cfg.duration = actual_duration
         net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-        # 오디오 생성
         logger.info("Generating audio...")
         audios = generate(clip_frames,
                          sync_frames,
@@ -122,14 +123,16 @@ def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
         audio = audios.float().cpu()[0]
-        # 결과 비디오 생성
         output_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
         logger.info(f"Creating final video with audio at {output_path}")
-        make_video(video_info, output_path, audio, sampling_rate=seq_cfg.sampling_rate)
-        if not os.path.exists(output_path):
-            logger.error("Failed to create output video")
             return video_path
         logger.info(f'Successfully saved video with audio to {output_path}')
@@ -137,7 +140,8 @@ def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
     except Exception as e:
         logger.error(f"Error in video_to_audio: {str(e)}")
-        return video_path  # 오류 발생 시 원본 비디오 반환
 def upload_to_catbox(file_path):
     """catbox.moe API를 사용하여 파일 업로드"""
@@ -357,14 +361,13 @@ def generate_video(image, prompt):
                             prompt=prompt,
                             negative_prompt="music",
                             seed=-1,
-                            num_steps=25,
                             cfg_strength=4.5,
-                            target_duration=8.0  # duration을 target_duration으로 변경
                         )
                         if final_path_with_audio != final_path:
                             logger.info("Audio generation successful")
-                            # 임시 파일 정리
                             try:
                                 if output_path != final_path:
                                     os.remove(output_path)

 @spaces.GPU(duration=60)
 @torch.inference_mode()
 def video_to_audio(video_path: str, prompt: str, negative_prompt: str = "music",
+                   seed: int = -1, num_steps: int = 20,
+                   cfg_strength: float = 4.5, target_duration: float = 6.0):
     try:
         logger.info("Starting audio generation process")
+        # GPU 메모리 최적화
+        torch.cuda.empty_cache()
         rng = torch.Generator(device=device)
         if seed >= 0:
             rng.manual_seed(seed)
         fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
+        # load_video 함수 호출 수정
+        video_info = load_video(video_path, duration=target_duration)  # static_duration을 duration으로 변경
         if video_info is None:
             logger.error("Failed to load video")
             logger.error("Failed to extract frames from video")
             return video_path
+        # 메모리 효율을 위해 배치 크기 조정
+        clip_frames = clip_frames.unsqueeze(0).to(device, dtype=torch.float16)
+        sync_frames = sync_frames.unsqueeze(0).to(device, dtype=torch.float16)
         seq_cfg.duration = actual_duration
         net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
         logger.info("Generating audio...")
         audios = generate(clip_frames,
                          sync_frames,
         audio = audios.float().cpu()[0]
         output_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
         logger.info(f"Creating final video with audio at {output_path}")
+        success = make_video(video_info, output_path, audio, sampling_rate=seq_cfg.sampling_rate)
+        # GPU 메모리 정리
+        torch.cuda.empty_cache()
+        if not success:
+            logger.error("Failed to create video with audio")
             return video_path
         logger.info(f'Successfully saved video with audio to {output_path}')
     except Exception as e:
         logger.error(f"Error in video_to_audio: {str(e)}")
+        torch.cuda.empty_cache()
+        return video_path
 def upload_to_catbox(file_path):
     """catbox.moe API를 사용하여 파일 업로드"""
                             prompt=prompt,
                             negative_prompt="music",
                             seed=-1,
+                            num_steps=20,
                             cfg_strength=4.5,
+                            target_duration=6.0
                         )
                         if final_path_with_audio != final_path:
                             logger.info("Audio generation successful")
                             try:
                                 if output_path != final_path:
                                     os.remove(output_path)