roychao19477 committed
Commit dba6227 · 1 Parent(s): 5a2e862

Upload to debug

Files changed (1)
  1. app.py +31 -8
app.py CHANGED
@@ -82,6 +82,9 @@ avse_model.load_state_dict(avse_state_dict, strict=True)
 avse_model.to("cuda")
 avse_model.eval()
 
+CHUNK_SIZE_AUDIO = 48000  # 3 sec at 16kHz
+CHUNK_SIZE_VIDEO = 75     # 25fps × 3 sec
+
 @spaces.GPU
 def run_avse_inference(video_path, audio_path):
     estimated = run_avse(video_path, audio_path)
@@ -101,19 +104,39 @@ def run_avse_inference(video_path, audio_path):
     ]).astype(np.float32)
     bg_frames /= 255.0
 
-    print(noisy.shape)
-    print(bg_frames.shape)
-    fesfse
+    audio_chunks = [
+        noisy[i:i + CHUNK_SIZE_AUDIO]
+        for i in range(0, len(noisy), CHUNK_SIZE_AUDIO)
+    ]
+
+    video_chunks = [
+        bg_frames[i:i + CHUNK_SIZE_VIDEO]
+        for i in range(0, len(bg_frames), CHUNK_SIZE_VIDEO)
+    ]
+
+    min_len = min(len(audio_chunks), len(video_chunks))  # sync length
 
 
     # Combine into input dict (match what model.enhance expects)
-    data = {
-        "noisy_audio": noisy,
-        "video_frames": bg_frames[np.newaxis, ...]
-    }
+    #data = {
+    #    "noisy_audio": noisy,
+    #    "video_frames": bg_frames[np.newaxis, ...]
+    #}
+
+    #with torch.no_grad():
+    #    estimated = avse_model.enhance(data).reshape(-1)
+    estimated_chunks = []
 
     with torch.no_grad():
-        estimated = avse_model.enhance(data).reshape(-1)
+        for i in range(min_len):
+            chunk_data = {
+                "noisy_audio": audio_chunks[i],
+                "video_frames": video_chunks[i][np.newaxis, ...]
+            }
+            est = avse_model.enhance(chunk_data).reshape(-1)
+            estimated_chunks.append(est)
+
+    estimated = torch.cat(estimated_chunks).cpu().numpy()
 
     # Save result
     tmp_wav = audio_path.replace(".wav", "_enhanced.wav")
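For context, here is a minimal standalone sketch of the chunking scheme this commit introduces. The sample rate (16 kHz) and frame rate (25 fps) are assumptions taken from the inline comments, and the 88×88 frame size and 10-second dummy inputs are hypothetical; the model call is omitted so only the audio/video chunk alignment is shown.

import numpy as np

SR = 16000                  # assumed audio sample rate, per the "3 sec at 16kHz" comment
FPS = 25                    # assumed video frame rate, per the "25fps × 3 sec" comment
CHUNK_SIZE_AUDIO = 3 * SR   # 48000 samples = 3 seconds of audio
CHUNK_SIZE_VIDEO = 3 * FPS  # 75 frames = 3 seconds of video

# Dummy 10-second inputs standing in for `noisy` and `bg_frames` in app.py
# (the 88x88 frame size is a placeholder, not taken from the repo).
noisy = np.zeros(10 * SR, dtype=np.float32)
bg_frames = np.zeros((10 * FPS, 88, 88), dtype=np.float32)

audio_chunks = [noisy[i:i + CHUNK_SIZE_AUDIO]
                for i in range(0, len(noisy), CHUNK_SIZE_AUDIO)]
video_chunks = [bg_frames[i:i + CHUNK_SIZE_VIDEO]
                for i in range(0, len(bg_frames), CHUNK_SIZE_VIDEO)]

# zip() stops at the shorter list, which is what min_len does in the commit:
# chunk i of audio and chunk i of video cover the same 3-second window, and a
# trailing chunk present in only one stream is dropped.
for a, v in zip(audio_chunks, video_chunks):
    print(a.shape, v.shape)  # e.g. (48000,) and (75, 88, 88); the last pair may be shorter

The chunks are cut on hard 3-second boundaries with no overlap, so the final pair can be shorter than 3 seconds, and the closing torch.cat in the commit assumes avse_model.enhance returns a torch tensor for each chunk.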