Spaces:
Sleeping
Sleeping
roychao19477
committed on
Commit
·
678d466
1
Parent(s):
6e97a1b
Version revise
Browse files
app.py
CHANGED
@@ -103,46 +103,13 @@ def run_avse_inference(video_path, audio_path):
|
|
103 |
|
104 |
|
105 |
# Combine into input dict (match what model.enhance expects)
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
|
111 |
with torch.no_grad():
|
112 |
-
|
113 |
-
#estimated = avse_model.enhance(data).reshape(-1)
|
114 |
-
# Version 2
|
115 |
-
chunk_sec = 6
|
116 |
-
sr = 16000
|
117 |
-
audio_chunk_len = chunk_sec * sr # 48000
|
118 |
-
video_chunk_len = chunk_sec * 25 # 75
|
119 |
-
|
120 |
-
estimated_chunks = []
|
121 |
-
|
122 |
-
for i in range(0, len(noisy), audio_chunk_len):
|
123 |
-
audio_chunk = noisy[i:i+audio_chunk_len]
|
124 |
-
if len(audio_chunk) < audio_chunk_len:
|
125 |
-
pad = np.zeros(audio_chunk_len - len(audio_chunk), dtype=audio_chunk.dtype)
|
126 |
-
audio_chunk = np.concatenate([audio_chunk, pad])
|
127 |
-
|
128 |
-
vid_idx = i // sr * 25 # convert audio index to video frame index
|
129 |
-
#video_chunk = bg_frames[0, vid_idx:vid_idx+video_chunk_len, :, :]
|
130 |
-
video_chunk = bg_frames[vid_idx:vid_idx+video_chunk_len, :, :]
|
131 |
-
if video_chunk.shape[0] < video_chunk_len:
|
132 |
-
pad = np.zeros((video_chunk_len - video_chunk.shape[0], *video_chunk.shape[1:]), dtype=video_chunk.dtype)
|
133 |
-
video_chunk = np.concatenate([video_chunk, pad], axis=0)
|
134 |
-
|
135 |
-
data = {
|
136 |
-
"noisy_audio": audio_chunk,
|
137 |
-
"video_frames": video_chunk[np.newaxis, ...]
|
138 |
-
}
|
139 |
-
|
140 |
-
with torch.no_grad():
|
141 |
-
out = avse_model.enhance(data).reshape(-1)
|
142 |
-
estimated_chunks.append(out)
|
143 |
-
|
144 |
-
estimated = np.concatenate(estimated_chunks)[:len(noisy)]
|
145 |
-
|
146 |
|
147 |
# Save result
|
148 |
tmp_wav = audio_path.replace(".wav", "_enhanced.wav")
|
|
|
103 |
|
104 |
|
105 |
# Combine into input dict (match what model.enhance expects)
|
106 |
+
data = {
|
107 |
+
"noisy_audio": noisy,
|
108 |
+
"video_frames": bg_frames[np.newaxis, ...]
|
109 |
+
}
|
110 |
|
111 |
with torch.no_grad():
|
112 |
+
estimated = avse_model.enhance(data).reshape(-1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
# Save result
|
115 |
tmp_wav = audio_path.replace(".wav", "_enhanced.wav")
|