roychao19477 committed on
Commit 138c0ad · 1 Parent(s): f9f66c4

Update module

Files changed (1): app.py (+71 -2)
app.py CHANGED
@@ -3,6 +3,8 @@ import subprocess
 import spaces
 import torch
 import os
+import shutil
+import glob
 import gradio as gr
 
 # install packages for mamba
@@ -16,6 +18,15 @@ def clone_github():
         "git", "clone",
         f"https://RoyChao19477:{os.environ['GITHUB_TOKEN']}@github.com/RoyChao19477/for_HF_AVSEMamba.git"
     ])
+    # move all files except README.md
+    for item in glob.glob("tmp_repo/*"):
+        if os.path.basename(item) != "README.md":
+            if os.path.isdir(item):
+                shutil.move(item, ".")
+            else:
+                shutil.move(item, os.path.join(".", os.path.basename(item)))
+
+    shutil.rmtree("tmp_repo")
 
 install_mamba()
 clone_github()
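Two notes on the move loop above. glob.glob("tmp_repo/*") does not match dotfiles, so hidden entries such as .git are never moved and are simply discarded by the final shutil.rmtree. Also, the git clone call in the hunk's context shows no target-directory argument (the hunk starts mid-call), while the loop reads from tmp_repo, so the target is presumably passed outside the shown context. A minimal sketch of the clone with an explicit target, consistent with the loop; the "tmp_repo" argument is an assumption, not visible in the diff:

import os
import subprocess

def clone_github():
    # Assumed target directory "tmp_repo"; the diff's context cuts off
    # before any such argument, so this is a sketch, not the repo's code.
    subprocess.run([
        "git", "clone",
        f"https://RoyChao19477:{os.environ['GITHUB_TOKEN']}@github.com/RoyChao19477/for_HF_AVSEMamba.git",
        "tmp_repo",
    ], check=True)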
@@ -54,6 +65,46 @@ from moviepy import ImageSequenceClip
 # Load face detector
 model = YOLO("yolov8n-face.pt").cuda() # assumes CUDA available
 
+
+from decord import VideoReader, cpu
+from model import AVSEModule
+from config import sampling_rate
+import spaces
+
+# Load model once globally
+ckpt_path = "ckpts/ep215_0906.oat.ckpt"
+model = AVSEModule.load_from_checkpoint(ckpt_path)
+model.to("cuda")
+model.eval()
+
+@spaces.GPU
+def run_avse_inference(video_path, audio_path):
+    # Load audio
+    noisy, _ = sf.read(audio_path, dtype='float32')  # (N, )
+    noisy = torch.tensor(noisy).unsqueeze(0)  # (1, N)
+
+    # Load grayscale video
+    vr = VideoReader(video_path, ctx=cpu(0))
+    frames = vr.get_batch(list(range(len(vr)))).asnumpy()
+    bg_frames = np.array([cv2.cvtColor(f, cv2.COLOR_RGB2GRAY) for f in frames]).astype(np.float32) / 255.0
+    bg_frames = torch.tensor(bg_frames).unsqueeze(0).unsqueeze(0)  # (1, 1, T, H, W)
+
+    # Combine into input dict (match what model.enhance expects)
+    data = {
+        "noisy_audio": noisy,
+        "video_frames": bg_frames
+    }
+
+    with torch.no_grad():
+        estimated = model.enhance(data).reshape(-1).cpu().numpy()
+
+    # Save result
+    tmp_wav = audio_path.replace(".wav", "_enhanced.wav")
+    sf.write(tmp_wav, estimated, samplerate=sampling_rate)
+
+    return tmp_wav
+
+
 def extract_resampled_audio(video_path, target_sr=16000):
     # Step 1: extract audio via torchaudio
     # (moviepy will still extract it to wav temp file)
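Notes on the added block: sf (soundfile), np (numpy), and cv2 are used but not imported in the hunks shown, so they presumably come from imports earlier in app.py; the @spaces.GPU decorator requests GPU hardware for the call on ZeroGPU Spaces; and model = AVSEModule.load_from_checkpoint(ckpt_path) rebinds the module-level name model that the context line above bound to the YOLO face detector, so any later code that reads model for face detection will now see the AVSE model instead. A hypothetical usage sketch, assuming a face-only MP4 and a 16 kHz mono WAV are already on disk; the file names are illustrative, not from the repo:

# Illustrative inputs; in this app, extract_faces() is what produces them.
enhanced_wav = run_avse_inference("face_only.mp4", "audio_16k.wav")
print(enhanced_wav)  # e.g. "audio_16k_enhanced.wav"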
@@ -127,14 +178,32 @@ def extract_faces(video_file):
     ).run(overwrite_output=True)
 
 
-    return output_path, audio_path
+
+
+    # ------------------------------- #
+    # AVSE models
+    noisy = self.load_wav(audio_path)
+
+    vr = VideoReader(output_path, ctx=cpu(0))
+    frames = vr.get_batch(list(range(len(vr)))).asnumpy()
+    bg_frames = np.array([
+        cv2.cvtColor(frames[i], cv2.COLOR_RGB2GRAY) for i in range(len(frames))
+    ]).astype(np.float32)
+    bg_frames /= 255.0
+
+    enhanced_audio_path = run_avse_inference(output_path, audio_path)
+
+
+    return output_path, enhanced_audio_path
+    #return output_path, audio_path
 
 iface = gr.Interface(
     fn=extract_faces,
     inputs=gr.Video(label="Upload or record your video"),
     outputs=[
         gr.Video(label="Detected Face Only Video"),
-        gr.Audio(label="Extracted Audio (16kHz)", type="filepath"),
+        #gr.Audio(label="Extracted Audio (16kHz)", type="filepath"),
+        gr.Audio(label="Enhanced Audio", type="filepath")
     ],
     title="Face Detector",
     description="Upload or record a video. We'll crop face regions and return a face-only video and its 16kHz audio."
 