fix audio error on video speaker sep
gradio_app.py CHANGED (+25 -6)
@@ -104,8 +104,12 @@ def separate_dnr_video(video_path):
 
 @spaces.GPU()
 def separate_speakers_video(video_path):
-    …
+    # Extract audio
+    video = VideoFileClip(video_path)
+    audio_path = f"/tmp/{uuid.uuid4().hex}_audio.wav"
+    video.audio.write_audiofile(audio_path, fps=TARGET_SR, verbose=False, logger=None)
 
+    # Load and resample
     waveform, original_sr = torchaudio.load(audio_path)
     if original_sr != TARGET_SR:
         waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)
@@ -114,20 +118,34 @@ def separate_speakers_video(video_path):
     waveform = waveform.unsqueeze(0)
     audio_input = waveform.unsqueeze(0).to(device)
 
+    # Inference
     with torch.no_grad():
         ests_speech = sep_model(audio_input).squeeze(0)
 
+    # Output directory
     session_id = uuid.uuid4().hex[:8]
     output_dir = os.path.join("output_sep_video", session_id)
     os.makedirs(output_dir, exist_ok=True)
 
-    …
+    output_videos = []
     for i in range(ests_speech.shape[0]):
-        …
-        audio_np …
-        …
-        output_files.append(path)
+        audio_np = ests_speech[i].cpu().numpy()
+        if audio_np.ndim == 1:
+            audio_np = audio_np[:, None]  # Ensure shape [samples, 1]
 
+        # Save separated audio
+        separated_audio_path = os.path.join(output_dir, f"speaker_{i+1}.wav")
+        sf.write(separated_audio_path, audio_np, TARGET_SR)
+
+        # Combine with original video (no original audio)
+        output_video_path = os.path.join(output_dir, f"speaker_{i+1}_video.mp4")
+        new_audio = AudioFileClip(separated_audio_path)
+        new_video = video.set_audio(new_audio)
+        new_video.write_videofile(output_video_path, audio_codec="aac", verbose=False, logger=None)
+
+        output_videos.append(output_video_path)
+
+    # Pad with empty videos if less than MAX_SPEAKERS
     updates = []
     for i in range(MAX_SPEAKERS):
         if i < len(output_videos):
@@ -136,6 +154,7 @@ def separate_speakers_video(video_path):
         updates.append(gr.update(value=None, visible=False))
     return updates
 
+
 # --- Gradio UI ---
 with gr.Blocks() as demo:
     gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
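The patched function now routes the soundtrack through an explicit wav file before torchaudio ever touches it. A standalone sketch of that extract-load-resample path, assuming moviepy 1.x, with "input.mp4" and TARGET_SR = 16000 as placeholders for the app's actual input and constant:

    import uuid

    import torchaudio
    import torchaudio.transforms as T
    from moviepy.editor import VideoFileClip

    TARGET_SR = 16000  # placeholder for the constant defined in gradio_app.py

    video = VideoFileClip("input.mp4")  # placeholder input
    audio_path = f"/tmp/{uuid.uuid4().hex}_audio.wav"
    # fps sets the wav's sample rate, so the resample below is normally a no-op
    video.audio.write_audiofile(audio_path, fps=TARGET_SR, verbose=False, logger=None)

    waveform, original_sr = torchaudio.load(audio_path)  # [channels, samples]
    if original_sr != TARGET_SR:  # defensive guard, mirroring the patch
        waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)

Requesting fps=TARGET_SR at extraction time makes the resample branch a safety net rather than the main path.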
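The audio_np[:, None] guard matches soundfile's layout convention: sf.write treats a 1-D array as mono and a 2-D array as (frames, channels), so a tensor handed over the wrong way around would be misread as thousands of channels. A small illustration with made-up sizes:

    import numpy as np
    import soundfile as sf

    sr = 16000
    mono = np.zeros(sr, dtype=np.float32)     # one second of silence, shape (16000,)

    sf.write("ok_1d.wav", mono, sr)           # 1-D is written as mono
    sf.write("ok_2d.wav", mono[:, None], sr)  # (frames, 1): explicit mono column
    # mono[None, :] has shape (1, 16000) and would be read as 16000 channels,
    # which libsndfile rejects.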
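The mux step leans on two moviepy behaviours: set_audio returns a copy rather than mutating the clip, which is why one VideoFileClip can be reused across all speakers, and mp4 output needs an explicit audio codec to stay broadly playable. A reduced sketch, with the file names as placeholders:

    from moviepy.editor import AudioFileClip, VideoFileClip

    video = VideoFileClip("input.mp4")          # placeholder paths
    new_audio = AudioFileClip("speaker_1.wav")

    dubbed = video.set_audio(new_audio)  # copy with replaced soundtrack; `video` is unchanged
    dubbed.write_videofile("speaker_1_video.mp4", audio_codec="aac",
                           verbose=False, logger=None)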
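Since the function returns exactly MAX_SPEAKERS gr.update objects and hides the unused tail, the UI side presumably binds it to a fixed list of video components. A hypothetical wiring, with the component names and the MAX_SPEAKERS value invented for illustration:

    import gradio as gr

    MAX_SPEAKERS = 4  # assumption; the real constant lives in gradio_app.py

    with gr.Blocks() as demo:
        video_in = gr.Video(label="Input video")
        separate_btn = gr.Button("Separate speakers")
        # One fixed slot per potential speaker; the padding updates
        # (gr.update(value=None, visible=False)) keep extras hidden.
        video_outs = [gr.Video(visible=False) for _ in range(MAX_SPEAKERS)]
        separate_btn.click(separate_speakers_video, inputs=video_in,
                           outputs=video_outs)

    demo.launch()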