Spaces:
Running
on
Zero
Running
on
Zero
Fix audio error in video speaker separation
Browse files - gradio_app.py +25 -6
gradio_app.py
CHANGED
@@ -104,8 +104,12 @@ def separate_dnr_video(video_path):
|
|
104 |
|
105 |
@spaces.GPU()
|
106 |
def separate_speakers_video(video_path):
|
107 |
-
|
|
|
|
|
|
|
108 |
|
|
|
109 |
waveform, original_sr = torchaudio.load(audio_path)
|
110 |
if original_sr != TARGET_SR:
|
111 |
waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)
|
@@ -114,20 +118,34 @@ def separate_speakers_video(video_path):
|
|
114 |
waveform = waveform.unsqueeze(0)
|
115 |
audio_input = waveform.unsqueeze(0).to(device)
|
116 |
|
|
|
117 |
with torch.no_grad():
|
118 |
ests_speech = sep_model(audio_input).squeeze(0)
|
119 |
|
|
|
120 |
session_id = uuid.uuid4().hex[:8]
|
121 |
output_dir = os.path.join("output_sep_video", session_id)
|
122 |
os.makedirs(output_dir, exist_ok=True)
|
123 |
|
124 |
-
|
125 |
for i in range(ests_speech.shape[0]):
|
126 |
-
|
127 |
-
audio_np
|
128 |
-
|
129 |
-
output_files.append(path)
|
130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
updates = []
|
132 |
for i in range(MAX_SPEAKERS):
|
133 |
if i < len(output_videos):
|
@@ -136,6 +154,7 @@ def separate_speakers_video(video_path):
|
|
136 |
updates.append(gr.update(value=None, visible=False))
|
137 |
return updates
|
138 |
|
|
|
139 |
# --- Gradio UI ---
|
140 |
with gr.Blocks() as demo:
|
141 |
gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
|
|
|
104 |
|
105 |
@spaces.GPU()
|
106 |
def separate_speakers_video(video_path):
|
107 |
+
# Extract audio
|
108 |
+
video = VideoFileClip(video_path)
|
109 |
+
audio_path = f"/tmp/{uuid.uuid4().hex}_audio.wav"
|
110 |
+
video.audio.write_audiofile(audio_path, fps=TARGET_SR, verbose=False, logger=None)
|
111 |
|
112 |
+
# Load and resample
|
113 |
waveform, original_sr = torchaudio.load(audio_path)
|
114 |
if original_sr != TARGET_SR:
|
115 |
waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)
|
|
|
118 |
waveform = waveform.unsqueeze(0)
|
119 |
audio_input = waveform.unsqueeze(0).to(device)
|
120 |
|
121 |
+
# Inference
|
122 |
with torch.no_grad():
|
123 |
ests_speech = sep_model(audio_input).squeeze(0)
|
124 |
|
125 |
+
# Output directory
|
126 |
session_id = uuid.uuid4().hex[:8]
|
127 |
output_dir = os.path.join("output_sep_video", session_id)
|
128 |
os.makedirs(output_dir, exist_ok=True)
|
129 |
|
130 |
+
output_videos = []
|
131 |
for i in range(ests_speech.shape[0]):
|
132 |
+
audio_np = ests_speech[i].cpu().numpy()
|
133 |
+
if audio_np.ndim == 1:
|
134 |
+
audio_np = audio_np[:, None] # Ensure shape [samples, 1]
|
|
|
135 |
|
136 |
+
# Save separated audio
|
137 |
+
separated_audio_path = os.path.join(output_dir, f"speaker_{i+1}.wav")
|
138 |
+
sf.write(separated_audio_path, audio_np, TARGET_SR)
|
139 |
+
|
140 |
+
# Combine with original video (no original audio)
|
141 |
+
output_video_path = os.path.join(output_dir, f"speaker_{i+1}_video.mp4")
|
142 |
+
new_audio = AudioFileClip(separated_audio_path)
|
143 |
+
new_video = video.set_audio(new_audio)
|
144 |
+
new_video.write_videofile(output_video_path, audio_codec="aac", verbose=False, logger=None)
|
145 |
+
|
146 |
+
output_videos.append(output_video_path)
|
147 |
+
|
148 |
+
# Pad with empty videos if less than MAX_SPEAKERS
|
149 |
updates = []
|
150 |
for i in range(MAX_SPEAKERS):
|
151 |
if i < len(output_videos):
|
|
|
154 |
updates.append(gr.update(value=None, visible=False))
|
155 |
return updates
|
156 |
|
157 |
+
|
158 |
# --- Gradio UI ---
|
159 |
with gr.Blocks() as demo:
|
160 |
gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
|