import gradio as gr
from uuid import uuid4
import torch

# Keep torch single-threaded; VAD inference is light and this avoids
# oversubscribing the CPU on a shared Space.
torch.set_num_threads(1)
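# Load the Silero VAD model from torch.hub. force_reload=True re-downloads the
# repo on every startup; dropping it would reuse the local hub cache.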
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=False)
(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils
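# Of the bundled utils, only get_speech_timestamps and read_audio are used
# below; the rest come along with the standard silero-vad unpacking.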
def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration_ms, auto_merge, upper_merge_threshold, max_segment_length):
    # Silero VAD expects 16 kHz mono; read_audio loads and resamples the file.
    wav = read_audio(audio_fp, sampling_rate=16000)
    # return_seconds=True yields dicts like {'start': 0.5, 'end': 3.2} with
    # times in seconds, the unit Audacity label tracks expect.
    speech_timestamps = get_speech_timestamps(
        wav,
        model,
        threshold=threshold,
        min_speech_duration_ms=int(min_speech_duration_ms),
        min_silence_duration_ms=int(min_silence_duration_ms),
        return_seconds=True,
    )
    labels_str = []
    labels = []
    upper_merge_threshold = float(upper_merge_threshold)
    for i, st in enumerate(speech_timestamps):
        labels_str.append(f"{st['start']}\t{st['end']}\tSound {i+1}")
        labels.append((float(st['start']), float(st['end']), f"Sound {i+1}"))
    fn = str(uuid4()) + ".txt"
    with open(fn, "w") as f:
        f.write("\n".join(labels_str))
    if not auto_merge or not labels:
        return fn, None
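    # Greedy merge: walk the segments left to right and absorb the next one
    # into the current segment whenever the silence gap between them is at
    # most upper_merge_threshold and the merged span stays under
    # max_segment_length (both in seconds).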
    gaps = [labels[i][0] - labels[i - 1][1] for i in range(1, len(labels))]
    duration = lambda x: float(x[1]) - float(x[0])
    new_labels = [list(labels[0])]
    for i in range(1, len(labels)):
        if (
            gaps[i - 1] <= upper_merge_threshold
            and duration(new_labels[-1]) + gaps[i - 1] + duration(labels[i])
            < max_segment_length
        ):
            # Extend the current segment and record the absorbed gap in its label.
            new_labels[-1][1] = labels[i][1]
            new_labels[-1][2] = f'{new_labels[-1][2]} |{round(gaps[i-1], 2)}s| {labels[i][2]}'
        else:
            new_labels.append(list(labels[i]))
    merged_labels = list(map(lambda x: f"{x[0]}\t{x[1]}\t{x[2]}", new_labels))
    merged_path = fn.replace(".txt", "_merged.txt")
    with open(merged_path, "w") as f:
        f.write("\n".join(merged_labels))
    return fn, merged_path
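# Both files use the tab-separated Audacity label format, e.g. (illustrative
# values):
#   0.5	3.2	Sound 1
#   4.1	9.8	Sound 2 |0.4s| Sound 3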
interface = gr.Interface(
    get_labels,
    [
        gr.Audio(type="filepath", label="Audio file"),
        gr.Slider(0, 1, value=0.7, label="Threshold", step=0.01, info="Speech probability threshold (Silero default: 0.5)"),
        gr.Number(label="min_speech_duration_ms", value=250, info="default (250)"),
        gr.Number(label="min_silence_duration_ms", value=40, info="default (100)"),
        gr.Checkbox(label="Auto merge", value=True),
        gr.Textbox(label="Gap max threshold value (seconds)", value="0.3"),
        gr.Number(label="Approx Max Segment Length (seconds)", value=5)
    ],
    [
        gr.File(label="VAD Labels"),
        gr.File(label="Merged Labels File")
    ]
)
if __name__ == "__main__":
    interface.queue().launch()
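# Usage note: running `python app.py` locally serves the UI at
# http://127.0.0.1:7860 by default; the output label files can be imported
# into Audacity via File > Import > Labels.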