import gradio as gr
from uuid import uuid4
from pydub.silence import detect_nonsilent
from pydub import AudioSegment

import torch
torch.set_num_threads(1)

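# Load the Silero VAD model and its helper utilities from torch.hub
# (force_reload=True re-downloads the repository on every startup).
# Note: get_labels() below segments audio with pydub's silence detection,
# so the loaded model is not used by that code path.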
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=True,
                              onnx=False)
 
(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = utils


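# Detect non-silent regions in the uploaded audio with pydub, write them as an
# Audacity-style label file (start<TAB>end<TAB>name, in seconds), and optionally
# merge neighbouring segments whose gap and combined length fit the limits below.
# `threshold` is the Silero VAD speech probability threshold exposed in the UI;
# it is not used by this pydub-based implementation.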
def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration_ms, auto_merge, upper_merge_threshold, max_segment_length):
    audio = AudioSegment.from_file(audio_fp)
    # detect_nonsilent expects an integer min_silence_len in milliseconds.
    speech_timestamps = detect_nonsilent(audio, min_silence_len=int(min_silence_duration_ms), silence_thresh=-40)
    speech_timestamps = list(filter(lambda x: x[1] - x[0] > min_speech_duration_ms, speech_timestamps))
    labels_str = []
    labels = []

    upper_merge_threshold = float(upper_merge_threshold)
    
    for i, st in enumerate(speech_timestamps):
        # detect_nonsilent yields [start_ms, end_ms] pairs; convert to seconds so
        # the labels match Audacity's format and the seconds-based merge settings.
        start_s, end_s = st[0] / 1000.0, st[1] / 1000.0
        labels_str.append(f"{start_s}\t{end_s}\tSound {i+1}")
        labels.append((start_s, end_s, f"Sound {i+1}"))
        
    fn = str(uuid4()) + ".txt"
    with open(fn, "w") as f:
        f.write("\n".join(labels_str))

    if not auto_merge or not labels:
        return fn, None

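    # Merge pass: fold a segment into the previous one when the silence gap
    # between them is at most `upper_merge_threshold` seconds and the merged
    # segment stays under `max_segment_length` seconds.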
    gaps = [labels[i][0] - labels[i - 1][1] for i in range(1, len(labels))]

    duration = lambda x: float(x[1]) - float(x[0])

    new_labels = [list(labels[0])]
    for i in range(1, len(labels)):
        if (
            gaps[i - 1] <= upper_merge_threshold
            and duration(new_labels[-1]) + gaps[i - 1] + duration(labels[i])
            < max_segment_length
        ):
            new_labels[-1][1] = labels[i][1]
            new_labels[-1][2] = f"{new_labels[-1][2]} |{round(gaps[i-1], 2)}s| {labels[i][2]}"
        else:
            new_labels.append(list(labels[i]))

    translate_labels = list(map(lambda x: f"{x[0]}\t{x[1]}\t{x[2]}", new_labels))

    filename_path = f"{fn}_translate_label.txt"
    with open(filename_path, "w") as f:
        f.write("\n".join(translate_labels))
    
    return fn, filename_path


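# Gradio UI: an audio file plus segmentation/merge settings as inputs, the raw
# label file and the optional merged label file as outputs.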
interface = gr.Interface(
    get_labels,
    [
        gr.Audio(type="filepath", label="Audio file"), 
        gr.Slider(0, 1, value=0.7, label="Threshold", step=0.01, info="default (0.5)"), 
        gr.Number(label="min_speech_duration_ms", value=250, info="default (250)"), 
        gr.Number(label="min_silence_duration_ms", value=40, info="default (100)"),
        gr.Checkbox(label="Auto merge", value=True),
        gr.Textbox(label="Gap max threshold value (seconds)", value=0.3),
        gr.Number(label="Approx Max Segment Length", value=5)
    ],
    [
        gr.File(label="VAD Labels"),
        gr.File(label="Merged Labels File")
    ]
)

if __name__ == "__main__":
    interface.queue().launch()