deepsync committed on
Commit
cfc2b12
·
verified ·
1 Parent(s): 5456318

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -11
app.py CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
2
  import os
3
  import torchaudio
4
  from uuid import uuid4
 
 
5
 
6
  import torch
7
  torch.set_num_threads(1)
@@ -19,15 +21,9 @@ model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
19
 
20
 
21
  def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration_ms, auto_merge, uppper_merge_threshold, max_segment_length):
22
- wav, sr = torchaudio.load(audio_fp)
23
- wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=16000)
24
- speech_timestamps = get_speech_timestamps(wav,
25
- model,
26
- sampling_rate=16000,
27
- threshold=threshold,
28
- min_speech_duration_ms=min_speech_duration_ms,
29
- min_silence_duration_ms=min_silence_duration_ms,
30
- return_seconds=True)
31
  labels_str = []
32
  labels = []
33
 
@@ -79,8 +75,8 @@ interface = gr.Interface(
79
  gr.Number(label="min_speech_duration_ms", value=250, info="default (250)"),
80
  gr.Number(label="min_silence_duration_ms", value=40, info="default (100)"),
81
  gr.Checkbox(label="Auto merge", value=True),
82
- gr.Textbox(label="Gap max threshold value (seconds)", value=0.7),
83
- gr.Number(label="Approx Max Segment Length", value=8)
84
  ],
85
  [
86
  gr.File(label="VAD Labels"),
 
2
  import os
3
  import torchaudio
4
  from uuid import uuid4
5
+ from pydub.silence import detect_nonsilent
6
+ from pydub import AudioSegment
7
 
8
  import torch
9
  torch.set_num_threads(1)
 
21
 
22
 
23
  def get_labels(audio_fp, threshold, min_speech_duration_ms, min_silence_duration_ms, auto_merge, uppper_merge_threshold, max_segment_length):
24
+ audio = AudioSegment.from_file(audio_fp)
25
+ speech_timestamps = detect_nonsilent(audio, min_silence_len=min_silence_duration_ms, silence_thresh=-40)
26
+ speech_timestamps = list(filter(lambda x: x[1]-x[0] > min_speech_duration_ms, speech_timestamps))
 
 
 
 
 
 
27
  labels_str = []
28
  labels = []
29
 
 
75
  gr.Number(label="min_speech_duration_ms", value=250, info="default (250)"),
76
  gr.Number(label="min_silence_duration_ms", value=40, info="default (100)"),
77
  gr.Checkbox(label="Auto merge", value=True),
78
+ gr.Textbox(label="Gap max threshold value (seconds)", value=0.3),
79
+ gr.Number(label="Approx Max Segment Length", value=5)
80
  ],
81
  [
82
  gr.File(label="VAD Labels"),