Spaces:
Sleeping
Sleeping
jhj0517
committed on
Commit
·
074b1fc
1
Parent(s):
66195c7
add Silero VAD Options
Browse files
app.py
CHANGED
|
@@ -60,8 +60,15 @@ class App:
|
|
| 60 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
| 61 |
with gr.Row():
|
| 62 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
with gr.Accordion("Advanced_Parameters", open=False):
|
| 64 |
-
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
| 65 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 66 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 67 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
@@ -93,7 +100,14 @@ class App:
|
|
| 93 |
initial_prompt=tb_initial_prompt,
|
| 94 |
temperature=sd_temperature,
|
| 95 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
| 96 |
-
vad_filter=cb_vad_filter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
| 98 |
inputs=params + whisper_params.to_list(),
|
| 99 |
outputs=[tb_indicator, files_subtitles])
|
|
@@ -120,8 +134,15 @@ class App:
|
|
| 120 |
with gr.Row():
|
| 121 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
| 122 |
interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
with gr.Accordion("Advanced_Parameters", open=False):
|
| 124 |
-
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
| 125 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 126 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 127 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
@@ -153,7 +174,13 @@ class App:
|
|
| 153 |
initial_prompt=tb_initial_prompt,
|
| 154 |
temperature=sd_temperature,
|
| 155 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
| 156 |
-
vad_filter=cb_vad_filter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
| 158 |
inputs=params + whisper_params.to_list(),
|
| 159 |
outputs=[tb_indicator, files_subtitles])
|
|
@@ -173,8 +200,15 @@ class App:
|
|
| 173 |
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
| 174 |
with gr.Row():
|
| 175 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
with gr.Accordion("Advanced_Parameters", open=False):
|
| 177 |
-
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
| 178 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 179 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 180 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
@@ -205,7 +239,13 @@ class App:
|
|
| 205 |
initial_prompt=tb_initial_prompt,
|
| 206 |
temperature=sd_temperature,
|
| 207 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
| 208 |
-
vad_filter=cb_vad_filter
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
| 210 |
inputs=params + whisper_params.to_list(),
|
| 211 |
outputs=[tb_indicator, files_subtitles])
|
|
|
|
| 60 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
| 61 |
with gr.Row():
|
| 62 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
| 63 |
+
with gr.Accordion("Silero VAD Options", open=False):
|
| 64 |
+
cb_vad_filter = gr.Checkbox(label="Enable VAD Filter", value=False, interactive=True)
|
| 65 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
| 66 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
| 67 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
| 68 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
| 69 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
| 70 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
| 71 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
|
| 72 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 73 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 74 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
|
| 100 |
initial_prompt=tb_initial_prompt,
|
| 101 |
temperature=sd_temperature,
|
| 102 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
| 103 |
+
vad_filter=cb_vad_filter,
|
| 104 |
+
threshold=sd_threshold,
|
| 105 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
| 106 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
| 107 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
| 108 |
+
window_size_sample=nb_window_size_sample,
|
| 109 |
+
speech_pad_ms=nb_speech_pad_ms)
|
| 110 |
+
|
| 111 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
| 112 |
inputs=params + whisper_params.to_list(),
|
| 113 |
outputs=[tb_indicator, files_subtitles])
|
|
|
|
| 134 |
with gr.Row():
|
| 135 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
| 136 |
interactive=True)
|
| 137 |
+
with gr.Accordion("Silero VAD Options", open=False):
|
| 138 |
+
cb_vad_filter = gr.Checkbox(label="Enable VAD Filter", value=False, interactive=True)
|
| 139 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
| 140 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
| 141 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
| 142 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
| 143 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
| 144 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
| 145 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
|
| 146 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 147 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 148 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
|
| 174 |
initial_prompt=tb_initial_prompt,
|
| 175 |
temperature=sd_temperature,
|
| 176 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
| 177 |
+
vad_filter=cb_vad_filter,
|
| 178 |
+
threshold=sd_threshold,
|
| 179 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
| 180 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
| 181 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
| 182 |
+
window_size_sample=nb_window_size_sample,
|
| 183 |
+
speech_pad_ms=nb_speech_pad_ms)
|
| 184 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
| 185 |
inputs=params + whisper_params.to_list(),
|
| 186 |
outputs=[tb_indicator, files_subtitles])
|
|
|
|
| 200 |
dd_file_format = gr.Dropdown(["SRT", "WebVTT", "txt"], value="SRT", label="File Format")
|
| 201 |
with gr.Row():
|
| 202 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
| 203 |
+
with gr.Accordion("Silero VAD Options", open=False):
|
| 204 |
+
cb_vad_filter = gr.Checkbox(label="Enable VAD Filter", value=False, interactive=True)
|
| 205 |
+
sd_threshold = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, label="Speech Threshold", value=0.5)
|
| 206 |
+
nb_min_speech_duration_ms = gr.Number(label="Minimum Speech Duration (ms)", precision=0, value=250)
|
| 207 |
+
nb_max_speech_duration_s = gr.Number(label="Maximum Speech Duration (s)", value=9999)
|
| 208 |
+
nb_min_silence_duration_ms = gr.Number(label="Minimum Silence Duration (ms)", precision=0, value=2000)
|
| 209 |
+
nb_window_size_sample = gr.Number(label="Window Size (samples)", precision=0, value=1024)
|
| 210 |
+
nb_speech_pad_ms = gr.Number(label="Speech Padding (ms)", precision=0, value=400)
|
| 211 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
|
| 212 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 213 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 214 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
|
| 239 |
initial_prompt=tb_initial_prompt,
|
| 240 |
temperature=sd_temperature,
|
| 241 |
compression_ratio_threshold=nb_compression_ratio_threshold,
|
| 242 |
+
vad_filter=cb_vad_filter,
|
| 243 |
+
threshold=sd_threshold,
|
| 244 |
+
min_speech_duration_ms=nb_min_speech_duration_ms,
|
| 245 |
+
max_speech_duration_s=nb_max_speech_duration_s,
|
| 246 |
+
min_silence_duration_ms=nb_min_silence_duration_ms,
|
| 247 |
+
window_size_sample=nb_window_size_sample,
|
| 248 |
+
speech_pad_ms=nb_speech_pad_ms)
|
| 249 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
| 250 |
inputs=params + whisper_params.to_list(),
|
| 251 |
outputs=[tb_indicator, files_subtitles])
|