Merge pull request #141 from jhj0517/feature/add-parameters
Files changed:
- app.py +15 -3
- modules/whisper_data_class.py +23 -0
app.py
CHANGED
@@ -65,6 +65,8 @@ class App:
 dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
 nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
 nb_patience = gr.Number(label="Patience", value=1, interactive=True)
+cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
+tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
 with gr.Row():
 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
 with gr.Row():
@@ -81,7 +83,9 @@ class App:
 no_speech_threshold=nb_no_speech_threshold,
 compute_type=dd_compute_type,
 best_of=nb_best_of,
-patience=nb_patience
+patience=nb_patience,
+condition_on_previous_text=cb_condition_on_previous_text,
+initial_prompt=tb_initial_prompt)
 btn_run.click(fn=self.whisper_inf.transcribe_file,
 inputs=params + whisper_params.to_list(),
 outputs=[tb_indicator, files_subtitles])
@@ -115,6 +119,8 @@ class App:
 dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
 nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
 nb_patience = gr.Number(label="Patience", value=1, interactive=True)
+cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
+tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
 with gr.Row():
 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
 with gr.Row():
@@ -131,7 +137,9 @@ class App:
 no_speech_threshold=nb_no_speech_threshold,
 compute_type=dd_compute_type,
 best_of=nb_best_of,
-patience=nb_patience
+patience=nb_patience,
+condition_on_previous_text=cb_condition_on_previous_text,
+initial_prompt=tb_initial_prompt)
 btn_run.click(fn=self.whisper_inf.transcribe_youtube,
 inputs=params + whisper_params.to_list(),
 outputs=[tb_indicator, files_subtitles])
@@ -158,6 +166,8 @@ class App:
 dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
 nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
 nb_patience = gr.Number(label="Patience", value=1, interactive=True)
+cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
+tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
 with gr.Row():
 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
 with gr.Row():
@@ -174,7 +184,9 @@ class App:
 no_speech_threshold=nb_no_speech_threshold,
 compute_type=dd_compute_type,
 best_of=nb_best_of,
-patience=nb_patience
+patience=nb_patience,
+condition_on_previous_text=cb_condition_on_previous_text,
+initial_prompt=tb_initial_prompt)
 btn_run.click(fn=self.whisper_inf.transcribe_mic,
 inputs=params + whisper_params.to_list(),
 outputs=[tb_indicator, files_subtitles])
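Each of the three tabs (file, YouTube, microphone) gets the same treatment: two new components, cb_condition_on_previous_text (a gr.Checkbox defaulting to True) and tb_initial_prompt (a gr.Textbox), are created next to the existing decoding options and passed into WhisperGradioComponents, whose to_list() output is appended to the inputs of btn_run.click. The sketch below is a condensed, stand-alone illustration of that wiring, not the repo's exact layout; transcribe_stub is a placeholder for self.whisper_inf.transcribe_* and most of the original components are omitted.

```python
# Condensed sketch of the wiring this diff adds to each tab (not the repo's exact code).
import gradio as gr

def transcribe_stub(best_of, patience, condition_on_previous_text, initial_prompt):
    # Stand-in for self.whisper_inf.transcribe_file; it just echoes the new options.
    return (f"best_of={best_of}, patience={patience}, "
            f"condition_on_previous_text={condition_on_previous_text}, "
            f"initial_prompt={initial_prompt!r}")

with gr.Blocks() as demo:
    nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
    nb_patience = gr.Number(label="Patience", value=1, interactive=True)
    # The two components introduced by this PR:
    cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text",
                                                value=True, interactive=True)
    tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
    btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
    tb_indicator = gr.Textbox(label="Output")

    # In app.py the components are wrapped in WhisperGradioComponents and expanded
    # with .to_list(); here they are listed directly to keep the sketch short.
    btn_run.click(fn=transcribe_stub,
                  inputs=[nb_best_of, nb_patience,
                          cb_condition_on_previous_text, tb_initial_prompt],
                  outputs=[tb_indicator])

if __name__ == "__main__":
    demo.launch()
```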
modules/whisper_data_class.py
CHANGED
@@ -1,5 +1,6 @@
 from dataclasses import dataclass, fields
 import gradio as gr
+from typing import Optional


 @dataclass
@@ -13,6 +14,8 @@ class WhisperGradioComponents:
 compute_type: gr.Dropdown
 best_of: gr.Number
 patience: gr.Number
+condition_on_previous_text: gr.Checkbox
+initial_prompt: gr.Textbox
 """
 A data class to pass Gradio components to the function before Gradio pre-processing.
 See this documentation for more information about Gradio pre-processing: https://www.gradio.app/docs/components
@@ -21,26 +24,44 @@ class WhisperGradioComponents:
 ----------
 model_size: gr.Dropdown
 Whisper model size.
+
 lang: gr.Dropdown
 Source language of the file to transcribe.
+
 is_translate: gr.Checkbox
 Boolean value that determines whether to translate to English.
 It's Whisper's feature to translate speech from another language directly into English end-to-end.
+
 beam_size: gr.Number
 Int value that is used for decoding option.
+
 log_prob_threshold: gr.Number
 If the average log probability over sampled tokens is below this value, treat as failed.
+
 no_speech_threshold: gr.Number
 If the no_speech probability is higher than this value AND
 the average log probability over sampled tokens is below `log_prob_threshold`,
 consider the segment as silent.
+
 compute_type: gr.Dropdown
 compute type for transcription.
 see more info : https://opennmt.net/CTranslate2/quantization.html
+
 best_of: gr.Number
 Number of candidates when sampling with non-zero temperature.
+
 patience: gr.Number
 Beam search patience factor.
+
+condition_on_previous_text: gr.Checkbox
+if True, the previous output of the model is provided as a prompt for the next window;
+disabling may make the text inconsistent across windows, but the model becomes less prone to
+getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
+
+initial_prompt: gr.Textbox
+Optional text to provide as a prompt for the first window. This can be used to provide, or
+"prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
+to make it more likely to predict those word correctly.
 """

 def to_list(self) -> list:
@@ -66,6 +87,8 @@ class WhisperValues:
 compute_type: str
 best_of: int
 patience: float
+condition_on_previous_text: bool
+initial_prompt: Optional[str]
 """
 A data class to use Whisper parameters in your function after Gradio pre-processing.
 See this documentation for more information about Gradio pre-processing: : https://www.gradio.app/docs/components
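Taken together, the two classes form a simple contract: WhisperGradioComponents lists the Gradio input components (before pre-processing), WhisperValues lists the plain Python values the transcribe functions receive (after pre-processing), and the field order must match on both sides so the positional arguments can be zipped back into a dataclass. The diff does not show the body of to_list(), so the sketch below reconstructs the likely pattern with dataclasses.fields (an assumption consistent with the file's imports), and the field set is trimmed to the parameters this PR touches. Both openai-whisper's transcribe and faster-whisper's WhisperModel.transcribe accept condition_on_previous_text and initial_prompt keyword arguments, so the resulting values can be forwarded as-is.

```python
# Minimal sketch of the data-class round trip that whisper_data_class.py enables.
# The real to_list() is not shown in this diff; the version here, built on
# dataclasses.fields, is an assumption. Field set is reduced for brevity.
from dataclasses import dataclass, fields
from typing import Optional

@dataclass
class WhisperGradioComponents:
    # In the repo these fields hold gr.Number / gr.Checkbox / gr.Textbox components;
    # plain values are used here so the example runs without Gradio.
    best_of: object
    patience: object
    condition_on_previous_text: object
    initial_prompt: object

    def to_list(self) -> list:
        # Field order defines the positional order of the btn_run.click inputs,
        # so WhisperValues can be rebuilt from *args on the other side.
        return [getattr(self, f.name) for f in fields(self)]

@dataclass
class WhisperValues:
    best_of: int
    patience: float
    condition_on_previous_text: bool
    initial_prompt: Optional[str]

def transcribe(*args) -> str:
    params = WhisperValues(*args)
    # After Gradio pre-processing, condition_on_previous_text is a bool and
    # initial_prompt a str (or None); both map onto the keyword arguments of the
    # same name in Whisper's transcribe APIs.
    return (f"condition_on_previous_text={params.condition_on_previous_text}, "
            f"initial_prompt={params.initial_prompt!r}")

components = WhisperGradioComponents(best_of=5, patience=1,
                                     condition_on_previous_text=True,
                                     initial_prompt="Proper nouns: Gradio, Whisper")
print(transcribe(*components.to_list()))
```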