Merge pull request #141 from jhj0517/feature/add-parameters
Files changed:
- app.py +15 -3
- modules/whisper_data_class.py +23 -0
app.py
CHANGED
@@ -65,6 +65,8 @@ class App:
 dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
 nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
 nb_patience = gr.Number(label="Patience", value=1, interactive=True)
+cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
+tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
 with gr.Row():
 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
 with gr.Row():
@@ -81,7 +83,9 @@ class App:
 no_speech_threshold=nb_no_speech_threshold,
 compute_type=dd_compute_type,
 best_of=nb_best_of,
-patience=nb_patience
+patience=nb_patience,
+condition_on_previous_text=cb_condition_on_previous_text,
+initial_prompt=tb_initial_prompt)
 btn_run.click(fn=self.whisper_inf.transcribe_file,
 inputs=params + whisper_params.to_list(),
 outputs=[tb_indicator, files_subtitles])
@@ -115,6 +119,8 @@ class App:
 dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
 nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
 nb_patience = gr.Number(label="Patience", value=1, interactive=True)
+cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
+tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
 with gr.Row():
 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
 with gr.Row():
@@ -131,7 +137,9 @@ class App:
 no_speech_threshold=nb_no_speech_threshold,
 compute_type=dd_compute_type,
 best_of=nb_best_of,
-patience=nb_patience
+patience=nb_patience,
+condition_on_previous_text=cb_condition_on_previous_text,
+initial_prompt=tb_initial_prompt)
 btn_run.click(fn=self.whisper_inf.transcribe_youtube,
 inputs=params + whisper_params.to_list(),
 outputs=[tb_indicator, files_subtitles])
@@ -158,6 +166,8 @@ class App:
 dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
 nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
 nb_patience = gr.Number(label="Patience", value=1, interactive=True)
+cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text", value=True, interactive=True)
+tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
 with gr.Row():
 btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
 with gr.Row():
@@ -174,7 +184,9 @@ class App:
 no_speech_threshold=nb_no_speech_threshold,
 compute_type=dd_compute_type,
 best_of=nb_best_of,
-patience=nb_patience
+patience=nb_patience,
+condition_on_previous_text=cb_condition_on_previous_text,
+initial_prompt=tb_initial_prompt)
 btn_run.click(fn=self.whisper_inf.transcribe_mic,
 inputs=params + whisper_params.to_list(),
 outputs=[tb_indicator, files_subtitles])
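Each of the three tabs (file, YouTube, microphone) gets the same treatment: two new components, cb_condition_on_previous_text (a gr.Checkbox defaulting to True) and tb_initial_prompt (a gr.Textbox), are created next to the existing decoding options and passed into WhisperGradioComponents, whose to_list() output is appended to the inputs of btn_run.click. The sketch below is a condensed, stand-alone illustration of that wiring, not the repo's exact layout; transcribe_stub is a placeholder for self.whisper_inf.transcribe_* and most of the original components are omitted.

```python
# Condensed sketch of the wiring this diff adds to each tab (not the repo's exact code).
import gradio as gr

def transcribe_stub(best_of, patience, condition_on_previous_text, initial_prompt):
    # Stand-in for self.whisper_inf.transcribe_file; it just echoes the new options.
    return (f"best_of={best_of}, patience={patience}, "
            f"condition_on_previous_text={condition_on_previous_text}, "
            f"initial_prompt={initial_prompt!r}")

with gr.Blocks() as demo:
    nb_best_of = gr.Number(label="Best Of", value=5, interactive=True)
    nb_patience = gr.Number(label="Patience", value=1, interactive=True)
    # The two components introduced by this PR:
    cb_condition_on_previous_text = gr.Checkbox(label="Condition On Previous Text",
                                                value=True, interactive=True)
    tb_initial_prompt = gr.Textbox(label="Initial Prompt", value=None, interactive=True)
    btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
    tb_indicator = gr.Textbox(label="Output")

    # In app.py the components are wrapped in WhisperGradioComponents and expanded
    # with .to_list(); here they are listed directly to keep the sketch short.
    btn_run.click(fn=transcribe_stub,
                  inputs=[nb_best_of, nb_patience,
                          cb_condition_on_previous_text, tb_initial_prompt],
                  outputs=[tb_indicator])

if __name__ == "__main__":
    demo.launch()
```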
modules/whisper_data_class.py
CHANGED
@@ -1,5 +1,6 @@
 from dataclasses import dataclass, fields
 import gradio as gr
+from typing import Optional


 @dataclass
@@ -13,6 +14,8 @@ class WhisperGradioComponents:
 compute_type: gr.Dropdown
 best_of: gr.Number
 patience: gr.Number
+condition_on_previous_text: gr.Checkbox
+initial_prompt: gr.Textbox
 """
 A data class to pass Gradio components to the function before Gradio pre-processing.
 See this documentation for more information about Gradio pre-processing: https://www.gradio.app/docs/components
@@ -21,26 +24,44 @@ class WhisperGradioComponents:
 ----------
 model_size: gr.Dropdown
 Whisper model size.
+
 lang: gr.Dropdown
 Source language of the file to transcribe.
+
 is_translate: gr.Checkbox
 Boolean value that determines whether to translate to English.
 It's Whisper's feature to translate speech from another language directly into English end-to-end.
+
 beam_size: gr.Number
 Int value that is used for decoding option.
+
 log_prob_threshold: gr.Number
 If the average log probability over sampled tokens is below this value, treat as failed.
+
 no_speech_threshold: gr.Number
 If the no_speech probability is higher than this value AND
 the average log probability over sampled tokens is below `log_prob_threshold`,
 consider the segment as silent.
+
 compute_type: gr.Dropdown
 compute type for transcription.
 see more info : https://opennmt.net/CTranslate2/quantization.html
+
 best_of: gr.Number
 Number of candidates when sampling with non-zero temperature.
+
 patience: gr.Number
 Beam search patience factor.
+
+condition_on_previous_text: gr.Checkbox
+if True, the previous output of the model is provided as a prompt for the next window;
+disabling may make the text inconsistent across windows, but the model becomes less prone to
+getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
+
+initial_prompt: gr.Textbox
+Optional text to provide as a prompt for the first window. This can be used to provide, or
+"prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
+to make it more likely to predict those word correctly.
 """

 def to_list(self) -> list:
@@ -66,6 +87,8 @@ class WhisperValues:
 compute_type: str
 best_of: int
 patience: float
+condition_on_previous_text: bool
+initial_prompt: Optional[str]
 """
 A data class to use Whisper parameters in your function after Gradio pre-processing.
 See this documentation for more information about Gradio pre-processing: : https://www.gradio.app/docs/components
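Taken together, the two classes form a simple contract: WhisperGradioComponents lists the Gradio input components (before pre-processing), WhisperValues lists the plain Python values the transcribe functions receive (after pre-processing), and the field order must match on both sides so the positional arguments can be zipped back into a dataclass. The diff does not show the body of to_list(), so the sketch below reconstructs the likely pattern with dataclasses.fields (an assumption consistent with the file's imports), and the field set is trimmed to the parameters this PR touches. Both openai-whisper's transcribe and faster-whisper's WhisperModel.transcribe accept condition_on_previous_text and initial_prompt keyword arguments, so the resulting values can be forwarded as-is.

```python
# Minimal sketch of the data-class round trip that whisper_data_class.py enables.
# The real to_list() is not shown in this diff; the version here, built on
# dataclasses.fields, is an assumption. Field set is reduced for brevity.
from dataclasses import dataclass, fields
from typing import Optional

@dataclass
class WhisperGradioComponents:
    # In the repo these fields hold gr.Number / gr.Checkbox / gr.Textbox components;
    # plain values are used here so the example runs without Gradio.
    best_of: object
    patience: object
    condition_on_previous_text: object
    initial_prompt: object

    def to_list(self) -> list:
        # Field order defines the positional order of the btn_run.click inputs,
        # so WhisperValues can be rebuilt from *args on the other side.
        return [getattr(self, f.name) for f in fields(self)]

@dataclass
class WhisperValues:
    best_of: int
    patience: float
    condition_on_previous_text: bool
    initial_prompt: Optional[str]

def transcribe(*args) -> str:
    params = WhisperValues(*args)
    # After Gradio pre-processing, condition_on_previous_text is a bool and
    # initial_prompt a str (or None); both map onto the keyword arguments of the
    # same name in Whisper's transcribe APIs.
    return (f"condition_on_previous_text={params.condition_on_previous_text}, "
            f"initial_prompt={params.initial_prompt!r}")

components = WhisperGradioComponents(best_of=5, patience=1,
                                     condition_on_previous_text=True,
                                     initial_prompt="Proper nouns: Gradio, Whisper")
print(transcribe(*components.to_list()))
```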