mj-new committed
Commit: 0587641
Parent(s): 81fdfb6

Baseline local whisper model and language selection
app.css
CHANGED
@@ -36,3 +36,7 @@ text-align: left;
 thead tr {
     text-align: left;
 }
+
+#pw {
+    -webkit-text-security: disc;
+}
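The new #pw rule masks whatever is typed into the element with id "pw"; app.py (below) attaches that id to the API-key textbox via elem_id="pw". A minimal sketch of the pairing, not taken from the commit, is shown here; gr.Textbox(type="password") would be the built-in way to get the same effect without custom CSS.

# Minimal sketch (assumptions: Gradio 3.x, CSS passed as a string) of how the
# #pw rule pairs with elem_id="pw"; gr.Textbox(type="password") is the
# built-in alternative to this CSS trick.
import gradio as gr

css = "#pw {-webkit-text-security: disc;}"

with gr.Blocks(css=css) as demo:
    api_key = gr.Textbox(label="API Key", elem_id="pw")  # characters render as dots

demo.launch()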
app.py
CHANGED
@@ -1,6 +1,7 @@
 import gradio as gr
-
+import whisper
 import numpy as np
+import openai
 
 def greet(name):
     return "Hello " + name + "!!"
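The first hunk adds two imports: whisper comes from the openai-whisper package (local ASR), and openai is the OpenAI SDK whose openai.ChatCompletion endpoint, used further down, only exists in pre-1.0 releases. A hedged sanity check, assuming those package choices:

# Not part of the commit: quick check of the assumptions the new code makes,
# i.e. the pre-1.0 OpenAI SDK and the openai-whisper package (module "whisper").
import openai
import whisper

assert hasattr(openai, "ChatCompletion"), "app.py expects openai<1.0"
assert hasattr(whisper, "load_model"), "app.py expects openai-whisper"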
@@ -11,13 +12,120 @@ with open('app.css','r') as f:
 markdown="""
 # Polish ASR BIGOS workspace
 """
+def whisper_model_change(radio_whisper_model):
+    whisper_model = whisper.load_model(radio_whisper_model)
+    return(whisper_model)
 
+def prompt_gpt(input_text):
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."}]
+
+    if input_text:
+        messages.append(
+            {"role": "user", "content": input_text},
+        )
+        chat_completion = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo", messages=messages
+        )
+
+    reply = chat_completion.choices[0].message.content
+    return reply
+
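prompt_gpt wraps the legacy openai.ChatCompletion.create call with a fixed system prompt. Note that the API Key textbox added further down is never passed to the openai module in this commit, so the key still has to come from the OPENAI_API_KEY environment variable. A hedged sketch of how the textbox value could be wired in (the function and wiring below are assumptions, not the author's code):

# Hypothetical variant, not in the commit: accept the key from the UI and hand
# it to the pre-1.0 OpenAI SDK before creating the completion.
import openai

def prompt_gpt_with_key(input_text, api_key):
    if api_key:
        openai.api_key = api_key  # otherwise OPENAI_API_KEY must be set
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    if input_text:
        messages.append({"role": "user", "content": input_text})
    chat_completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", messages=messages
    )
    return chat_completion.choices[0].message.content

# e.g. button_prompt_gpt.click(prompt_gpt_with_key, inputs=[out_asr, api_key], outputs=out_gpt)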
+def process_pipeline(audio):
+    asr_out = transcribe(audio)
+    gpt_out = prompt_gpt(asr_out)
+    tts_out = synthesize_speech(gpt_out)
+    return(tts_out)
+
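process_pipeline is not wired to any UI event yet, and it calls transcribe(audio) with one argument even though transcribe (defined next) takes four, so invoking it as committed would raise a TypeError. A hypothetical corrected version, reusing the functions from this file:

# Hypothetical fix, not in the commit: forward the same state values that the
# transcribe click handler receives.
def process_pipeline(audio, language, whisper_model, whisper_model_type):
    asr_out = transcribe(audio, language, whisper_model, whisper_model_type)
    gpt_out = prompt_gpt(asr_out)
    tts_out = synthesize_speech(gpt_out)
    return tts_out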
+def transcribe(audio, language, whisper_model, whisper_model_type):
+    if not whisper_model:
+        whisper_model=init_whisper_model(whisper_model_type)
+
+    print(f"Transcribing {audio} for language {language} and model {whisper_model_type}")
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+
+    mel = whisper.log_mel_spectrogram(audio)
+
+    options = whisper.DecodingOptions(language=language, without_timestamps=True, fp16=False)
+    result = whisper.decode(whisper_model, mel, options)
+    result_text = result.text
+    return result_text
+
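transcribe uses Whisper's lower-level decoding path: pad_or_trim fixes the clip at 30 seconds, so only the first 30 seconds of a recording are transcribed, and fp16=False keeps decoding in float32, which CPU-only Spaces need. The higher-level model.transcribe API from the same package handles longer audio by chunking internally; a minimal sketch of that alternative:

# Alternative sketch (not what the commit uses): whisper's high-level API,
# which chunks audio longer than 30 seconds on its own.
import whisper

def transcribe_full(audio_path, language="en", model_name="base"):
    model = whisper.load_model(model_name)
    result = model.transcribe(audio_path, language=language, fp16=False)
    return result["text"]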
+def init_whisper_model(whisper_model_type):
+    print("Initializing whisper model")
+    print(whisper_model_type)
+    whisper_model = whisper.load_model(whisper_model_type)
+    return whisper_model
+
+def synthesize_speech(text):
+    audioobj = gTTS(text = out_result,
+                    lang = lang,
+                    slow = False)
+
+    audioobj.save("Temp.mp3")
+    return("Temp.mp3")
+
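As committed, synthesize_speech would fail if it were ever reached: gTTS is never imported in this diff, and the body references out_result and lang rather than the text parameter. Since it is only called from the unwired process_pipeline, the app still starts. A hypothetical corrected sketch, assuming the gtts package:

# Hypothetical fix, not in the commit: import gTTS and use the function's own
# arguments instead of the undefined out_result / lang names.
from gtts import gTTS

def synthesize_speech(text, language="en"):
    audioobj = gTTS(text=text, lang=language, slow=False)
    audioobj.save("Temp.mp3")
    return "Temp.mp3"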
 block = gr.Blocks(css=css_file)
 with block:
+
+    #state variables
+    language = gr.State("en")
+    whisper_model_type = gr.State("base")
+    whisper_model = gr.State()
+
+    # state handling functions
+    def change_language(choice):
+        if choice == "Polish":
+            language="pl"
+            print("Switching to Polish")
+            print("language")
+            print(language)
+        elif choice == "English":
+            language="en"
+            print("Switching to English")
+            print("language")
+            print(language)
+        return(language)
+
+    def change_whisper_model(choice):
+        whisper_model_type = choice
+        print("Switching Whisper model")
+        print(whisper_model_type)
+        whisper_model = init_whisper_model(whisper_model_type)
+        return [whisper_model_type, whisper_model]
+
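The gr.State components hold per-session values, and the handlers above follow Gradio's state round-trip: assigning language="pl" inside change_language only rebinds a local name, but the value is returned and written into the State object listed in outputs= of the .change() call further down. A minimal self-contained sketch of that pattern (generic names, not from the commit):

# Minimal sketch of the gr.State round-trip: the handler's return values are
# written back into whatever is listed in outputs=.
import gradio as gr

with gr.Blocks() as demo:
    language = gr.State("en")
    radio = gr.Radio(["Polish", "English"], label="Language")
    shown = gr.Textbox(label="Current language code")

    def on_change(choice):
        code = "pl" if choice == "Polish" else "en"
        return code, code  # first value updates the State, second the Textbox

    radio.change(fn=on_change, inputs=radio, outputs=[language, shown])

demo.launch()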
     gr.Markdown(markdown)
     with gr.Tabs():
         with gr.TabItem('Voicebot playground'):
-
-
+            with gr.Box():
+                gr.HTML("<p class=\"apikey\">API Key:</p>")
+                # API key textbox (password-style)
+                api_key = gr.Textbox(label="", elem_id="pw")
+
+            radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none selected, English is used")
+            #radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but costs money")
+            #radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for specific service")
+            radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are better, but slower. Default - base")
+
+            mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
+
+            out_asr = gr.Textbox(placeholder="ASR output",
+                                 lines=5,
+                                 max_lines=10,
+                                 show_label=False)
+            out_gpt = gr.Textbox(placeholder="ChatGPT output",
+                                 lines=10,
+                                 max_lines=25,
+                                 show_label=False)
+
+            button_transcribe = gr.Button("Transcribe")
+            button_prompt_gpt = gr.Button("Prompt ChatGPT")
+
+            button_transcribe.click(transcribe, inputs=[mic_recording,language, whisper_model,whisper_model_type], outputs=out_asr)
+            button_prompt_gpt.click(prompt_gpt, inputs=out_asr, outputs=out_gpt)
+
+            radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)
+            radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])
 
 block.launch()
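The event wiring maps component values to handler arguments positionally: button_transcribe.click(...) calls transcribe with the microphone filepath plus the three state values, and the returned string lands in out_asr. Note that gr.Box and gr.Audio(source="microphone") are Gradio 3.x APIs (removed or renamed in Gradio 4), so running this commit presumably needs a gradio<4 pin. A small self-contained sketch of the positional mapping (generic example, not from the commit):

# Sketch of how .click() maps inputs to arguments and the return value to
# outputs, which is the same mechanism the commit relies on.
import gradio as gr

def shout(text, times):
    return text.upper() * int(times)

with gr.Blocks() as demo:
    text = gr.Textbox(label="Text")
    times = gr.Number(value=2, label="Repeat")
    result = gr.Textbox(label="Result")
    gr.Button("Go").click(shout, inputs=[text, times], outputs=result)

demo.launch()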