|
import gradio as gr |
|
import whisper |
|
import numpy as np |
|
import openai |
|
|
|
def greet(name):
    """Return a playful greeting for *name*."""
    return f"Hello {name}!!"
|
|
|
# Load the workspace stylesheet shipped next to this script.
# encoding declared explicitly so the read does not depend on the
# platform default (original relied on the locale encoding).
with open('app.css', 'r', encoding='utf-8') as f:
    css_file = f.read()

# Markdown header rendered at the top of the Gradio UI.
markdown = """

# Polish ASR BIGOS workspace

"""
|
def whisper_model_change(radio_whisper_model):
    """Load and return the Whisper checkpoint named by the radio selection.

    `radio_whisper_model` is a checkpoint name such as "base" or "small",
    passed straight through to `whisper.load_model`.
    """
    return whisper.load_model(radio_whisper_model)
|
|
|
def prompt_gpt(input_text):
    """Send *input_text* to ChatGPT (gpt-3.5-turbo) and return the reply text.

    An empty/falsy *input_text* still triggers a request containing only the
    system prompt, matching the original behavior.
    """
    conversation = [{"role": "system", "content": "You are a helpful assistant."}]
    if input_text:
        conversation.append({"role": "user", "content": input_text})

    # Blocking network call; requires openai.api_key to be configured.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
    )
    return response.choices[0].message.content
|
|
|
def process_pipeline(audio):
    """Full voicebot pipeline: speech -> text -> ChatGPT -> synthesized speech.

    Parameters
    ----------
    audio : str
        Path to the recorded audio file (Gradio ``type="filepath"``).

    Returns
    -------
    str
        Path to the synthesized reply audio ("Temp.mp3").
    """
    # BUG FIX: transcribe() requires (audio, language, whisper_model,
    # whisper_model_type); the original called it with only `audio`, which
    # raised TypeError. Use the app's defaults: English, lazily-initialized
    # "base" model (transcribe() loads it when whisper_model is falsy).
    asr_out = transcribe(audio, "en", None, "base")
    gpt_out = prompt_gpt(asr_out)
    tts_out = synthesize_speech(gpt_out)
    return tts_out
|
|
|
def transcribe(audio, language, whisper_model, whisper_model_type):
    """Run Whisper ASR on an audio file and return the decoded text.

    Parameters
    ----------
    audio : str
        Path to the audio file to transcribe.
    language : str
        Language code handed to Whisper's decoder (e.g. "en", "pl").
    whisper_model
        A loaded Whisper model, or a falsy value to load one on demand.
    whisper_model_type : str
        Checkpoint name used when the model must be loaded here.
    """
    # Lazily load the model if the caller did not supply one.
    if not whisper_model:
        whisper_model = init_whisper_model(whisper_model_type)

    print(f"Transcribing {audio} for language {language} and model {whisper_model_type}")

    # Load, then pad/trim to Whisper's fixed 30 s input window.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))
    mel = whisper.log_mel_spectrogram(waveform)

    # fp16=False keeps decoding on CPU-friendly float32.
    decoding_options = whisper.DecodingOptions(
        language=language, without_timestamps=True, fp16=False
    )
    decoded = whisper.decode(whisper_model, mel, decoding_options)
    return decoded.text
|
|
|
def init_whisper_model(whisper_model_type):
    """Instantiate and return a Whisper model for the given checkpoint name."""
    print("Initializing whisper model")
    print(whisper_model_type)
    return whisper.load_model(whisper_model_type)
|
|
|
def synthesize_speech(text, lang="en"):
    """Synthesize *text* to speech with gTTS and return the output file path.

    Parameters
    ----------
    text : str
        The text to speak.
    lang : str, optional
        gTTS language code; defaults to "en". (New keyword-only-by-default
        parameter — backward compatible with existing single-argument calls.)

    Returns
    -------
    str
        Path of the written MP3 file ("Temp.mp3").

    BUG FIXES vs. original:
    - used undefined name `out_result` instead of the `text` parameter,
    - used undefined name `lang` (now a defaulted parameter),
    - `gTTS` was never imported anywhere in this file (NameError at call
      time); imported locally here so the rest of the app loads without
      the third-party dependency.
    """
    from gtts import gTTS  # third-party; not imported at module level

    audioobj = gTTS(text=text, lang=lang, slow=False)
    audioobj.save("Temp.mp3")
    return "Temp.mp3"
|
|
|
# Build the Gradio UI. Event wiring below is order-dependent (components
# must exist before .click/.change references them), so the structure is
# left exactly as-is and only documented.
block = gr.Blocks(css=css_file)

with block:



    # Per-session state shared between event handlers.
    language = gr.State("en")              # language code fed to Whisper decoding
    whisper_model_type = gr.State("base")  # checkpoint name for whisper.load_model
    whisper_model = gr.State()             # loaded model; empty until first load




    def change_language(choice):
        """Radio callback: map the UI selection to a Whisper language code.

        NOTE(review): if `choice` is neither "Polish" nor "English" (e.g. the
        radio is deselected), the local `language` is never assigned and the
        `return` raises UnboundLocalError — confirm whether Gradio can emit
        such an event here.
        """
        if choice == "Polish":

            language="pl"

            print("Switching to Polish")

            # Prints the literal string "language", then the value — debug output.
            print("language")

            print(language)

        elif choice == "English":

            language="en"

            print("Switching to English")

            print("language")

            print(language)

        return(language)



    def change_whisper_model(choice):
        """Radio callback: switch checkpoints and eagerly load the new model.

        Returns [checkpoint_name, loaded_model] matching the two State
        outputs wired below.
        """
        whisper_model_type = choice

        print("Switching Whisper model")

        print(whisper_model_type)

        # Eager load so the first transcription after switching is not slowed
        # by a lazy model load.
        whisper_model = init_whisper_model(whisper_model_type)

        return [whisper_model_type, whisper_model]



    gr.Markdown(markdown)

    with gr.Tabs():

        with gr.TabItem('Voicebot playground'):

            with gr.Box():

                gr.HTML("<p class=\"apikey\">API Key:</p>")



                # NOTE(review): this textbox is never wired to openai.api_key
                # anywhere in this file — the entered key appears unused.
                api_key = gr.Textbox(label="", elem_id="pw")



                radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none selected, English is used")




                radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are better, but slower. Default - base")



                # Microphone input delivered to transcribe() as a file path.
                mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')



                out_asr = gr.Textbox(placeholder="ASR output",

                                   lines=5,

                                   max_lines=10,

                                   show_label=False)

                out_gpt = gr.Textbox(placeholder="ChatGPT output",

                                   lines=10,

                                   max_lines=25,

                                   show_label=False)



                button_transcribe = gr.Button("Transcribe")

                button_prompt_gpt = gr.Button("Prompt ChatGPT")



                # NOTE(review): transcribe() lazily loads a model when the
                # whisper_model State is empty, but the loaded model is not
                # returned to the State — it is reloaded on every click until
                # the model radio is used.
                button_transcribe.click(transcribe, inputs=[mic_recording,language, whisper_model,whisper_model_type], outputs=out_asr)

                button_prompt_gpt.click(prompt_gpt, inputs=out_asr, outputs=out_gpt)



                radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)

                radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])



block.launch()