mj-new committed
Commit: 0587641
Parent(s): 81fdfb6

Baseline local whisper model and language selection
app.css
CHANGED
@@ -36,3 +36,7 @@ text-align: left;
 thead tr {
     text-align: left;
 }
+
+#pw {
+    -webkit-text-security: disc;
+}
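The new #pw rule masks whatever is typed into the element with id "pw"; app.py (below) attaches that id to the API-key textbox via elem_id="pw". A minimal sketch of the pairing, not taken from the commit, is shown here; gr.Textbox(type="password") would be the built-in way to get the same effect without custom CSS.

# Minimal sketch (assumptions: Gradio 3.x, CSS passed as a string) of how the
# #pw rule pairs with elem_id="pw"; gr.Textbox(type="password") is the
# built-in alternative to this CSS trick.
import gradio as gr

css = "#pw {-webkit-text-security: disc;}"

with gr.Blocks(css=css) as demo:
    api_key = gr.Textbox(label="API Key", elem_id="pw")  # characters render as dots

demo.launch()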
app.py
CHANGED
@@ -1,6 +1,7 @@
 import gradio as gr
-
+import whisper
 import numpy as np
+import openai
 
 def greet(name):
     return "Hello " + name + "!!"
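The first hunk adds two imports: whisper comes from the openai-whisper package (local ASR), and openai is the OpenAI SDK whose openai.ChatCompletion endpoint, used further down, only exists in pre-1.0 releases. A hedged sanity check, assuming those package choices:

# Not part of the commit: quick check of the assumptions the new code makes,
# i.e. the pre-1.0 OpenAI SDK and the openai-whisper package (module "whisper").
import openai
import whisper

assert hasattr(openai, "ChatCompletion"), "app.py expects openai<1.0"
assert hasattr(whisper, "load_model"), "app.py expects openai-whisper"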
@@ -11,13 +12,120 @@ with open('app.css','r') as f:
 markdown="""
 # Polish ASR BIGOS workspace
 """
+def whisper_model_change(radio_whisper_model):
+    whisper_model = whisper.load_model(radio_whisper_model)
+    return(whisper_model)
 
+def prompt_gpt(input_text):
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."}]
+
+    if input_text:
+        messages.append(
+            {"role": "user", "content": input_text},
+        )
+        chat_completion = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo", messages=messages
+        )
+
+    reply = chat_completion.choices[0].message.content
+    return reply
+
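prompt_gpt wraps the legacy openai.ChatCompletion.create call with a fixed system prompt. Note that the API Key textbox added further down is never passed to the openai module in this commit, so the key still has to come from the OPENAI_API_KEY environment variable. A hedged sketch of how the textbox value could be wired in (the function and wiring below are assumptions, not the author's code):

# Hypothetical variant, not in the commit: accept the key from the UI and hand
# it to the pre-1.0 OpenAI SDK before creating the completion.
import openai

def prompt_gpt_with_key(input_text, api_key):
    if api_key:
        openai.api_key = api_key  # otherwise OPENAI_API_KEY must be set
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    if input_text:
        messages.append({"role": "user", "content": input_text})
    chat_completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo", messages=messages
    )
    return chat_completion.choices[0].message.content

# e.g. button_prompt_gpt.click(prompt_gpt_with_key, inputs=[out_asr, api_key], outputs=out_gpt)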
+def process_pipeline(audio):
+    asr_out = transcribe(audio)
+    gpt_out = prompt_gpt(asr_out)
+    tts_out = synthesize_speech(gpt_out)
+    return(tts_out)
+
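process_pipeline is not wired to any UI event yet, and it calls transcribe(audio) with one argument even though transcribe (defined next) takes four, so invoking it as committed would raise a TypeError. A hypothetical corrected version, reusing the functions from this file:

# Hypothetical fix, not in the commit: forward the same state values that the
# transcribe click handler receives.
def process_pipeline(audio, language, whisper_model, whisper_model_type):
    asr_out = transcribe(audio, language, whisper_model, whisper_model_type)
    gpt_out = prompt_gpt(asr_out)
    tts_out = synthesize_speech(gpt_out)
    return tts_out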
+def transcribe(audio, language, whisper_model, whisper_model_type):
+    if not whisper_model:
+        whisper_model=init_whisper_model(whisper_model_type)
+
+    print(f"Transcribing {audio} for language {language} and model {whisper_model_type}")
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+
+    mel = whisper.log_mel_spectrogram(audio)
+
+    options = whisper.DecodingOptions(language=language, without_timestamps=True, fp16=False)
+    result = whisper.decode(whisper_model, mel, options)
+    result_text = result.text
+    return result_text
+
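transcribe uses Whisper's lower-level decoding path: pad_or_trim fixes the clip at 30 seconds, so only the first 30 seconds of a recording are transcribed, and fp16=False keeps decoding in float32, which CPU-only Spaces need. The higher-level model.transcribe API from the same package handles longer audio by chunking internally; a minimal sketch of that alternative:

# Alternative sketch (not what the commit uses): whisper's high-level API,
# which chunks audio longer than 30 seconds on its own.
import whisper

def transcribe_full(audio_path, language="en", model_name="base"):
    model = whisper.load_model(model_name)
    result = model.transcribe(audio_path, language=language, fp16=False)
    return result["text"]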
+def init_whisper_model(whisper_model_type):
+    print("Initializing whisper model")
+    print(whisper_model_type)
+    whisper_model = whisper.load_model(whisper_model_type)
+    return whisper_model
+
+def synthesize_speech(text):
+    audioobj = gTTS(text = out_result,
+                    lang = lang,
+                    slow = False)
+
+    audioobj.save("Temp.mp3")
+    return("Temp.mp3")
+
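As committed, synthesize_speech would fail if it were ever reached: gTTS is never imported in this diff, and the body references out_result and lang rather than the text parameter. Since it is only called from the unwired process_pipeline, the app still starts. A hypothetical corrected sketch, assuming the gtts package:

# Hypothetical fix, not in the commit: import gTTS and use the function's own
# arguments instead of the undefined out_result / lang names.
from gtts import gTTS

def synthesize_speech(text, language="en"):
    audioobj = gTTS(text=text, lang=language, slow=False)
    audioobj.save("Temp.mp3")
    return "Temp.mp3"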
 block = gr.Blocks(css=css_file)
 with block:
+
+    #state variables
+    language = gr.State("en")
+    whisper_model_type = gr.State("base")
+    whisper_model = gr.State()
+
+    # state handling functions
+    def change_language(choice):
+        if choice == "Polish":
+            language="pl"
+            print("Switching to Polish")
+            print("language")
+            print(language)
+        elif choice == "English":
+            language="en"
+            print("Switching to English")
+            print("language")
+            print(language)
+        return(language)
+
+    def change_whisper_model(choice):
+        whisper_model_type = choice
+        print("Switching Whisper model")
+        print(whisper_model_type)
+        whisper_model = init_whisper_model(whisper_model_type)
+        return [whisper_model_type, whisper_model]
+
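The gr.State components hold per-session values, and the handlers above follow Gradio's state round-trip: assigning language="pl" inside change_language only rebinds a local name, but the value is returned and written into the State object listed in outputs= of the .change() call further down. A minimal self-contained sketch of that pattern (generic names, not from the commit):

# Minimal sketch of the gr.State round-trip: the handler's return values are
# written back into whatever is listed in outputs=.
import gradio as gr

with gr.Blocks() as demo:
    language = gr.State("en")
    radio = gr.Radio(["Polish", "English"], label="Language")
    shown = gr.Textbox(label="Current language code")

    def on_change(choice):
        code = "pl" if choice == "Polish" else "en"
        return code, code  # first value updates the State, second the Textbox

    radio.change(fn=on_change, inputs=radio, outputs=[language, shown])

demo.launch()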
     gr.Markdown(markdown)
     with gr.Tabs():
         with gr.TabItem('Voicebot playground'):
-
-
+            with gr.Box():
+                gr.HTML("<p class=\"apikey\">API Key:</p>")
+                # API key textbox (password-style)
+                api_key = gr.Textbox(label="", elem_id="pw")
+
+            radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none selected, English is used")
+            #radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but costs money")
+            #radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for specific service")
+            radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are better, but slower. Default - base")
+
+            mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
+
+            out_asr = gr.Textbox(placeholder="ASR output",
+                                 lines=5,
+                                 max_lines=10,
+                                 show_label=False)
+            out_gpt = gr.Textbox(placeholder="ChatGPT output",
+                                 lines=10,
+                                 max_lines=25,
+                                 show_label=False)
+
+            button_transcribe = gr.Button("Transcribe")
+            button_prompt_gpt = gr.Button("Prompt ChatGPT")
+
+            button_transcribe.click(transcribe, inputs=[mic_recording,language, whisper_model,whisper_model_type], outputs=out_asr)
+            button_prompt_gpt.click(prompt_gpt, inputs=out_asr, outputs=out_gpt)
+
+            radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)
+            radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])
 
 block.launch()
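The event wiring maps component values to handler arguments positionally: button_transcribe.click(...) calls transcribe with the microphone filepath plus the three state values, and the returned string lands in out_asr. Note that gr.Box and gr.Audio(source="microphone") are Gradio 3.x APIs (removed or renamed in Gradio 4), so running this commit presumably needs a gradio<4 pin. A small self-contained sketch of the positional mapping (generic example, not from the commit):

# Sketch of how .click() maps inputs to arguments and the return value to
# outputs, which is the same mechanism the commit relies on.
import gradio as gr

def shout(text, times):
    return text.upper() * int(times)

with gr.Blocks() as demo:
    text = gr.Textbox(label="Text")
    times = gr.Number(value=2, label="Repeat")
    result = gr.Textbox(label="Result")
    gr.Button("Go").click(shout, inputs=[text, times], outputs=result)

demo.launch()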