import gradio as gr
import whisper
import numpy as np
import openai
import os
from gtts import gTTS
import json
import hashlib
import random
import string
import uuid
from datetime import date, datetime
from huggingface_hub import Repository, upload_file
import shutil

HF_TOKEN_WRITE = os.environ.get("HF_TOKEN_WRITE")
# Log only whether the write token is set, never the secret itself
print("HF_TOKEN_WRITE set:", HF_TOKEN_WRITE is not None)

today = date.today()
today_ymd = today.strftime("%Y%m%d")
def greet(name):
    return "Hello " + name + "!!"

with open('app.css', 'r') as f:
    css_file = f.read()

markdown = """
# Polish ASR BIGOS workspace
"""
# TODO move to config
WORKING_DATASET_REPO_URL = "https://huggingface.co/datasets/goodmike31/working-db"
REPO_NAME = "goodmike31/working-db"
REPOSITORY_DIR = "data"
LOCAL_DIR = "data_local"
os.makedirs(LOCAL_DIR, exist_ok=True)
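# One way to tackle the TODO above (a commented-out sketch; the environment
# variable names are assumptions): let deployments override the repo settings.
# REPO_NAME = os.environ.get("WORKING_DB_REPO", "goodmike31/working-db")
# LOCAL_DIR = os.environ.get("WORKING_DB_LOCAL_DIR", "data_local")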
def dump_json(thing, file):
    with open(file, 'w+', encoding="utf8") as f:
        json.dump(thing, f)

def get_unique_name():
    return ''.join([random.choice(string.ascii_letters + string.digits) for n in range(32)])
def save_recording_and_meta(project_name, recording, transcript, language):
    # TODO save user data (name, age, gender) in the next version
    speaker_metadata = {}
    speaker_metadata['gender'] = "test"  # gender if gender != GENDER[0] else ''
    speaker_metadata['age'] = "test"     # age if age != '' else ''
    speaker_metadata['accent'] = "test"  # accent if accent != '' else ''
    lang_id = language.lower()
    # TODO get ISO 639-1 codes
    transcript = transcript.strip()

    SAVE_ROOT_DIR = os.path.join(LOCAL_DIR, project_name, today_ymd)
    SAVE_DIR_AUDIO = os.path.join(SAVE_ROOT_DIR, "audio")
    SAVE_DIR_META = os.path.join(SAVE_ROOT_DIR, "meta")
    os.makedirs(SAVE_DIR_AUDIO, exist_ok=True)
    os.makedirs(SAVE_DIR_META, exist_ok=True)

    # Write audio to file
    uuid_name = str(uuid.uuid4())
    audio_fn = uuid_name + ".wav"
    audio_output_fp = os.path.join(SAVE_DIR_AUDIO, audio_fn)
    print(f"Saving {recording} as {audio_output_fp}")
    shutil.copy2(recording, audio_output_fp)

    # Write metadata to file (a single JSON object per recording)
    meta_fn = uuid_name + '_metadata.json'
    json_file_path = os.path.join(SAVE_DIR_META, meta_fn)
    now = datetime.now()
    timestamp_str = now.strftime("%d/%m/%Y %H:%M:%S")
    metadata = {'id': uuid_name, 'audio_file': audio_fn,
                'language_name': language, 'language_id': lang_id,
                'transcript': transcript, 'age': speaker_metadata['age'],
                'gender': speaker_metadata['gender'], 'accent': speaker_metadata['accent'],
                'date': today_ymd, 'timestamp': timestamp_str}
    dump_json(metadata, json_file_path)

    # Upload the audio and metadata using the hub's upload_file.
    # path_in_repo always uses forward slashes, so build it explicitly
    # rather than with os.path.join.
    repo_audio_path = f"{REPOSITORY_DIR}/{project_name}/{today_ymd}/audio/{audio_fn}"
    _ = upload_file(path_or_fileobj=audio_output_fp,
                    path_in_repo=repo_audio_path,
                    repo_id=REPO_NAME,
                    repo_type='dataset',
                    token=HF_TOKEN_WRITE)

    # Upload the metadata
    repo_json_path = f"{REPOSITORY_DIR}/{project_name}/{today_ymd}/meta/{meta_fn}"
    _ = upload_file(path_or_fileobj=json_file_path,
                    path_in_repo=repo_json_path,
                    repo_id=REPO_NAME,
                    repo_type='dataset',
                    token=HF_TOKEN_WRITE)

    print(f"Recording {audio_fn} and meta file {meta_fn} successfully saved to repo!")
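# Optional batching variant (a sketch, not called anywhere; the function name
# is an assumption): upload_file above creates one commit per file, i.e. two
# commits per recording. huggingface_hub's upload_folder can push a whole
# session directory in a single commit instead.
from huggingface_hub import upload_folder

def upload_session(project_name):
    # Push everything recorded today for this project in one commit
    upload_folder(folder_path=os.path.join(LOCAL_DIR, project_name, today_ymd),
                  path_in_repo=f"{REPOSITORY_DIR}/{project_name}/{today_ymd}",
                  repo_id=REPO_NAME,
                  repo_type='dataset',
                  token=HF_TOKEN_WRITE)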
def whisper_model_change(radio_whisper_model):
    whisper_model = whisper.load_model(radio_whisper_model)
    return whisper_model
def prompt_gpt(input_text, api_key, temperature):
    # TODO add option to specify instruction
    openai.api_key = api_key
    # TODO add specific message for specific role
    system_role_message = "You are a helpful assistant"
    messages = [{"role": "system", "content": system_role_message}]
    if input_text:
        messages.append({"role": "user", "content": input_text})
    # Uses the legacy openai<1.0 ChatCompletion API
    chat_completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=temperature
    )
    reply = chat_completion.choices[0].message.content
    # TODO save chat completion for future reuse
    return reply
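# Note: if the Space is ever pinned to openai>=1.0, ChatCompletion is removed
# from the library and the call becomes the client API below (a commented-out
# sketch, not used by the app; the function name is an assumption):
#
# from openai import OpenAI
#
# def prompt_gpt_v1(input_text, api_key, temperature):
#     client = OpenAI(api_key=api_key)
#     completion = client.chat.completions.create(
#         model="gpt-3.5-turbo",
#         messages=[{"role": "system", "content": "You are a helpful assistant"},
#                   {"role": "user", "content": input_text}],
#         temperature=temperature)
#     return completion.choices[0].message.content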
def process_pipeline(audio, language, whisper_model, whisper_model_type, api_key, temperature):
    # Chain ASR -> ChatGPT -> TTS in one call (not yet wired to a button);
    # pass through the arguments the individual steps require.
    asr_out = transcribe(audio, language, whisper_model, whisper_model_type)
    gpt_out = prompt_gpt(asr_out, api_key, temperature)
    tts_out = synthesize_speech(gpt_out, language)
    return tts_out
def transcribe(audio, language, whisper_model, whisper_model_type):
    if not whisper_model:
        whisper_model = init_whisper_model(whisper_model_type)
    print(f"Transcribing {audio} for language {language} and model {whisper_model_type}")
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio)
    options = whisper.DecodingOptions(language=language, without_timestamps=True, fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    return result.text
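# whisper.pad_or_trim above fixes the input to a single 30-second window, so
# longer recordings are truncated. Whisper's high-level transcribe() method
# chunks long audio internally; a minimal long-form variant (a sketch, not
# used by the button handler) could look like this:
def transcribe_long(audio_path, language, whisper_model):
    # model.transcribe slides over the whole file instead of one 30 s window
    result = whisper_model.transcribe(audio_path, language=language, fp16=False)
    return result["text"]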
def init_whisper_model(whisper_model_type):
    print("Initializing whisper model")
    print(whisper_model_type)
    whisper_model = whisper.load_model(whisper_model_type)
    return whisper_model
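# Loading a checkpoint on every switch is slow; a small cache keyed by model
# type avoids reloading one that was already initialized (a sketch; the cache
# dict and function name are assumptions, nothing in the UI calls them yet):
_whisper_model_cache = {}

def init_whisper_model_cached(whisper_model_type):
    if whisper_model_type not in _whisper_model_cache:
        _whisper_model_cache[whisper_model_type] = whisper.load_model(whisper_model_type)
    return _whisper_model_cache[whisper_model_type]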
def synthesize_speech(text, language):
    audioobj = gTTS(text=text,
                    lang=language,
                    slow=False)
    audioobj.save("Temp.mp3")
    return "Temp.mp3"
block = gr.Blocks(css=css_file)

with block:
    # state variables
    language = gr.State("en")
    temperature = gr.State(0)
    whisper_model_type = gr.State("base")
    whisper_model = gr.State()
    api_key = gr.State()
    project_name = gr.State("voicebot")  # TODO add list of projects to organize saved data

    # state handling functions
    def change_language(choice):
        language = "en"
        if choice == "Polish":
            language = "pl"
            print("Switching to Polish")
        elif choice == "English":
            print("Switching to English")
        print("language:", language)
        return language

    def change_whisper_model(choice):
        whisper_model_type = choice
        print("Switching Whisper model")
        print(whisper_model_type)
        whisper_model = init_whisper_model(whisper_model_type)
        return [whisper_model_type, whisper_model]

    gr.Markdown(markdown)
    with gr.Tabs():
        with gr.Row():
            with gr.TabItem('Voicebot playground'):
                with gr.Accordion(label="Settings"):
                    gr.HTML("<p class=\"apikey\">Open AI API Key:</p>")
                    # API key textbox (password-style)
                    api_key = gr.Textbox(label="", elem_id="pw")
                    slider_temp = gr.Slider(minimum=0, maximum=2, step=0.2, label="ChatGPT temperature")
                    radio_lang = gr.Radio(["Polish", "English"], label="Language", info="If none selected, English is used")
                    #radio_asr_type = gr.Radio(["Local", "Cloud"], label="Select ASR type", info="Cloud models are faster and more accurate, but cost money")
                    #radio_cloud_asr = gr.Radio(["Whisper", "Google", "Azure"], label="Select Cloud ASR provider", info="You need to provide API keys for the specific service")
                    radio_whisper_model = gr.Radio(["tiny", "base", "small", "medium", "large"], label="Whisper ASR model (local)", info="Larger models are more accurate, but slower. Default - base")
                with gr.Box():
                    with gr.Row():
                        mic_recording = gr.Audio(source="microphone", type="filepath", label='Record your voice')
                        button_transcribe = gr.Button("Transcribe speech")
                        button_save_audio_and_trans = gr.Button("Save recording and meta")
                    out_asr = gr.Textbox(placeholder="ASR output",
                                         lines=2,
                                         max_lines=5,
                                         show_label=False)
                    button_prompt_gpt = gr.Button("Prompt ChatGPT")
                    out_gpt = gr.Textbox(placeholder="ChatGPT output",
                                         lines=4,
                                         max_lines=10,
                                         show_label=False)
                    button_synth_speech = gr.Button("Synthesize speech")
                    synth_recording = gr.Audio()

    # Event actions
    button_save_audio_and_trans.click(save_recording_and_meta, inputs=[project_name, mic_recording, out_asr, language], outputs=[])
    button_transcribe.click(transcribe, inputs=[mic_recording, language, whisper_model, whisper_model_type], outputs=out_asr)
    button_prompt_gpt.click(prompt_gpt, inputs=[out_asr, api_key, slider_temp], outputs=out_gpt)
    button_synth_speech.click(synthesize_speech, inputs=[out_gpt, language], outputs=synth_recording)
    radio_lang.change(fn=change_language, inputs=radio_lang, outputs=language)
    radio_whisper_model.change(fn=change_whisper_model, inputs=radio_whisper_model, outputs=[whisper_model_type, whisper_model])

block.launch()
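# On Spaces several users may interact at once; Gradio's built-in queue can
# serialize the heavier handlers (a sketch; would replace the launch() above):
# block.queue().launch()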