|
import gradio as gr |
|
import librosa |
|
from asr import transcribe |
|
from tts import synthesize, TTS_EXAMPLES |
|
|
|
# Per-task language tables: ALL_LANGUAGES[task][iso_code] -> language name.
# Each table is read from data/<task>/all_langs.tsv, one "<iso> <name>" row
# per line.
ALL_LANGUAGES = {}

for task in ["asr", "tts", "lid"]:
    ALL_LANGUAGES.setdefault(task, {})
    # NOTE(review): assumes the language tables are stored as UTF-8 -- confirm.
    with open(f"data/{task}/all_langs.tsv", encoding="utf-8") as f:
        for line in f:
            # Strip the trailing newline; otherwise every name ends in "\n"
            # and labels built as f"{iso}: {name}" can never equal a plain
            # string such as "eng: English".
            line = line.strip()
            if not line:
                # Tolerate blank rows (e.g. a trailing empty line) instead of
                # failing on tuple unpacking.
                continue
            # Split on the first run of whitespace so that both space- and
            # tab-separated rows are accepted; the name may contain spaces.
            iso, name = line.split(None, 1)
            ALL_LANGUAGES[task][iso] = name
|
|
|
|
|
def identify(microphone, file_upload):
    """Identify the language spoken in a recorded or uploaded audio clip.

    Args:
        microphone: Audio path from the microphone input, or None if unused.
        file_upload: Audio path from the upload input, or None if unused.

    Returns:
        A dict mapping "<iso>: <language name>" labels to scores, or an
        "ERROR: ..." string when neither input was provided. When both
        inputs are given, the microphone recording is used and the uploaded
        file is ignored.

    Raises:
        KeyError: If a predicted ISO code is missing from
            ALL_LANGUAGES["lid"].
    """
    LID_SAMPLING_RATE = 16_000

    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # The microphone takes precedence over the upload when both are present.
    audio_fp = microphone if microphone is not None else file_upload

    # Load the clip as mono audio resampled to LID_SAMPLING_RATE.
    inputs = librosa.load(audio_fp, sr=LID_SAMPLING_RATE, mono=True)[0]

    # NOTE(review): the scores below are hard-coded placeholders; `inputs` is
    # decoded but not yet passed to any model -- wire in the real classifier.
    raw_output = {"eng": 0.9, "hin": 0.04, "heb": 0.03, "ara": 0.02, "fra": 0.01}

    lid_names = ALL_LANGUAGES["lid"]
    return {f"{iso}: {lid_names[iso]}": score for iso, score in raw_output.items()}
|
|
|
|
|
# Top-level container that hosts the tabbed interfaces defined below.
demo = gr.Blocks()

# Speech-to-text tab: microphone recording, uploaded file and target language
# are passed to `transcribe`, whose result is shown as plain text.
asr_language_choices = [
    f"{iso}: {name}" for iso, name in ALL_LANGUAGES["asr"].items()
]
transcribe_inputs = [
    gr.Audio(source="microphone", type="filepath"),
    gr.Audio(source="upload", type="filepath"),
    gr.Dropdown(asr_language_choices, label="Language", value="eng: English"),
]

mms_transcribe = gr.Interface(
    fn=transcribe,
    inputs=transcribe_inputs,
    outputs="text",
    title="Speech-to-text",
    description="Transcribe audio!",
    allow_flagging="never",
)
|
|
|
# Text-to-speech tab: input text, target language and speed are passed to
# `synthesize`; the outputs are the generated audio and a text field for the
# filtered input.
tts_language_choices = [
    f"{iso}: {name}" for iso, name in ALL_LANGUAGES["tts"].items()
]
synthesize_inputs = [
    gr.Text(label="Input text"),
    gr.Dropdown(tts_language_choices, label="Language", value="eng: English"),
    gr.Slider(minimum=0.1, maximum=4.0, value=1.0, step=0.1, label="Speed"),
]
synthesize_outputs = [
    gr.Audio(label="Generated Audio", type="numpy"),
    gr.Text(label="Filtered text after removing OOVs"),
]

mms_synthesize = gr.Interface(
    fn=synthesize,
    inputs=synthesize_inputs,
    outputs=synthesize_outputs,
    examples=TTS_EXAMPLES,
    title="Text-to-speech",
    description="Generate audio!",
    allow_flagging="never",
)
|
|
|
# Language-identification tab: microphone recording and uploaded file are
# passed to `identify`; the result is rendered as a label showing up to ten
# classes.
mms_identify = gr.Interface(
    fn=identify,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Audio(source="upload", type="filepath"),
    ],
    outputs=gr.Label(num_top_classes=10),
    title="Language Identification",
    # Fixed user-facing typo: "Identity" -> "Identify".
    description="Identify the language of audio!",
    allow_flagging="never",
)
|
|
|
# Mount the three task interfaces as tabs inside the shared Blocks container,
# then start the app.
tab_interfaces = [mms_transcribe, mms_synthesize, mms_identify]
tab_titles = ["Speech-to-text", "Text-to-speech", "Language Identification"]

with demo:
    gr.TabbedInterface(tab_interfaces, tab_titles)

demo.launch()
|
|