Spaces:
Running
Running
| import gradio as gr | |
| import os | |
| import torch | |
| import librosa | |
| from glob import glob | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoModelForTokenClassification, TokenClassificationPipeline, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM | |
# --- Speech recognition: wav2vec2 CTC checkpoint and its processor ---------
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
processor_asr = Wav2Vec2Processor.from_pretrained(model_name)
model_asr = Wav2Vec2ForCTC.from_pretrained(model_name)

# --- Intent classification over the transcribed text ------------------------
model_name = "qanastek/XLMRoberta-Alexa-Intents-Classification"
tokenizer_intent = AutoTokenizer.from_pretrained(model_name)
model_intent = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_intent = TextClassificationPipeline(
    model=model_intent,
    tokenizer=tokenizer_intent,
)

# --- Language identification over the transcribed text ----------------------
model_name = "qanastek/51-languages-classifier"
tokenizer_langs = AutoTokenizer.from_pretrained(model_name)
model_langs = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_language = TextClassificationPipeline(
    model=model_langs,
    tokenizer=tokenizer_langs,
)

# --- Token-level entity (slot) tagging ---------------------------------------
model_name = "qanastek/XLMRoberta-Alexa-Intents-NER-NLU"
tokenizer_ner = AutoTokenizer.from_pretrained(model_name)
model_ner = AutoModelForTokenClassification.from_pretrained(model_name)
predict_ner = TokenClassificationPipeline(
    model=model_ner,
    tokenizer=tokenizer_ner,
)

# Every .wav file sitting next to the app is offered as a clickable example.
EXAMPLE_DIR = "./"
examples = glob(os.path.join(EXAMPLE_DIR, "*.wav"))
examples.sort()
def transcribe(audio_path):
    """Transcribe one audio file with the module-level wav2vec2 CTC model.

    The file is loaded through librosa with ``sr=16_000``, passed through
    ``processor_asr`` and ``model_asr`` with gradient tracking disabled, and
    decoded greedily (arg-max over the last logits dimension).

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        The first (and only) decoded transcription of the batch.
    """
    target_rate = 16_000
    # librosa also returns the sampling rate; it is fixed by `sr`, so drop it.
    waveform, _ = librosa.load(audio_path, sr=target_rate)
    features = processor_asr(
        waveform,
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )
    with torch.no_grad():
        outputs = model_asr(
            features.input_values,
            attention_mask=features.attention_mask,
        )
    best_ids = outputs.logits.argmax(dim=-1)
    transcripts = processor_asr.batch_decode(best_ids)
    return transcripts[0]
def getUniform(text):
    """Group token-level BIO predictions into consecutive entity spans.

    Args:
        text: Iterable of token predictions, each a mapping with at least an
            ``"entity"`` label (``"B-<type>"`` / ``"I-<type>"``) and a
            ``"word"`` string, in sentence order.

    Returns:
        A list of ``(entity_type, [word_piece, ...])`` tuples in order of
        appearance. A ``B-`` label always opens a new span; an ``I-`` label
        extends the previous span when the type matches, and otherwise opens
        a new span instead of failing. An empty input yields ``[]``.
    """
    # SentencePiece word-boundary marker (U+2581). Written as an escape so a
    # mis-decoded source file cannot silently turn it into another character.
    word_boundary = "\u2581"

    spans = []
    for token in text:
        label = token["entity"]
        # Strip only a leading BIO prefix, never a "B-"/"I-" inside the type.
        entity_type = label[2:] if label.startswith(("B-", "I-")) else label
        piece = token["word"].replace(word_boundary, "")

        continues_previous = (
            not label.startswith("B-")
            and spans
            and spans[-1][0] == entity_type
        )
        if continues_previous:
            spans[-1][1].append(piece)
        else:
            spans.append((entity_type, [piece]))
    return spans
def process(path):
    """Run the full speech-understanding chain on one audio file.

    The file is transcribed, then the transcript is fed to the intent
    classifier, the language classifier and the entity tagger in turn.

    Args:
        path: Path of the audio file, forwarded to ``transcribe``.

    Returns:
        A dict with the keys ``"text"``, ``"language"``, ``"intent_class"``
        and ``"named_entities"``.
    """
    transcript = transcribe(path)
    # Presumably patches a recurring ASR artifact where "a pizza" is fused.
    text = transcript.replace("apizza", "a pizza")

    # Each classifier returns a list of predictions; only the first is used.
    top_intent = classifier_intent(text)[0]
    top_language = classifier_language(text)[0]
    entities = getUniform(predict_ner(text))

    result = {"text": text}
    result["language"] = top_language["label"]
    result["intent_class"] = top_intent["label"]
    result["named_entities"] = entities
    return result
def predict(wav_file):
    """Gradio callback: return the result of ``process`` for ``wav_file``."""
    return process(wav_file)
# Web UI: a single audio input, handed to `predict` as a file path, and a
# single JSON panel showing the combined predictions.
# NOTE(review): the `gr.inputs` / `gr.outputs` namespaces appear to be
# deprecated in newer Gradio releases — confirm the pinned version before
# upgrading.
iface = gr.Interface(
    predict,
    title='Alexa NLU Clone',
    description='Upload your wav file to test the model',
    inputs=[
        gr.inputs.Audio(label='wav file', source='microphone', type='filepath')
    ],
    outputs=[
        gr.outputs.JSON(label='Slot Recognition + Intent Classification + Language Classification + ASR'),
    ],
    examples=examples,
    # The emoji (red heart, hugging face) are written as escapes because the
    # literal characters had been corrupted by a wrong-encoding round trip.
    article=(
        'Made with \u2764\ufe0f by '
        '<a href="https://www.linkedin.com/in/yanis-labrak-8a7412145/">Yanis Labrak</a>'
        ' thanks to \U0001F917'
    ),
)
iface.launch()