Spaces:

qanastek
/

Alexa-NLU-Clone

Running

App Files Files Community

Alexa-NLU-Clone / app.py

qanastek

Update app.py

44d6f69 over 3 years ago

raw

history blame

3.37 kB

	import gradio as gr

	import os
	import torch
	import librosa
	from glob import glob
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoModelForTokenClassification, TokenClassificationPipeline, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

	# Speech recognition: wav2vec2 processor + CTC model, used by transcribe().
	model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
	processor_asr = Wav2Vec2Processor.from_pretrained(model_name)
	model_asr = Wav2Vec2ForCTC.from_pretrained(model_name)


	def _load_pipeline(checkpoint, model_cls, pipeline_cls):
	    """Load the tokenizer and model for *checkpoint* and wrap them in a pipeline.

	    The tokenizer is loaded first, then the model via ``model_cls``, and
	    finally both are handed to ``pipeline_cls``. Returns the
	    ``(tokenizer, model, pipeline)`` triple so callers can keep all three.
	    """
	    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	    model = model_cls.from_pretrained(checkpoint)
	    return tokenizer, model, pipeline_cls(model=model, tokenizer=tokenizer)


	# Intent classifier (sequence classification over the transcript).
	model_name = 'qanastek/XLMRoberta-Alexa-Intents-Classification'
	tokenizer_intent, model_intent, classifier_intent = _load_pipeline(
	    model_name, AutoModelForSequenceClassification, TextClassificationPipeline
	)

	# Language classifier (sequence classification over the transcript).
	model_name = 'qanastek/51-languages-classifier'
	tokenizer_langs, model_langs, classifier_language = _load_pipeline(
	    model_name, AutoModelForSequenceClassification, TextClassificationPipeline
	)

	# Named-entity / slot extractor (token classification over the transcript).
	model_name = 'qanastek/XLMRoberta-Alexa-Intents-NER-NLU'
	tokenizer_ner, model_ner, predict_ner = _load_pipeline(
	    model_name, AutoModelForTokenClassification, TokenClassificationPipeline
	)

	# Directory scanned for bundled example recordings.
	EXAMPLE_DIR = './'

	# Every *.wav file directly inside EXAMPLE_DIR, in sorted order.
	_wav_pattern = os.path.join(EXAMPLE_DIR, '*.wav')
	examples = glob(_wav_pattern)
	examples.sort()

	def transcribe(audio_path):
	    """Return the ASR transcript of the audio file at *audio_path*.

	    The file is loaded through librosa with ``sr=16_000``, converted to
	    model inputs by ``processor_asr``, and passed through ``model_asr``
	    with gradients disabled. The argmax over the last logits dimension is
	    decoded with ``processor_asr.batch_decode`` and the first decoded
	    string is returned.
	    """
	    target_rate = 16_000

	    # The rate returned by librosa is not needed: it was requested explicitly.
	    waveform, _ = librosa.load(audio_path, sr=target_rate)

	    features = processor_asr(
	        waveform,
	        sampling_rate=target_rate,
	        return_tensors="pt",
	        padding=True,
	    )

	    with torch.no_grad():
	        logits = model_asr(
	            features.input_values,
	            attention_mask=features.attention_mask,
	        ).logits

	    token_ids = torch.argmax(logits, dim=-1)
	    transcripts = processor_asr.batch_decode(token_ids)

	    return transcripts[0]

	def getUniform(text):
	    """Group token-level NER predictions into ``(label, words)`` entities.

	    Args:
	        text: Iterable of token predictions. Each item is a mapping with an
	            ``"entity"`` tag (``B-<label>`` opens an entity, any other tag
	            such as ``I-<label>`` continues one) and a ``"word"`` piece.

	    Returns:
	        A list of ``(label, [word, ...])`` tuples in order of appearance.
	        The ``B-`` / ``I-`` prefixes are removed from the label and the
	        "▁" marker is removed from every word piece.

	    A continuation token is appended to the most recent entity when the
	    labels match. If there is no such entity (the sequence starts with an
	    ``I-`` tag, or the label differs) a new entity is started instead of
	    raising ``KeyError``.
	    """
	    groups = []

	    for token in text:
	        tag = token["entity"]
	        label = tag.replace("B-", "").replace("I-", "")
	        # presumably "▁" is the tokenizer's word-boundary marker; it is
	        # stripped so only the bare piece is kept.
	        word = token["word"].replace("▁", "")

	        continues_last = (
	            not tag.startswith("B-")
	            and bool(groups)
	            and groups[-1][0] == label
	        )

	        if continues_last:
	            # Extend the entity opened by the latest matching B- tag.
	            groups[-1][1].append(word)
	        else:
	            groups.append((label, [word]))

	    return groups


	def process(path):
	    """Transcribe the audio at *path* and analyse the resulting text.

	    The transcript is passed, in this order, to the intent classifier, the
	    language classifier and the NER pipeline. The top label of each
	    classifier and the grouped entities from ``getUniform`` are returned in
	    a dict with the keys ``"text"``, ``"language"``, ``"intent_class"`` and
	    ``"named_entities"``.
	    """
	    transcript = transcribe(path)
	    # NOTE(review): presumably patches a recurring ASR mis-segmentation of
	    # "a pizza" -- confirm this is still needed.
	    text = transcript.replace("apizza","a pizza")

	    intent_top = classifier_intent(text)[0]
	    language_top = classifier_language(text)[0]
	    raw_entities = predict_ner(text)

	    result = {"text": text}
	    result["language"] = language_top["label"]
	    result["intent_class"] = intent_top["label"]
	    result["named_entities"] = getUniform(raw_entities)
	    return result

	def predict(wav_file):
	    """Return the result of ``process`` for *wav_file*, unchanged."""
	    return process(wav_file)

	# iface = gr.Interface(fn=predict, inputs="text", outputs="text")

	# Input and output components for the demo.
	# NOTE(review): gr.inputs / gr.outputs look like the legacy Gradio
	# namespaces -- confirm they exist in the Gradio version this app runs on.
	_audio_input = gr.inputs.Audio(label='wav file', source='microphone', type='filepath')
	_json_output = gr.outputs.JSON(label='Slot Recognition + Intent Classification + Language Classification + ASR')

	# Wire predict() to the components and attach the page texts and examples.
	iface = gr.Interface(
	    fn=predict,
	    inputs=[_audio_input],
	    outputs=[_json_output],
	    title='Alexa NLU Clone',
	    description='Upload your wav file to test the model',
	    article='Made with ❤️ by <a href="https://www.linkedin.com/in/yanis-labrak-8a7412145/">Yanis Labrak</a> thanks to 🤗',
	    examples=examples,
	)

	iface.launch()