Spaces:

qanastek
/

Alexa-NLU-Clone

Running

File size: 3,249 Bytes

import gradio as gr

import torch
import librosa
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoModelForTokenClassification, TokenClassificationPipeline, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

# ASR
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
processor_asr = Wav2Vec2Processor.from_pretrained(model_name)
model_asr = Wav2Vec2ForCTC.from_pretrained(model_name)

# Classifier Intent
model_name = 'qanastek/XLMRoberta-Alexa-Intents-Classification'
tokenizer_intent = AutoTokenizer.from_pretrained(model_name)
model_intent = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_intent = TextClassificationPipeline(model=model_intent, tokenizer=tokenizer_intent)

# Classifier Language
model_name = 'qanastek/51-languages-classifier'
tokenizer_langs = AutoTokenizer.from_pretrained(model_name)
model_langs = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_language = TextClassificationPipeline(model=model_langs, tokenizer=tokenizer_langs)

# NER Extractor
model_name = 'qanastek/XLMRoberta-Alexa-Intents-NER-NLU'
tokenizer_ner = AutoTokenizer.from_pretrained(model_name)
model_ner = AutoModelForTokenClassification.from_pretrained(model_name)
predict_ner = TokenClassificationPipeline(model=model_ner, tokenizer=tokenizer_ner)

def transcribe(audio_path):
        
    speech_array, sampling_rate = librosa.load(audio_path, sr=16_000)

    inputs = processor_asr(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model_asr(inputs.input_values, attention_mask=inputs.attention_mask).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    
    return processor_asr.batch_decode(predicted_ids)[0]

def getUniform(text):

    idx = 0
    res = {}

    for t in text:

        raw = t["entity"].replace("B-","").replace("I-","")
        word = t["word"].replace("▁","")

        if "B-" in t["entity"]:
            res[f"{raw}|{idx}"] = [word]
            idx += 1
        else:
            res[f"{raw}|{idx}"].append(word)

    res = [(r.split("|")[0], res[r]) for r in res]

    return res


def process(path):
    
    text = transcribe(path)

    intent_class = classifier_intent(text)[0]["label"]
    language_class = classifier_language(text)[0]["label"]
    named_entities = getUniform(predict_ner(text))

    return {
        "text": text,
        "language": language_class,
        "intent_class": intent_class,
        "named_entities": named_entities,
    }

audio_paths = [
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/set-the-volume-to-low.wav",
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/tell-me-a-joke.wav",
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/tell me the artist of this song.wav",
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/order-a-pizza.wav",

    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/TTS_1/tell-me-a-good-joke.wav",
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/TTS_1/order me a pizza.wav",
    "/users/ylabrak/Alexa_NLU/Pipeline/wavs/TTS_1/tell-me-the-artist-of-this-song.wav",
]

def greet(name):
    return "Hello " + name + "!!"

iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()