# Hugging Face Space (status when captured: Running)
import gradio as gr
import librosa
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    TextClassificationPipeline,
    TokenClassificationPipeline,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2ProcessorWithLM,
)
# ASR: Wav2Vec2 CTC checkpoint and its matching processor, loaded once at
# import time so that every call to transcribe() reuses the same objects.
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
processor_asr = Wav2Vec2Processor.from_pretrained(model_name)
model_asr = Wav2Vec2ForCTC.from_pretrained(model_name)
# Intent classifier: sequence-classification checkpoint wrapped in a
# text-classification pipeline (tokenizer and model come from the same repo).
model_name = 'qanastek/XLMRoberta-Alexa-Intents-Classification'
tokenizer_intent = AutoTokenizer.from_pretrained(model_name)
model_intent = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_intent = TextClassificationPipeline(model=model_intent, tokenizer=tokenizer_intent)
# Language classifier: sequence-classification checkpoint wrapped in a
# text-classification pipeline (tokenizer and model come from the same repo).
model_name = 'qanastek/51-languages-classifier'
tokenizer_langs = AutoTokenizer.from_pretrained(model_name)
model_langs = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_language = TextClassificationPipeline(model=model_langs, tokenizer=tokenizer_langs)
# NER extractor: token-classification checkpoint wrapped in a pipeline.
# Its per-token output (dicts carrying "entity" and "word") is the input
# that getUniform() regroups.
model_name = 'qanastek/XLMRoberta-Alexa-Intents-NER-NLU'
tokenizer_ner = AutoTokenizer.from_pretrained(model_name)
model_ner = AutoModelForTokenClassification.from_pretrained(model_name)
predict_ner = TokenClassificationPipeline(model=model_ner, tokenizer=tokenizer_ner)
def transcribe(audio_path):
    """Transcribe the audio file at *audio_path* to text.

    The audio is loaded with librosa at 16 kHz, passed through the
    module-level Wav2Vec2 CTC model, and decoded greedily (arg-max over
    the vocabulary at every frame).

    Args:
        audio_path: Path of the audio file, as accepted by ``librosa.load``.

    Returns:
        The decoded transcription of the (single) input utterance.
    """
    sample_rate = 16_000  # same rate is used for loading and for the processor

    speech_array, _ = librosa.load(audio_path, sr=sample_rate)
    inputs = processor_asr(
        speech_array,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model_asr(
            inputs.input_values,
            attention_mask=inputs.attention_mask,
        ).logits

    predicted_ids = torch.argmax(logits, dim=-1)

    # batch_decode returns one string per batch item; the batch holds one clip.
    return processor_asr.batch_decode(predicted_ids)[0]
def getUniform(text):
    """Group BIO-tagged tokens into ``(label, [words])`` entities.

    Args:
        text: Iterable of dicts, each with an ``"entity"`` key (a tag such
            as ``"B-date"`` or ``"I-date"``) and a ``"word"`` key (the token
            text).

    Returns:
        A list of ``(label, words)`` tuples in order of appearance, where
        ``label`` is the tag without its ``B-``/``I-`` prefix and ``words``
        is the list of token strings belonging to that entity. An empty
        input yields an empty list.
    """
    # U+2581 is the SentencePiece word-boundary marker. Written as an escape
    # because the previous literal appears to have been a mis-encoded copy of
    # this character, which meant the marker was never stripped.
    word_boundary = "\u2581"

    groups = []
    for token in text:
        tag = token["entity"]
        label = tag[2:] if tag.startswith(("B-", "I-")) else tag
        word = token["word"].replace(word_boundary, "")

        # A "B-" tag always opens a new entity. Any other tag continues the
        # most recent entity only when the labels agree; a stray continuation
        # (no open entity, or a different label) opens its own entity instead
        # of raising KeyError.
        continues_previous = (
            not tag.startswith("B-")
            and bool(groups)
            and groups[-1][0] == label
        )
        if continues_previous:
            groups[-1][1].append(word)
        else:
            groups.append((label, [word]))

    return groups
def greet(name):
    """Return the greeting ``"Hello <name>!!"`` for the given *name*."""
    return f"Hello {name}!!"
# Build the text-in / text-out web UI and start serving it.
# NOTE(review): the interface is wired to greet(); transcribe(), getUniform()
# and the classifier pipelines defined in this file are not exposed here.
# Confirm whether that is intentional.
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()