import os
os.system("pip install git+https://github.com/openai/whisper.git")
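# Installing Whisper straight from GitHub at startup is a common Spaces pattern;
# pinning a specific commit would make the build reproducible.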
import gradio as gr
import whisper
from huggingface_hub import from_pretrained_keras
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.preprocessing import StandardScaler
import logging
import librosa
import numpy as np
import pickle
# Load the tokenizer and sentiment model for text classification
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
# Load the Whisper model for speech-to-text
model = whisper.load_model("small")
# Load the Keras model for audio emotion classification
reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')
# Load the fitted scaler and label encoder
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)
with open("encoder.pkl", "rb") as f:
    encoder = pickle.load(f)
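# scaler.pkl and encoder.pkl are the preprocessing artifacts fitted at training
# time: a StandardScaler for the audio features and, presumably, a OneHotEncoder
# for the emotion labels (inverse_transform maps predictions back to label names).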
def inference_audio(audio):
    # Load the recording, pad/trim it to 30 s, and compute the log-mel spectrogram
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # Detect the spoken language (decoding picks it up automatically)
    _, probs = model.detect_language(mel)
    # Decode without fp16 so the Space also runs on CPU
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text
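# Text-sentiment branch: transcribe the recording, then score the transcript
# with the RoBERTa sentiment pipeline.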
def inference_text(audio):
    text = inference_audio(audio)
    # Note: building the pipeline once at module level would avoid per-call overhead
    sentiment_task = pipeline("sentiment-analysis", model=model_nlp, tokenizer=tokenizer)
    res = sentiment_task(text)[0]
    return res["label"], res["score"]
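# Hand-crafted features expected by the emotion model; the order and count of
# features must match the training pipeline, since they determine the input size.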
def extract_features(data, sample_rate):
    result = np.array([])
    # Zero-crossing rate
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))
    # Chroma from the short-time Fourier transform
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft))
    # MFCCs
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc))
    # Root mean square energy
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms))
    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel))
    return result
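# Emotion branch: scale the features with the fitted StandardScaler, add a
# channel axis (the Keras model presumably expects Conv1D-style input), and
# decode the one-hot prediction back to an emotion label.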
def audio_emotions(audio):
    # gr.Audio is configured with type="filepath", so load the file with librosa
    data, sample_rate = librosa.load(audio)
    features_audio = extract_features(data, sample_rate)
    # The scaler expects a 2-D array of shape (n_samples, n_features)
    features_audio = np.reshape(features_audio, (1, -1))
    scaled_features = scaler.transform(features_audio)
    scaled_features = np.expand_dims(scaled_features, axis=2)
    prediction = reloaded_model.predict(scaled_features)
    # inverse_transform returns a 2-D array; return the single label
    y_pred = encoder.inverse_transform(prediction)
    return y_pred[0][0]
def main(audio):
    # Run both branches on the same recording
    label, score = inference_text(audio)
    emotion = audio_emotions(audio)
    return label, score, emotion
audio_input = gr.Audio(
    label="Input Audio",
    show_label=False,
    source="microphone",
    type="filepath",
)

app = gr.Interface(
    title="Sentiment Audio Analysis",
    fn=main,
    inputs=audio_input,
    outputs=["text", "text", "text"],
)
app.launch(debug=True, share=True)
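# debug=True surfaces tracebacks in the console; share=True requests a temporary
# public link (typically ignored on Spaces, which are already publicly hosted).
# Both are development conveniences and can be dropped once the Space is stable.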