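"""Sentiment Audio Analysis: a Gradio app that records microphone audio,
transcribes it with OpenAI Whisper, classifies the sentiment of the
transcript with a RoBERTa model, and predicts the speaker's emotion from
acoustic features with a Keras classifier trained on RAVDESS and CREMA-D.
"""
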
import os

# Whisper is not preinstalled, so install it at startup.
# (Pinning "openai-whisper" in requirements.txt would be the cleaner option.)
os.system("pip install git+https://github.com/openai/whisper.git")
import gradio as gr
import whisper
from huggingface_hub import from_pretrained_keras
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.preprocessing import StandardScaler
import logging
import librosa
import numpy as np
import pickle



# Load the tokenizer and RoBERTa model for text sentiment classification,
# and build the pipeline once at startup instead of on every request
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
sentiment_task = pipeline("sentiment-analysis", model=model_nlp, tokenizer=tokenizer)


# Load the Whisper speech-to-text model ("small" is a multilingual checkpoint)
model = whisper.load_model("small")

# Load the Keras audio emotion classifier from the Hugging Face Hub
reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')

# Load the fitted feature scaler and label encoder
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

with open("encoder.pkl", "rb") as f:
    encoder = pickle.load(f)
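
# Assumption: scaler.pkl and encoder.pkl hold the StandardScaler and label
# encoder fitted when the emotion classifier was trained, so transform()
# and inverse_transform() match the training-time feature space and labels.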



def inference_audio(audio):
    """Transcribe an audio file (given by path) with Whisper."""
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Compute the log-mel spectrogram on the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language (the result is not used further here)
    _, probs = model.detect_language(mel)

    # Decode without fp16 so the app also runs on CPU
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)

    return result.text
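
# Usage sketch (hypothetical file path):
#   inference_audio("recording.wav")  # -> transcript string, e.g. "hello there"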

def inference_text(audio):
    """Transcribe the audio, then run sentiment analysis on the transcript."""
    text = inference_audio(audio)
    res = sentiment_task(text)[0]
    return res['label'], res['score']

    
def extract_features(data, sample_rate):
    """Build the acoustic feature vector the emotion classifier was trained on."""
    result = np.array([])

    # Zero-crossing rate
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))

    # Chroma features from the short-time Fourier transform
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft))

    # MFCCs
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc))

    # Root mean square energy
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms))

    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel))

    return result
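
# With librosa's defaults this yields 1 (ZCR) + 12 (chroma) + 20 (MFCC)
# + 1 (RMS) + 128 (mel) = 162 features per clip, which is assumed to be
# the input size the classifier expects.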

def audio_emotions(audio):
    """Predict an emotion label from the acoustic features of an audio file."""
    # gr.Audio is configured with type="filepath", so load the file here
    data, sample_rate = librosa.load(audio)
    features_audio = extract_features(data, sample_rate)
    # The scaler expects a 2-D array of shape (n_samples, n_features)
    features_audio = np.array(features_audio).reshape(1, -1)
    scaled_features = scaler.transform(features_audio)
    # The Keras model expects shape (batch, n_features, 1)
    scaled_features = np.expand_dims(scaled_features, axis=2)
    prediction = reloaded_model.predict(scaled_features)
    y_pred = encoder.inverse_transform(prediction)
    return y_pred[0][0]

def main(audio):
    """Run both pipelines on one recording: sentiment of the transcript
    plus the emotion predicted from the audio itself."""
    label, score = inference_text(audio)
    emotion = audio_emotions(audio)
    return label, score, emotion
    

audio_input = gr.Audio(
    label="Input Audio",
    source="microphone",
    type="filepath",
)

app = gr.Interface(
    title="Sentiment Audio Analysis",
    fn=main,
    inputs=audio_input,
    outputs=["text", "text", "text"],
)



app.launch(debug=True, share=True)