cscan jmparejaz committed on
Commit 821363b · 0 Parent(s)

Duplicate from jmparejaz/Audio_to_text_classification


Co-authored-by: Jose Mario Pareja <[email protected]>

Files changed (6)
  1. .gitattributes +33 -0
  2. README.md +14 -0
  3. app.py +107 -0
  4. encoder.pkl +3 -0
  5. requirements.txt +3 -0
  6. scaler.pkl +3 -0
.gitattributes ADDED
@@ -0,0 +1,33 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
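
The patterns above route binary artifacts through Git LFS, which is why encoder.pkl and scaler.pkl further down appear as LFS pointer files rather than raw pickles. A minimal sketch for checking whether a local copy of such a file is still an unfetched pointer (pointer files begin with the spec line shown in those sections):

```python
# Minimal sketch: a Git LFS pointer file starts with this spec line.
def is_lfs_pointer(path: str) -> bool:
    with open(path, "rb") as f:
        head = f.read(100)
    return head.startswith(b"version https://git-lfs.github.com/spec/v1")

# True if only the pointer was checked out; False once the real
# binary has been fetched (e.g. via `git lfs pull`).
print(is_lfs_pointer("encoder.pkl"))
```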
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Sound Emotion Recognition
+ emoji: 💻
+ colorFrom: pink
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 3.6
+ app_file: app.py
+ pinned: false
+ license: mit
+ duplicated_from: jmparejaz/Audio_to_text_classification
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
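
The YAML front matter above is the Spaces configuration the Hub parses: the app title, the Gradio SDK and its version, and app.py as the entry point. A minimal sketch for reading it locally, assuming PyYAML is available:

```python
# Minimal sketch, assuming PyYAML: parse the Space config out of README.md.
import yaml

with open("README.md") as f:
    # The front matter sits between the first two "---" delimiters.
    front_matter = yaml.safe_load(f.read().split("---")[1])

print(front_matter["sdk"], front_matter["sdk_version"])  # gradio 3.6
print(front_matter["app_file"])                          # app.py
```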
app.py ADDED
@@ -0,0 +1,107 @@
+ import os
+ os.system("pip install git+https://github.com/openai/whisper.git")
+ import gradio as gr
+ import whisper
+ from huggingface_hub import from_pretrained_keras
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from transformers import pipeline
+ from sklearn.preprocessing import StandardScaler
+ import logging
+ import librosa
+ import numpy as np
+ import pickle
+
+
+
+ # Tokenizer and NLP model for text sentiment classification
+ tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+ model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
+
+
+ # Whisper model for audio/speech transcription
+ model = whisper.load_model("small")
+
+ # Keras model for audio emotion classification
+ reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')
+
+ # Fitted scaler and label encoder for the emotion model
+ with open("scaler.pkl", "rb") as f:
+     scaler = pickle.load(f)
+
+ with open("encoder.pkl", "rb") as f:
+     encoder = pickle.load(f)
+
+
+
+ def inference_audio(audio):
+     audio = whisper.load_audio(audio)
+     audio = whisper.pad_or_trim(audio)
+
+     mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+     _, probs = model.detect_language(mel)
+
+     options = whisper.DecodingOptions(fp16=False)
+     result = whisper.decode(model, mel, options)
+
+     return result.text
+
+ def inference_text(audio):
+     text = inference_audio(audio)
+
+     sentiment_task = pipeline("sentiment-analysis", model=model_nlp, tokenizer=tokenizer)
+     res = sentiment_task(text)[0]
+
+     return text, res['label'], res['score']
+
+
+ def extract_features(data, sample_rate):
+     # Zero-crossing rate
+     result = np.array([])
+     zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
+     result = np.hstack((result, zcr))  # stacking horizontally
+
+     # Chroma STFT
+     stft = np.abs(librosa.stft(data))
+     chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
+     result = np.hstack((result, chroma_stft))  # stacking horizontally
+
+     # MFCC
+     mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
+     result = np.hstack((result, mfcc))  # stacking horizontally
+
+     # Root mean square energy
+     rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
+     result = np.hstack((result, rms))  # stacking horizontally
+
+     # Mel spectrogram
+     mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
+     result = np.hstack((result, mel))  # stacking horizontally
+
+     return result
+ """
+ def audio_emotions(audio):
+     sr, data = audio
+     features_audio = extract_features(data, sr)
+     features_audio = np.array(features_audio).reshape(1, -1)  # scaler expects 2D input
+     scaled_features = scaler.transform(features_audio)
+     scaled_features = np.expand_dims(scaled_features, axis=2)
+     prediction = reloaded_model.predict(scaled_features)
+     y_pred = encoder.inverse_transform(prediction)
+     return y_pred
+ """
+ def main(audio):
+     r1, r2, r3 = inference_text(audio)
+     # r3 = audio_emotions(audio)
+     return r1, r2, r3
+
+
+ audio = gr.Audio(
+     label="Input Audio",
+     show_label=False,
+     source="microphone",
+     type="filepath"
+ )
+
+
+ app = gr.Interface(title="Sentiment Audio Analysis", fn=main, inputs=audio, outputs=["text", "text", "text"]).launch(debug=True)
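
Since audio_emotions is commented out, the Interface currently returns the Whisper transcription plus the sentiment label and score from the RoBERTa model. A minimal local smoke test, assuming the models above have loaded and a sample recording exists at the hypothetical path sample.wav:

```python
# Minimal sketch: exercise the pipeline outside Gradio.
# "sample.wav" is a hypothetical local recording, not part of this repo.
text, label, score = inference_text("sample.wav")
print(f"transcript: {text}")
print(f"sentiment:  {label} ({score:.3f})")
```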
encoder.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c6a3ef0f2c45329f271e0c533784148f2e9fbe6ed814f22933699660d8e5a14
+ size 430
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ librosa
+ tensorflow
+ scikit-learn==1.0.2
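
Note that requirements.txt only pins librosa, tensorflow, and scikit-learn; gradio is provided by the Space's SDK setting, and app.py installs whisper at runtime via pip. A minimal sketch to verify that a local environment satisfies everything app.py imports:

```python
# Minimal sketch: check that every package app.py imports is importable.
import importlib

for pkg in ("librosa", "tensorflow", "sklearn", "gradio", "numpy",
            "transformers", "huggingface_hub", "whisper"):
    try:
        importlib.import_module(pkg)
        print(f"{pkg}: OK")
    except ImportError as exc:
        print(f"{pkg}: missing ({exc})")
```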
scaler.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aac2a609b867b59002822ad08d86679c11338c24776e83ad2d2dd51eaba9cf53
+ size 4346