from tensorflow import keras
import os
import soundfile as sf
import numpy as np
import librosa
import gradio as gr
import pandas as pd
import plotly.express as px
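
# Pretrained Keras classifier; the label order below is assumed to match the
# class order used when the model was trained.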
model = keras.models.load_model('emotion.h5')
labels = ['Angry', 'Disgusted', 'Fearful', 'Happy', 'Neutral', 'Sad', 'Surprised']
def predict(audio):
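    """Split the recording into 3-second segments, extract MFCC features from
    each, and plot the predicted emotion per segment over time."""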
    wave, sr = librosa.load(audio, sr=None)
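    # Work in 3-second chunks so each chunk gets its own emotion prediction.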
    segment_dur_secs = 3  
    segment_length = sr * segment_dur_secs
    num_sections = int(np.ceil(len(wave) / segment_length))
    split = []
    paths = []
    for i in range(num_sections):
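        # The last segment may be shorter than segment_length; slicing past
        # the end of the array just returns the remaining samples.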
        t = wave[i * segment_length: (i + 1) * segment_length]
        split.append(t)
        
    out_dir = 'audio_data/splits/'
    os.makedirs(out_dir, exist_ok=True)
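    # Write each segment to disk so it can be reloaded below at a fixed rate.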
    for i in range(num_sections):
        recording_name = os.path.splitext(os.path.basename(audio))[0]
        out_file = f"{recording_name}_{i}.wav"
        sf.write(os.path.join(out_dir, out_file), split[i], sr)
        paths.append(os.path.join(out_dir, out_file))

    predicted_features = pd.DataFrame(columns=['features'])
    for counter, path in enumerate(paths):
        # Reload at a fixed 44.1 kHz with a 0.5 s offset and 2.5 s window,
        # presumably matching the preprocessing used at training time.
        X, sample_rate = librosa.load(path, duration=2.5, sr=44100, offset=0.5)
        # Average across the 13 coefficients (axis=0), leaving one value per
        # frame; the vector length must match the model's expected input size.
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13),
                        axis=0)
        predicted_features.loc[counter] = [mfccs]
    # Expand each per-segment vector into one column per frame; segments
    # shorter than the full window leave NaNs and are dropped.
    predicted_features = pd.DataFrame(predicted_features['features'].values.tolist())
    predicted_features.dropna(inplace=True)
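    # One feature row per segment -> one emotion prediction per segment.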
    preds = model.predict(predicted_features)

    preds = preds.argmax(axis=1)
    df_preds = pd.DataFrame(preds, columns=['prediction'])
    df_preds['emotion'] = [labels[int(i)] for i in df_preds['prediction']]
    df_preds = df_preds.reset_index()
    fig = px.line(df_preds, x="index", y="emotion", title='How emotion changes over the speech')
    fig.update_xaxes(title='3-second intervals of speech')
    return fig
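
# Quick local check (assumes the bundled sample files are present):
#     predict('audio_samples/1.mp3').show()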

outputs = gr.Plot()
title = "Emotion recognition"
description = "Shows how the speaker's emotion changes over the course of the speech"

# cache_examples=True runs predict() on each example once at startup and caches
# the resulting figures, so clicking an example responds instantly.
infr = gr.Interface(fn=predict,
                    inputs=gr.Audio(type="filepath"),
                    examples=['audio_samples/1.mp3', 'audio_samples/2.mp3',
                              'audio_samples/3.mp3', 'audio_samples/4.mp3'],
                    cache_examples=True,
                    outputs=outputs,
                    title=title,
                    description=description)
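# Start the local Gradio server (share=True would create a public link).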
infr.launch()