Cyr-CK committed
Commit aaa3b8b · Parent(s): f9bbbb3

Added real-time emotion detection over an uploaded audio file

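In short, the commit resamples the uploaded file to 16 kHz and runs the existing wav2vec2 emotion classifier over it with a sliding window, producing one probability per label and per window. A condensed sketch of that loop as added to views/application.py below; the file path is illustrative:

import librosa
from src.model.predict import predict_emotion

RATE = 16000                  # resampling rate used throughout the app
WINDOW_S, HOP_S = 1.0, 0.5    # 1 s analysis window, 0.5 s hop

audio, sr = librosa.load("audios/audio.wav", sr=RATE)  # illustrative path
window, hop = int(WINDOW_S * sr), int(HOP_S * sr)
for start in range(0, len(audio), hop):
    chunk = audio[start:start + window]
    if len(chunk) < window:
        break
    # maps "joie" / "colère" / "neutre" to softmax probabilities
    probs = predict_emotion(chunk, output_probs=True, sampling_rate=RATE)
    print(f"{start / sr:.1f}s", probs)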
.gitignore CHANGED
@@ -178,6 +178,7 @@ dataset/
 old/
 *.wav
 data/*
+*.pth
 
 # Mac
 .DS_Store
app.py CHANGED
@@ -3,6 +3,9 @@ from streamlit_option_menu import option_menu
 from views.application import application
 from views.about import about
 
+if "model_loaded" not in st.session_state:
+    st.session_state.model_loaded = None
+
 # Set the logo
 st.sidebar.image("img/logo.png", use_container_width=True)
 
src/model/predict.py CHANGED
@@ -1,35 +1,48 @@
+import os
 import torch
 from transformers import Wav2Vec2Processor
-from model import Wav2Vec2EmotionClassifier
+from src.model.emotion_classifier import Wav2Vec2EmotionClassifier
 import librosa
+import streamlit as st
+
+if "model_loaded" not in st.session_state:
+    st.session_state.model_loaded = None
 
-# Charger le modèle et le processeur
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
-model = Wav2Vec2EmotionClassifier()
-model.load_state_dict(torch.load("wav2vec2_emotion.pth"))
-model.to(device)
-model.eval()
+# Charger le modèle et le processeur
+if st.session_state.model_loaded is None:
+    st.session_state.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53-french")
+    st.session_state.model = Wav2Vec2EmotionClassifier()
+    st.session_state.model.load_state_dict(torch.load(os.path.join("src", "model", "wav2vec2_emotion.pth"), map_location=torch.device('cpu')), strict=False)
+    st.session_state.model_loaded = True
+
+if st.session_state.model_loaded:
+    processor = st.session_state.processor
+    model = st.session_state.model
+    model.to(device)
+    model.eval()
 
 emotion_labels = ["joie", "colère", "neutre"]
 
 def predict_emotion(audio_path, output_probs=False, sampling_rate=16000):
-    waveform, _ = librosa.load(audio_path, sr=sampling_rate)
-    input_values = processor(waveform, return_tensors="pt", sampling_rate=sampling_rate).input_values
+    # waveform, _ = librosa.load(audio_path, sr=sampling_rate)
+    input_values = processor(audio_path, return_tensors="pt", sampling_rate=sampling_rate).input_values
     input_values = input_values.to(device)
 
     with torch.no_grad():
         outputs = model(input_values)
 
     if output_probs:
         # Appliquer softmax pour obtenir des probabilités
-        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
+        probabilities = torch.nn.functional.softmax(outputs, dim=-1)
 
         # Convertir en numpy array et prendre le premier (et seul) élément
         probabilities = probabilities[0].detach().cpu().numpy()
 
         # Créer un dictionnaire associant chaque émotion à sa probabilité
         emotion_probabilities = {emotion: prob for emotion, prob in zip(emotion_labels, probabilities)}
+        # emotion_probabilities = {"emotions": [emotion for emotion in emotion_labels],
+        #                          "probabilities": [prob for prob in probabilities]}
         return emotion_probabilities
     else:
         # Obtenir l'émotion la plus probable (i.e. la prédiction)
@@ -38,6 +51,6 @@ def predict_emotion(audio_path, output_probs=False, sampling_rate=16000):
 
 
 # Exemple d'utilisation
-audio_test = "data/n1ac.wav"
-emotion = predict_emotion(audio_test)
-print(f"Émotion détectée : {emotion}")
+# audio_test = "data/n1ac.wav"
+# emotion = predict_emotion(audio_test)
+# print(f"Émotion détectée : {emotion}")
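With the librosa.load call commented out, predict_emotion now expects a raw waveform rather than a file path, so callers load the audio themselves. A minimal usage sketch, assuming the checkpoint at src/model/wav2vec2_emotion.pth exists and the session state has been initialised as above:

import librosa
from src.model.predict import predict_emotion

waveform, _ = librosa.load("data/n1ac.wav", sr=16000)  # example file from the old code
# returns a dict mapping each label in emotion_labels to its probability
probs = predict_emotion(waveform, output_probs=True, sampling_rate=16000)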
src/model/transcriber.py CHANGED
@@ -1,13 +1,16 @@
+import os
 import torch
 from transformers import Wav2Vec2Processor
 from src.model.emotion_classifier import Wav2Vec2EmotionClassifier
 import librosa
 
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
-# model = Wav2Vec2EmotionClassifier()
-# model.load_state_dict(torch.load("wav2vec2_emotion.pth"))
-# model.to(device)
+# Charger le modèle et le processeur
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# if st.
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53-french")
+model = Wav2Vec2EmotionClassifier()
+model.load_state_dict(torch.load(os.path.join("src", "model", "wav2vec2_emotion.pth"), map_location=torch.device('cpu')), strict=False)
+model.to(device)
 
 
 def transcribe_audio(audio, sampling_rate=16000):
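transcriber.py now loads the classifier eagerly at import time, without the session-state cache used in predict.py. A rough usage sketch; the return value is assumed to be the transcription text, which this hunk does not show:

import librosa
from src.model.transcriber import transcribe_audio

audio, _ = librosa.load("audios/audio.wav", sr=16000)        # illustrative path
transcription = transcribe_audio(audio, sampling_rate=16000)  # assumed to return a string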
views/application.py CHANGED
@@ -1,11 +1,20 @@
 import streamlit as st
+import pandas as pd
 from st_audiorec import st_audiorec
 import datetime
 import os
+import matplotlib.pyplot as plt
+import librosa
 from src.model.transcriber import transcribe_audio
+from src.model.predict import predict_emotion
+
 
 DIRECTORY = "audios"
 FILE_NAME = "audio.wav"
+CHUNK = 1024
+# FORMAT = pyaudio.paInt16
+CHANNELS = 1
+RATE = 16000
 
 def application():
     st.title("SISE ultimate challenge")
@@ -25,12 +34,141 @@ def application():
         st.header("⬆️ Upload Audio Record")
         st.write("Here you can upload a pre-recorded audio.")
         audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "ogg"])
+
         if audio_file is not None:
 
-            with open(f"{DIRECTORY}/{FILE_NAME}", "wb") as f:
+            with open(os.path.join(DIRECTORY, FILE_NAME), "wb") as f:
                 f.write(audio_file.getbuffer())
             st.success(f"Saved file: {FILE_NAME}")
 
+
+        start_inference = st.button("Start emotion recogniton", "inf_on_upl_btn")
+        emotion_labels = ["joie", "colère", "neutre"]
+        colors = ['#f6d60a', '#f71c1c', '#cac8c8']
+
+        if start_inference:
+            # Configuration Streamlit
+            with st.spinner("Real-time emotion analysis..."):
+                # uploaded_file = st.file_uploader("Choisissez un fichier audio", type=["wav", "mp3"])
+
+                if audio_file is not None:
+                    # Charger et rééchantillonner l'audio
+                    audio, sr = librosa.load(audio_file, sr=RATE)
+                    # chunk = audio_file
+
+                    # Paramètres de la fenêtre glissante
+                    window_size = 1  # en secondes
+                    hop_length = 0.5  # en secondes
+
+                    # Créer un graphique en temps réel
+                    fig, ax = plt.subplots()
+                    lines = [ax.plot([], [], label=emotion)[0] for emotion in emotion_labels]
+                    ax.set_ylim(0, 1)
+                    ax.set_xlim(0, len(audio) / sr)
+                    ax.set_xlabel("Temps (s)")
+                    ax.set_ylabel("Probabilité")
+                    ax.legend()
+
+                    chart = st.pyplot(fig)
+
+                    scores = [[], [], []]  # 3 émotions pour l'instant
+
+                    # Traitement par fenêtre glissante
+                    for i in range(0, len(audio), int(hop_length * sr)):
+                        chunk = audio[i:i + int(window_size * sr)]
+                        if len(chunk) < int(window_size * sr):
+                            break
+
+                        emotion_scores = predict_emotion(chunk, output_probs=True, sampling_rate=RATE)
+
+                        # Mettre à jour le graphique
+                        for emotion, line in zip(emotion_labels, lines):
+                            xdata = list(line.get_xdata())
+                            ydata = list(line.get_ydata())
+                            xdata.append(i / sr)
+                            ydata.append(emotion_scores[emotion])
+                            scores[list(emotion_scores).index(emotion)].append(emotion_scores[emotion])
+                            line.set_data(xdata, ydata)
+
+                        ax.relim()
+                        ax.autoscale_view()
+                        chart.pyplot(fig, use_container_width=True)
+
+                    # Prepare the styling
+                    st.markdown("""
+                    <style>
+                    .colored-box {
+                        padding: 10px;
+                        border-radius: 5px;
+                        color: white;
+                        font-weight: bold;
+                        text-align: center;
+                    }
+                    </style>
+                    """, unsafe_allow_html=True)
+
+                    # Dynamically create the specified number of columns
+                    columns = st.columns(len(emotion_scores))
+
+                    # emotion_scores_mean = [sum(sublist) / len(sublist) for sublist in scores]
+                    emotion_scores_mean = {emotion: sum(sublist) / len(sublist) for emotion, sublist in zip(emotion_labels, scores)}
+                    max_emo = max(emotion_scores_mean)
+                    emotion_scores_sorted = dict(sorted(emotion_scores_mean.items(), key=lambda x: x[1], reverse=True))
+                    colors_sorted = [colors[list(emotion_scores_mean.keys()).index(key)] for key in list(emotion_scores_sorted.keys())]
+
+                    # Add content to each column
+                    for i, (col, emotion) in enumerate(zip(columns, emotion_scores_sorted)):
+                        color = colors_sorted[i % len(colors_sorted)]  # Cycle through colors if more columns than colors
+                        col.markdown(f"""
+                        <div class="colored-box" style="background-color: {color};">
+                            {emotion} : {100*emotion_scores_sorted[emotion]:.2f} %
+                        </div>
+                        """, unsafe_allow_html=True)
+
+
+                    st.success("Analyse terminée !")
+                else:
+                    st.warning("You need to load an audio file !")
+
+        st.subheader("Feedback")
+
+        # Initialisation du fichier CSV
+        csv_file = os.path.join("src", "predictions", "feedback.csv")
+
+        # Vérifier si le fichier CSV existe, sinon le créer avec des colonnes appropriées
+        if not os.path.exists(csv_file):
+            df = pd.DataFrame(columns=["filepath", "prediction", "feedback"])
+            df.to_csv(csv_file, index=False)
+
+        # Charger les données existantes du CSV
+        df = pd.read_csv(csv_file)
+
+        with st.form("feedback_form"):
+            st.write("What should have been the correct prediction ? (*Choose the same emotion if the prediction was correct*).")
+            feedback = st.selectbox("Your answer :", ['Sadness', 'Anger', 'Disgust', 'Fear', 'Surprise', 'Joy', 'Neutral'])
+            submit_button = st.form_submit_button("Submit")
+            st.write("En cliquant sur ce bouton, vous acceptez que votre audio soit sauvegardé dans notre base de données.")
+
+            if submit_button:
+                # Ajouter le feedback au DataFrame
+                new_entry = {"filepath": audio_file.name, "prediction": max_emo, "feedback": feedback}
+                df = df.append(new_entry, ignore_index=True)
+
+                # Sauvegarder les données mises à jour dans le fichier CSV
+                df.to_csv(csv_file, index=False)
+
+                # Sauvegarder le fichier audio
+                with open(os.path.join("src", "predictions", "data"), "wb") as f:
+                    f.write(audio_file.getbuffer())
+
+                # Confirmation pour l'utilisateur
+                st.success("Merci pour votre retour ! Vos données ont été sauvegardées.")
+
+
+
     with tab2:
         st.header("🔈 Realtime Audio Record")
         st.write("Here you can record an audio.")
@@ -52,17 +190,16 @@ def application():
         ############################# A décommenté quand ce sera débogué
         if st.button("Transcribe", key="transcribe-button"):
             # # Fonction pour transcrire l'audio
-            # transcription = transcribe_audio(st.audio)
+            # transcription = transcribe_audio(st.audio)
 
             # # Charger et transcrire l'audio
             # # audio, rate = load_audio(audio_file_path) # (re)chargement de l'audio si nécessaire
-            # transcription = transcribe_audio(audio_file, sampling_rate=16000)
+            # transcription = transcribe_audio(audio_file, sampling_rate=16000)
 
             # # Afficher la transcription
-            # st.write("Transcription :", transcription)
+            # st.write("Transcription :", transcription)
 
-
-            st.success("Audio registered successfully.")
+            st.success("Audio registered successfully.")
             # if save:
             #     file_path = "transcript.txt"
 
views/real_time.py CHANGED
@@ -86,64 +86,51 @@ if start_button:
     ### Real time prediction for uploaded audio file
     ###############################
     # Charger le modèle wav2vec et le processeur
-    model = Wav2Vec2ForSequenceClassification.from_pretrained("your_emotion_model_path")
-    processor = Wav2Vec2Processor.from_pretrained("your_emotion_model_path")
 
-    # Définir les émotions
-    emotions = ["neutre", "joie", "colère", "tristesse"]  # Ajustez selon votre modèle
+    # # Configuration Streamlit
+    # st.title("Analyse des émotions en temps réel")
+    # uploaded_file = st.file_uploader("Choisissez un fichier audio", type=["wav", "mp3"])
 
-    # Fonction pour prédire l'émotion
-    # def predict_emotion(audio_chunk):
-    #     inputs = processor(audio_chunk, sampling_rate=16000, return_tensors="pt", padding=True)
-    #     with torch.no_grad():
-    #         logits = model(**inputs).logits
-    #     scores = torch.softmax(logits, dim=1).squeeze().tolist()
-    #     return dict(zip(emotions, scores))
-
-    # Configuration Streamlit
-    st.title("Analyse des émotions en temps réel")
-    uploaded_file = st.file_uploader("Choisissez un fichier audio", type=["wav", "mp3"])
-
-    if uploaded_file is not None:
-        # Charger et rééchantillonner l'audio
-        audio, sr = librosa.load(uploaded_file, sr=16000)
+    # if uploaded_file is not None:
+    #     # Charger et rééchantillonner l'audio
+    #     audio, sr = librosa.load(uploaded_file, sr=16000)
 
-        # Paramètres de la fenêtre glissante
-        window_size = 1  # en secondes
-        hop_length = 0.5  # en secondes
+    #     # Paramètres de la fenêtre glissante
+    #     window_size = 1  # en secondes
+    #     hop_length = 0.5  # en secondes
 
-        # Créer un graphique en temps réel
-        fig, ax = plt.subplots()
-        lines = [ax.plot([], [], label=emotion)[0] for emotion in emotions]
-        ax.set_ylim(0, 1)
-        ax.set_xlim(0, len(audio) / sr)
-        ax.set_xlabel("Temps (s)")
-        ax.set_ylabel("Probabilité")
-        ax.legend()
+    #     # Créer un graphique en temps réel
+    #     fig, ax = plt.subplots()
+    #     lines = [ax.plot([], [], label=emotion)[0] for emotion in emotions]
+    #     ax.set_ylim(0, 1)
+    #     ax.set_xlim(0, len(audio) / sr)
+    #     ax.set_xlabel("Temps (s)")
+    #     ax.set_ylabel("Probabilité")
+    #     ax.legend()
 
-        chart = st.pyplot(fig)
+    #     chart = st.pyplot(fig)
 
-        # Traitement par fenêtre glissante
-        for i in range(0, len(audio), int(hop_length * sr)):
-            chunk = audio[i:i + int(window_size * sr)]
-            if len(chunk) < int(window_size * sr):
-                break
+    #     # Traitement par fenêtre glissante
+    #     for i in range(0, len(audio), int(hop_length * sr)):
+    #         chunk = audio[i:i + int(window_size * sr)]
+    #         if len(chunk) < int(window_size * sr):
+    #             break
 
-            emotion_scores = predict_emotion(chunk, output_probs=False, sampling_rate=RATE)
+    #         emotion_scores = predict_emotion(chunk, output_probs=False, sampling_rate=RATE)
 
-            # Mettre à jour le graphique
-            for emotion, line in zip(emotions, lines):
-                xdata = line.get_xdata().tolist()
-                ydata = line.get_ydata().tolist()
-                xdata.append(i / sr)
-                ydata.append(emotion_scores[emotion])
-                line.set_data(xdata, ydata)
+    #         # Mettre à jour le graphique
+    #         for emotion, line in zip(emotions, lines):
+    #             xdata = line.get_xdata().tolist()
+    #             ydata = line.get_ydata().tolist()
+    #             xdata.append(i / sr)
+    #             ydata.append(emotion_scores[emotion])
+    #             line.set_data(xdata, ydata)
 
-            ax.relim()
-            ax.autoscale_view()
-            chart.pyplot(fig)
+    #         ax.relim()
+    #         ax.autoscale_view()
+    #         chart.pyplot(fig)
 
-        st.success("Analyse terminée !")
+    #     st.success("Analyse terminée !")
 
 
 