Marina Kpamegan committed on
Commit cfd1552 · 1 Parent(s): 06c46fb

predict file rebased

.gitignore CHANGED
@@ -183,4 +183,4 @@ data/*
 
 # Mac
 .DS_Store
-*.pth
+.idea
requirements.txt CHANGED
@@ -15,4 +15,5 @@ scikit-learn
 huggingface
 huggingface_hub
 pyaudio
-streamlit_audiorec
+streamlit_audiorec
+dotenv
src/model/predict.py DELETED
@@ -1,56 +0,0 @@
-import os
-import torch
-from transformers import Wav2Vec2Processor
-from src.model.emotion_classifier import Wav2Vec2EmotionClassifier
-import librosa
-import streamlit as st
-
-if "model_loaded" not in st.session_state:
-    st.session_state.model_loaded = None
-
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Load the model and the processor
-if st.session_state.model_loaded is None:
-    st.session_state.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53-french")
-    st.session_state.model = Wav2Vec2EmotionClassifier()
-    st.session_state.model.load_state_dict(torch.load(os.path.join("src", "model", "wav2vec2_emotion.pth"), map_location=torch.device('cpu')), strict=False)
-    st.session_state.model_loaded = True
-
-if st.session_state.model_loaded:
-    processor = st.session_state.processor
-    model = st.session_state.model
-    model.to(device)
-    model.eval()
-
-emotion_labels = ["joie", "colère", "neutre"]
-
-def predict_emotion(audio_path, output_probs=False, sampling_rate=16000):
-    # waveform, _ = librosa.load(audio_path, sr=sampling_rate)
-    input_values = processor(audio_path, return_tensors="pt", sampling_rate=sampling_rate).input_values
-    input_values = input_values.to(device)
-
-    with torch.no_grad():
-        outputs = model(input_values)
-
-    if output_probs:
-        # Apply softmax to get probabilities
-        probabilities = torch.nn.functional.softmax(outputs, dim=-1)
-
-        # Convert to a numpy array and keep the first (and only) element
-        probabilities = probabilities[0].detach().cpu().numpy()
-
-        # Build a dictionary mapping each emotion to its probability
-        emotion_probabilities = {emotion: prob for emotion, prob in zip(emotion_labels, probabilities)}
-        # emotion_probabilities = {"emotions": [emotion for emotion in emotion_labels],
-        #                          "probabilities": [prob for prob in probabilities]}
-        return emotion_probabilities
-    else:
-        # Return the most likely emotion (i.e. the prediction)
-        predicted_label = torch.argmax(outputs, dim=1).item()
-        return emotion_labels[predicted_label]
-
-
-# Example usage
-# audio_test = "data/n1ac.wav"
-# emotion = predict_emotion(audio_test)
-# print(f"Detected emotion: {emotion}")
src/predict.py ADDED
@@ -0,0 +1,47 @@
+import torch
+import torchaudio
+import soundfile as sf
+import numpy as np
+from model.emotion_classifier import EmotionClassifier
+from model.feature_extrator import feature_extractor, processor
+from utils.preprocessing import resampler
+from config import DEVICE, LABELS
+import os
+
+
+# Load the saved model
+classifier = EmotionClassifier(feature_extractor.config.hidden_size, len(LABELS)).to(DEVICE)
+classifier.load_state_dict(torch.load(os.path.join("best_emotion_model.pth"), map_location=torch.device(DEVICE)))
+classifier.eval()
+
+
+# Prediction function
+def predict_emotion(audio_path):
+    # Load the audio
+    speech, sample_rate = sf.read(audio_path, dtype="float32")
+
+    # Resample if necessary
+    if sample_rate != 16000:
+        speech = torch.tensor(speech).unsqueeze(0)
+        speech = resampler(speech).squeeze(0).numpy()
+
+    # Extract the features
+    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
+    input_values = inputs.input_values.to(DEVICE)
+
+    with torch.no_grad():
+        features = feature_extractor(input_values).last_hidden_state.mean(dim=1)
+        logits = classifier(features)
+
+    # Get the prediction
+    predicted_label = torch.argmax(logits, dim=-1).item()
+    emotion = list(LABELS.keys())[predicted_label]
+
+    return emotion
+
+# Example usage
+if __name__ == "__main__":
+    base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "data"))
+    audio_file = os.path.join(base_path, "colere", "c1ac.wav")
+    emotion = predict_emotion(audio_file)
+    print(f"🎤 Predicted emotion: {emotion}")
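The new predict_emotion takes a file path, so audio held in memory (for example a buffer captured by a recorder widget in the views) has to be written to a WAV file before calling it. A minimal usage sketch under those assumptions: it runs from the src/ directory so that `predict` is importable at the top level, and `audio_buffer` is a hypothetical 16 kHz mono float32 numpy array standing in for recorded audio.

```python
# Minimal usage sketch (assumptions: run from src/ so `predict` is importable;
# `audio_buffer` is a hypothetical 16 kHz mono float32 buffer, e.g. from a recorder widget).
import tempfile

import numpy as np
import soundfile as sf

from predict import predict_emotion

# Stand-in for audio captured in memory: one second of silence at 16 kHz.
audio_buffer = np.zeros(16000, dtype="float32")

# predict_emotion expects a path, so persist the buffer to a temporary WAV file first.
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
    sf.write(tmp.name, audio_buffer, samplerate=16000)
    emotion = predict_emotion(tmp.name)

print(emotion)  # one of the keys of LABELS, e.g. "colere"
```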
views/application.py CHANGED
@@ -6,7 +6,7 @@ import os
 import matplotlib.pyplot as plt
 import librosa
 from src.model.transcriber import transcribe_audio
-from src.model.predict import predict_emotion
+from predict import predict_emotion
 
 
 DIRECTORY = "audios"
views/real_time.py CHANGED
@@ -10,7 +10,7 @@ from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
 import matplotlib.pyplot as plt
 import numpy as np
 import time
-from src.model.predict import predict_emotion
+from predict import predict_emotion
 
 # Load the Wav2Vec2 model for emotion classification
 model_name = "superb/wav2vec2-base-superb-er"  # Example model for emotion recognition