Marina Kpamegan committed
Commit 6855218 · 1 Parent(s): 2b8147e
src/model/emotion_classifier.py CHANGED
@@ -1,5 +1,4 @@
-import torch
-import torch.nn as nn
+
 
 # Predicts roughly 33% everywhere (in the 3-class case)
 
@@ -19,17 +18,37 @@ import torch.nn as nn
 
 
 
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Attention(nn.Module):
+    """Attention mechanism that weights the importance of the audio features"""
+    def __init__(self, hidden_dim):
+        super(Attention, self).__init__()
+        self.attention_weights = nn.Linear(hidden_dim, 1)
+
+    def forward(self, lstm_output):
+        # lstm_output: (batch_size, sequence_length, hidden_dim)
+        attention_scores = self.attention_weights(lstm_output)      # (batch_size, sequence_length, 1)
+        attention_weights = torch.softmax(attention_scores, dim=1)  # Softmax normalisation
+        weighted_output = lstm_output * attention_weights           # Weight the features
+        return weighted_output.sum(dim=1)                           # Weighted sum over the sequence
+
 class EmotionClassifier(nn.Module):
-    def __init__(self, feature_dim, num_labels=3):
+    """Emotion classification model based on a BiLSTM with attention"""
+    def __init__(self, feature_dim, num_labels, hidden_dim=128):
         super(EmotionClassifier, self).__init__()
-        self.fc = nn.Linear(feature_dim, num_labels)
-        self.dropout = nn.Dropout(0.3)  # Avoids overfitting
+        self.lstm = nn.LSTM(feature_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.attention = Attention(hidden_dim * 2)       # Bidirectional → hidden_dim * 2
+        self.fc = nn.Linear(hidden_dim * 2, num_labels)  # Final classification layer
 
     def forward(self, x):
-        pooled_output = torch.mean(x, dim=1)         # Average the audio features
-        pooled_output = self.dropout(pooled_output)  # Dropout before classification
-        logits = self.fc(pooled_output)
-        return logits
+        lstm_out, _ = self.lstm(x)                # (batch_size, sequence_length, hidden_dim*2)
+        attention_out = self.attention(lstm_out)  # (batch_size, hidden_dim*2)
+        logits = self.fc(attention_out)           # (batch_size, num_labels)
+        return logits
+
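Note: the new head replaces mean pooling plus dropout with a BiLSTM whose hidden states are reduced to a fixed-size vector by the attention module (a learned weighted sum over the time axis). A minimal shape check, assuming the diff above is applied and the src.* import layout used elsewhere in this commit:

    # Sketch only, not part of the commit: smoke-test the BiLSTM + attention classifier.
    import torch
    from src.model.emotion_classifier import EmotionClassifier

    model = EmotionClassifier(feature_dim=40, num_labels=3)  # 40 MFCCs, 3 emotion classes (as in src/predict.py)
    x = torch.randn(8, 200, 40)                              # (batch_size, sequence_length, feature_dim)
    logits = model(x)
    assert logits.shape == (8, 3)                            # one logit per emotion class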
src/model/feature_extractor.py CHANGED
@@ -1,6 +1,6 @@
 import torch
 from transformers import Wav2Vec2Model, Wav2Vec2Processor
-from config import MODEL_NAME, DEVICE
+from src.config import MODEL_NAME, DEVICE
 
 processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
 feature_extractor = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(DEVICE)
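For reference, the module-level processor and feature_extractor loaded here can turn a 16 kHz waveform into frame-level hidden states; a minimal sketch (illustrative only, the repo's own preprocessing may differ):

    # Sketch: run a dummy 1-second, 16 kHz waveform through the Wav2Vec2 feature extractor.
    import torch
    from src.model.feature_extractor import processor, feature_extractor
    from src.config import DEVICE

    waveform = torch.randn(16000)  # dummy audio
    inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        hidden = feature_extractor(inputs["input_values"].to(DEVICE)).last_hidden_state
    print(hidden.shape)  # (1, frames, hidden_size); hidden_size depends on the MODEL_NAME checkpoint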
src/predict.py CHANGED
@@ -4,8 +4,8 @@ import torch
 import librosa
 import numpy as np
 from model.emotion_classifier import EmotionClassifier
-from utils.preprocessing import collate_fn
-from config import DEVICE, NUM_LABELS, BEST_MODEL_NAME
+from src.utils.preprocessing import collate_fn
+from src.config import DEVICE, NUM_LABELS, BEST_MODEL_NAME
 
 # Load the trained model
 feature_dim = 40  # Number of MFCCs used
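predict.py keeps feature_dim = 40, i.e. 40 MFCC coefficients per frame. As a purely illustrative sketch (this helper is hypothetical and not part of the commit; the actual pipeline goes through src.utils.preprocessing.collate_fn), one way to produce the (batch, frames, 40) input the classifier expects:

    # Hypothetical helper: build a (1, frames, 40) MFCC tensor with librosa.
    import librosa
    import torch

    def extract_mfcc(audio_path, n_mfcc=40, sr=16000):
        y, _ = librosa.load(audio_path, sr=sr)                  # load and resample to 16 kHz
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)  # (n_mfcc, frames)
        return torch.tensor(mfcc.T, dtype=torch.float32).unsqueeze(0)  # (1, frames, n_mfcc)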
src/test_speech.py DELETED
@@ -1,49 +0,0 @@
-import torch
-import torchaudio
-from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
-import os
-
-# 🔹 Parameters
-MODEL_NAME = "./wav2vec2_emotion"  # Path of the saved model
-LABELS = ["colere", "joie", "neutre"]  # The classes
-
-# 🔹 Load the processor and the model
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
-model.eval()  # Evaluation mode
-
-
-def predict_emotion(audio_path):
-    # Load the audio
-    waveform, sample_rate = torchaudio.load(audio_path)
-
-    # Preprocess the audio
-    inputs = processor(
-        waveform.squeeze().numpy(),
-        sampling_rate=sample_rate,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=32000  # Adjust to the duration of your files
-    )
-
-    # Move the data to the right device (CPU or GPU)
-    input_values = inputs["input_values"].to(device)
-
-    # Prediction
-    with torch.no_grad():
-        logits = model(input_values).logits
-
-    # Find the predicted emotion
-    predicted_class = torch.argmax(logits, dim=-1).item()
-
-    return LABELS[predicted_class]  # Return the matching label
-
-base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "data"))
-audio_file = os.path.join(base_path, "colere", "c1ac.wav")
-predicted_emotion = predict_emotion(audio_file)
-print(f"🎙️ Émotion prédite : {predicted_emotion}")
-
-
-
src/train_speech.py DELETED
@@ -1,88 +0,0 @@
-import torch
-import torchaudio
-import os
-from datasets import Dataset, DatasetDict
-from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer
-
-# 🔹 Parameters
-MODEL_NAME = "facebook/wav2vec2-large-xlsr-53-french"
-NUM_LABELS = 3  # Number of emotion classes
-BATCH_SIZE = 8
-EPOCHS = 10
-LEARNING_RATE = 1e-4
-MAX_LENGTH = 32000  # Adjust to the duration of your audio files
-
-# 🔹 Check whether a GPU is available
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-# 🔹 Load the processor and the model
-processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-model = Wav2Vec2ForSequenceClassification.from_pretrained(
-    MODEL_NAME,
-    num_labels=NUM_LABELS,
-    problem_type="single_label_classification"
-).to(device)
-
-# 🔹 Load the audio files without a CSV
-def load_audio_data(data_dir):
-    data = {"file_path": [], "label": []}
-    labels = ["colere", "joie", "neutre"]  # Adjust to your classes
-
-    for label in labels:
-        folder_path = os.path.join(data_dir, label)
-        for file in os.listdir(folder_path):
-            if file.endswith(".wav"):
-                data["file_path"].append(os.path.join(folder_path, file))
-                data["label"].append(labels.index(label))
-
-    dataset = Dataset.from_dict(data)
-    train_test_split = dataset.train_test_split(test_size=0.2)  # 80% train, 20% test
-    return DatasetDict({"train": train_test_split["train"], "test": train_test_split["test"]})
-
-# 🔹 Audio preprocessing
-def preprocess_audio(file_path):
-    waveform, sample_rate = torchaudio.load(file_path)
-    inputs = processor(
-        waveform.squeeze().numpy(),
-        sampling_rate=sample_rate,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=MAX_LENGTH  # ✅ Fixes the earlier error
-    )
-    return inputs["input_values"][0]  # Preprocessed audio values
-
-# 🔹 Load and preprocess the dataset
-data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "data"))
-ds = load_audio_data(data_dir)
-
-def preprocess_batch(batch):
-    batch["input_values"] = preprocess_audio(batch["file_path"])
-    return batch
-
-ds = ds.map(preprocess_batch, remove_columns=["file_path"])
-
-# 🔹 Define the training arguments
-training_args = TrainingArguments(
-    output_dir="./wav2vec2_emotion",
-    evaluation_strategy="epoch",
-    save_strategy="epoch",
-    learning_rate=LEARNING_RATE,
-    per_device_train_batch_size=BATCH_SIZE,
-    per_device_eval_batch_size=BATCH_SIZE,
-    num_train_epochs=EPOCHS,
-    save_total_limit=2,
-    logging_dir="./logs",
-    logging_steps=10,
-)
-
-# 🔹 Define the trainer
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=ds["train"],
-    eval_dataset=ds["test"],
-)
-
-# 🚀 Launch training
-trainer.train()
src/utils/dataset.py CHANGED
@@ -1,13 +1,8 @@
 import os
 from datasets import Dataset
-from config import LABELS
+from src.config import LABELS
 import pandas as pd
 
-import os
-from datasets import Dataset, DatasetDict
-import pandas as pd
-from src.config import LABELS
-
 def load_audio_data(data_dir):
     data = []
     for label_name, label_id in LABELS.items():
src/utils/preprocessing.py CHANGED
@@ -3,8 +3,8 @@ import soundfile as sf
 import torch
 import torchaudio
 import numpy as np
-from model.feature_extractor import processor  # type: ignore
-from config import DEVICE
+from src.model.feature_extractor import processor  # type: ignore
+from src.config import DEVICE
 
 # Resampler to convert to 16 kHz
 resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)