Marina Kpamegan committed
Commit · 6855218
1 Parent(s): 2b8147e
new model
Files changed:
- src/model/emotion_classifier.py  +28 -9
- src/model/feature_extractor.py   +1 -1
- src/predict.py                   +2 -2
- src/test_speech.py               +0 -49
- src/train_speech.py              +0 -88
- src/utils/dataset.py             +1 -6
- src/utils/preprocessing.py       +2 -2
src/model/emotion_classifier.py
CHANGED
@@ -1,5 +1,4 @@
-…
-import torch.nn as nn
+…
 
 # Predicts roughly 33% everywhere (in the 3-class case)
 
@@ -19,17 +18,37 @@ import torch.nn as nn
 
 
 
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Attention(nn.Module):
+    """Attention mechanism that weights the importance of the audio features"""
+    def __init__(self, hidden_dim):
+        super(Attention, self).__init__()
+        self.attention_weights = nn.Linear(hidden_dim, 1)
+
+    def forward(self, lstm_output):
+        # lstm_output: (batch_size, sequence_length, hidden_dim)
+        attention_scores = self.attention_weights(lstm_output)  # (batch_size, sequence_length, 1)
+        attention_weights = torch.softmax(attention_scores, dim=1)  # softmax normalisation
+        weighted_output = lstm_output * attention_weights  # weight the features
+        return weighted_output.sum(dim=1)  # weighted sum over the sequence
+
 class EmotionClassifier(nn.Module):
-    def __init__(…):
+    """Emotion classification model based on a BiLSTM with attention"""
+    def __init__(self, feature_dim, num_labels, hidden_dim=128):
         super(EmotionClassifier, self).__init__()
-        self.…
-        self.…
+        self.lstm = nn.LSTM(feature_dim, hidden_dim, batch_first=True, bidirectional=True)
+        self.attention = Attention(hidden_dim * 2)  # bidirectional → hidden_dim * 2
+        self.fc = nn.Linear(hidden_dim * 2, num_labels)  # final classification layer
 
     def forward(self, x):
-        …
-        …
-        logits = self.fc(…)
-        return logits
+        lstm_out, _ = self.lstm(x)  # (batch_size, sequence_length, hidden_dim*2)
+        attention_out = self.attention(lstm_out)  # (batch_size, hidden_dim*2)
+        logits = self.fc(attention_out)  # (batch_size, num_labels)
+        return logits
+
 
 
 
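For reference, a minimal usage sketch of the new classifier, assuming MFCC inputs with feature_dim = 40 as in src/predict.py; the batch and sequence sizes below are made up for illustration:

import torch
from src.model.emotion_classifier import EmotionClassifier

# Hypothetical shapes: 4 utterances, 200 frames, 40 MFCC coefficients per frame
model = EmotionClassifier(feature_dim=40, num_labels=3, hidden_dim=128)
dummy_batch = torch.randn(4, 200, 40)   # (batch_size, sequence_length, feature_dim)
logits = model(dummy_batch)             # (batch_size, num_labels)
print(logits.shape)                     # torch.Size([4, 3])

Each sequence is reduced to a single vector by the attention-weighted sum before the final linear layer, so variable-length batches only need to be padded to a common length.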
src/model/feature_extractor.py
CHANGED
@@ -1,6 +1,6 @@
 import torch
 from transformers import Wav2Vec2Model, Wav2Vec2Processor
-from config import MODEL_NAME, DEVICE
+from src.config import MODEL_NAME, DEVICE
 
 processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
 feature_extractor = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(DEVICE)
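A sketch of how the exported processor and feature_extractor are typically used to obtain Wav2Vec2 hidden states; the input below is random noise standing in for one second of real 16 kHz audio, and the exact hidden size depends on the checkpoint behind MODEL_NAME in src/config.py:

import torch
import numpy as np
from src.model.feature_extractor import processor, feature_extractor
from src.config import DEVICE

audio = np.random.randn(16_000).astype(np.float32)   # fake 1 s of 16 kHz audio
inputs = processor(audio, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    hidden_states = feature_extractor(inputs.input_values.to(DEVICE)).last_hidden_state
print(hidden_states.shape)   # (1, num_frames, hidden_size)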
src/predict.py
CHANGED
@@ -4,8 +4,8 @@ import torch
 import librosa
 import numpy as np
 from model.emotion_classifier import EmotionClassifier
-from utils.preprocessing import collate_fn
-from config import DEVICE, NUM_LABELS, BEST_MODEL_NAME
+from src.utils.preprocessing import collate_fn
+from src.config import DEVICE, NUM_LABELS, BEST_MODEL_NAME
 
 # Load the trained model
 feature_dim = 40  # Number of MFCCs used
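predict.py keeps feature_dim = 40, i.e. 40 MFCCs per frame. A hedged sketch of turning an audio file into the (1, num_frames, 40) tensor such a model would expect, using librosa; the file path is a placeholder and the exact framing done by collate_fn may differ:

import librosa
import numpy as np
import torch

y, sr = librosa.load("example.wav", sr=16_000)        # placeholder path, resampled to 16 kHz
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)    # (40, num_frames)
features = torch.tensor(mfcc.T, dtype=torch.float32)  # (num_frames, 40)
batch = features.unsqueeze(0)                         # (1, num_frames, 40)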
src/test_speech.py
DELETED
@@ -1,49 +0,0 @@
-import torch
-import torchaudio
-from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
-import os
-
-# 🔹 Parameters
-MODEL_NAME = "./wav2vec2_emotion"  # Path of the saved model
-LABELS = ["colere", "joie", "neutre"]  # The classes
-
-# 🔹 Load the processor and the model
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
-model.eval()  # Evaluation mode
-
-
-def predict_emotion(audio_path):
-    # Load the audio
-    waveform, sample_rate = torchaudio.load(audio_path)
-
-    # Preprocess the sound
-    inputs = processor(
-        waveform.squeeze().numpy(),
-        sampling_rate=sample_rate,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=32000  # Adjust to the duration of your files
-    )
-
-    # Send the data to the right device (CPU or GPU)
-    input_values = inputs["input_values"].to(device)
-
-    # Prediction
-    with torch.no_grad():
-        logits = model(input_values).logits
-
-    # Find the predicted emotion
-    predicted_class = torch.argmax(logits, dim=-1).item()
-
-    return LABELS[predicted_class]  # Return the corresponding label
-
-base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "data"))
-audio_file = os.path.join(base_path, "colere", "c1ac.wav")
-predicted_emotion = predict_emotion(audio_file)
-print(f"🎙️ Predicted emotion: {predicted_emotion}")
-
-
-
src/train_speech.py
DELETED
@@ -1,88 +0,0 @@
-import torch
-import torchaudio
-import os
-from datasets import Dataset, DatasetDict
-from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer
-
-# 🔹 Parameters
-MODEL_NAME = "facebook/wav2vec2-large-xlsr-53-french"
-NUM_LABELS = 3  # Number of emotion classes
-BATCH_SIZE = 8
-EPOCHS = 10
-LEARNING_RATE = 1e-4
-MAX_LENGTH = 32000  # Adjust to the duration of your audio files
-
-# 🔹 Check whether a GPU is available
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-# 🔹 Load the processor and the model
-processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-model = Wav2Vec2ForSequenceClassification.from_pretrained(
-    MODEL_NAME,
-    num_labels=NUM_LABELS,
-    problem_type="single_label_classification"
-).to(device)
-
-# 🔹 Function to load the audio files without a CSV
-def load_audio_data(data_dir):
-    data = {"file_path": [], "label": []}
-    labels = ["colere", "joie", "neutre"]  # Adjust to your classes
-
-    for label in labels:
-        folder_path = os.path.join(data_dir, label)
-        for file in os.listdir(folder_path):
-            if file.endswith(".wav"):
-                data["file_path"].append(os.path.join(folder_path, file))
-                data["label"].append(labels.index(label))
-
-    dataset = Dataset.from_dict(data)
-    train_test_split = dataset.train_test_split(test_size=0.2)  # 80% train, 20% test
-    return DatasetDict({"train": train_test_split["train"], "test": train_test_split["test"]})
-
-# 🔹 Audio preprocessing
-def preprocess_audio(file_path):
-    waveform, sample_rate = torchaudio.load(file_path)
-    inputs = processor(
-        waveform.squeeze().numpy(),
-        sampling_rate=sample_rate,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=MAX_LENGTH  # ✅ Fixes the earlier error
-    )
-    return inputs["input_values"][0]  # Return the preprocessed audio values
-
-# 🔹 Load and preprocess the dataset
-data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "data"))
-ds = load_audio_data(data_dir)
-
-def preprocess_batch(batch):
-    batch["input_values"] = preprocess_audio(batch["file_path"])
-    return batch
-
-ds = ds.map(preprocess_batch, remove_columns=["file_path"])
-
-# 🔹 Define the training arguments
-training_args = TrainingArguments(
-    output_dir="./wav2vec2_emotion",
-    evaluation_strategy="epoch",
-    save_strategy="epoch",
-    learning_rate=LEARNING_RATE,
-    per_device_train_batch_size=BATCH_SIZE,
-    per_device_eval_batch_size=BATCH_SIZE,
-    num_train_epochs=EPOCHS,
-    save_total_limit=2,
-    logging_dir="./logs",
-    logging_steps=10,
-)
-
-# 🔹 Define the trainer
-trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=ds["train"],
-    eval_dataset=ds["test"],
-)
-
-# 🚀 Launch training
-trainer.train()
src/utils/dataset.py
CHANGED
@@ -1,13 +1,8 @@
 import os
 from datasets import Dataset
-from config import LABELS
+from src.config import LABELS
 import pandas as pd
 
-import os
-from datasets import Dataset, DatasetDict
-import pandas as pd
-from src.config import LABELS
-
 def load_audio_data(data_dir):
     data = []
     for label_name, label_id in LABELS.items():
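load_audio_data iterates LABELS.items(), so src/config.py presumably maps label names to integer ids. A hypothetical mapping consistent with the class folders seen elsewhere in the repo (colere, joie, neutre); the real contents of src/config.py are not part of this commit:

# Hypothetical excerpt of src/config.py (not shown in this diff)
LABELS = {"colere": 0, "joie": 1, "neutre": 2}
NUM_LABELS = len(LABELS)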
src/utils/preprocessing.py
CHANGED
@@ -3,8 +3,8 @@ import soundfile as sf
 import torch
 import torchaudio
 import numpy as np
-from model.feature_extractor import processor  # type: ignore
-from config import DEVICE
+from src.model.feature_extractor import processor  # type: ignore
+from src.config import DEVICE
 
 # Resampler to convert to 16 kHz
 resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)
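A small sketch of how the 48 kHz → 16 kHz resampler declared above is typically applied; the waveform here is simulated, and whether collate_fn invokes it exactly this way is an assumption:

import torch
import torchaudio

resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)
waveform_48k = torch.randn(1, 48_000)   # fake 1 s of 48 kHz audio, shape (channels, samples)
waveform_16k = resampler(waveform_48k)  # shape (1, 16000)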