Falonne Kpamegan committed
Commit · 730469b
Parent(s): 732a8f8
base code

Files changed:
- requirements.txt +9 -2
- src/model/emotion_classifier.py +17 -0
- src/model/emotion_dataset.py +22 -0
- src/model/predict.py +30 -0
- src/model/test.py +0 -51
- src/model/test_wav2vec.py +62 -0
- src/model/train.py +42 -61
requirements.txt
CHANGED
@@ -1,3 +1,10 @@
-transformers
-
+transformers
+datasets
+torchaudio
+torch
 librosa
+soundfile
+numpy
+pandas
+matplotlib
+scikit-learn
src/model/emotion_classifier.py
ADDED
@@ -0,0 +1,17 @@
+import torch
+import torch.nn as nn
+from transformers import Wav2Vec2Model
+
+class Wav2Vec2EmotionClassifier(nn.Module):
+
+    def __init__(self, model_name="facebook/wav2vec2-large-xlsr-53-french", num_labels=3):
+        super(Wav2Vec2EmotionClassifier, self).__init__()
+        self.wav2vec2 = Wav2Vec2Model.from_pretrained(model_name)
+        self.fc = nn.Linear(self.wav2vec2.config.hidden_size, num_labels)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, input_values):
+        outputs = self.wav2vec2(input_values).last_hidden_state
+        pooled_output = torch.mean(outputs, dim=1)
+        logits = self.fc(pooled_output)
+        return self.softmax(logits)
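A quick sanity check of the new classification head (a sketch, not part of the commit, assuming it is run from src/model so that emotion_classifier is importable): mean-pooling the Wav2Vec2 hidden states over time yields one vector per clip, so a batch of raw 16 kHz waveforms comes out as one probability row per clip.

import torch
from emotion_classifier import Wav2Vec2EmotionClassifier

model = Wav2Vec2EmotionClassifier()   # loads the default checkpoint named in __init__
model.eval()
dummy_batch = torch.randn(2, 16000)   # 2 clips of 1 second at 16 kHz (random noise)
with torch.no_grad():
    probs = model(dummy_batch)
print(probs.shape)                    # torch.Size([2, 3])
print(probs.sum(dim=1))               # each row sums to 1 because forward() applies Softmax

Note that forward() returns softmax probabilities rather than raw logits; nn.CrossEntropyLoss in train.py applies log-softmax internally, so training still runs but the softmax is effectively applied twice.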
src/model/emotion_dataset.py
ADDED
@@ -0,0 +1,22 @@
+import librosa
+import torch
+import pandas as pd
+from torch.utils.data import Dataset
+
+class EmotionDataset(Dataset):
+    def __init__(self, csv_file, processor):
+        self.data = pd.read_csv(csv_file)
+        self.processor = processor
+        self.emotion_labels = {"joie": 0, "colère": 1, "neutre": 2}
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        audio_path = self.data.iloc[idx, 0]
+        label = self.emotion_labels[self.data.iloc[idx, 1]]
+
+        waveform, _ = librosa.load(audio_path, sr=16000)
+        input_values = self.processor(waveform, return_tensors="pt", sampling_rate=16000).input_values
+
+        return input_values.squeeze(0), torch.tensor(label, dtype=torch.long)
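__getitem__ above assumes a two-column CSV: column 0 holds the path to an audio file and column 1 one of the keys of emotion_labels. A hypothetical example of building such a file (the clip paths and column names are illustrative, not taken from the commit):

import os
import pandas as pd

rows = [
    ("data/clip_001.wav", "joie"),    # hypothetical clip paths
    ("data/clip_002.wav", "colère"),
    ("data/clip_003.wav", "neutre"),
]
os.makedirs("data", exist_ok=True)
pd.DataFrame(rows, columns=["path", "emotion"]).to_csv("data/dataset.csv", index=False)

pandas reads the first row as a header, so the iloc-based indexing in __getitem__ only ever sees the data rows.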
src/model/predict.py
ADDED
@@ -0,0 +1,30 @@
+import torch
+from transformers import Wav2Vec2Processor
+from model import Wav2Vec2EmotionClassifier
+import librosa
+
+# Load the model and the processor
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
+model = Wav2Vec2EmotionClassifier()
+model.load_state_dict(torch.load("wav2vec2_emotion.pth"))
+model.to(device)
+model.eval()
+
+emotion_labels = ["joie", "colère", "neutre"]
+
+def predict_emotion(audio_path):
+    waveform, _ = librosa.load(audio_path, sr=16000)
+    input_values = processor(waveform, return_tensors="pt", sampling_rate=16000).input_values
+    input_values = input_values.to(device)
+
+    with torch.no_grad():
+        outputs = model(input_values)
+
+    predicted_label = torch.argmax(outputs, dim=1).item()
+    return emotion_labels[predicted_label]
+
+# Example usage
+audio_test = "data/audio1.wav"
+emotion = predict_emotion(audio_test)
+print(f"Émotion détectée : {emotion}")
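One portability note on the checkpoint load above: torch.load restores tensors to the device they were saved from, so a state dict saved on GPU fails to load on a CPU-only machine unless map_location is passed. A minimal variant of the load line in the script:

model.load_state_dict(torch.load("wav2vec2_emotion.pth", map_location=device))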
src/model/test.py
DELETED
@@ -1,51 +0,0 @@
-import torchaudio
-from datasets import load_dataset
-from evaluate import load as load_metric
-from transformers import (
-    Wav2Vec2ForCTC,
-    Wav2Vec2Processor,
-)
-import torch
-import re
-import sys
-
-model_name = "facebook/wav2vec2-large-xlsr-53-french"
-device = "cpu"
-
-chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'
-
-model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
-processor = Wav2Vec2Processor.from_pretrained(model_name)
-
-ds = load_dataset("facebook/voxpopuli", "fr", trust_remote_code=True)
-
-resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)
-
-def map_to_array(batch):
-    speech, _ = torchaudio.load(batch["path"])
-    batch["speech"] = resampler.forward(speech.squeeze(0)).numpy()
-    batch["sampling_rate"] = resampler.new_freq
-    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower().replace("’", "'")
-    return batch
-
-ds = ds.map(map_to_array)
-
-def map_to_pred(batch):
-    features = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0], padding=True, return_tensors="pt")
-    input_values = features.input_values.to(device)
-    attention_mask = features.attention_mask.to(device)
-    with torch.no_grad():
-        logits = model(input_values, attention_mask=attention_mask).logits
-    pred_ids = torch.argmax(logits, dim=-1)
-    batch["predicted"] = processor.batch_decode(pred_ids)
-    batch["target"] = batch["sentence"]
-    return batch
-
-result = ds.map(map_to_pred, batched=True, batch_size=16, remove_columns=list(ds.features.keys()))
-
-wer = load_metric("wer")
-wer_score = wer.compute(predictions=result["predicted"], references=result["target"])
-print(f"WER: {wer_score}")
-
-
-# print(wer.compute(predictions=result["predicted"], references=result["target"]))
src/model/test_wav2vec.py
ADDED
@@ -0,0 +1,62 @@
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+import torch
+import librosa
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Load the Wav2Vec 2.0 model and processor
+model_name = "facebook/wav2vec2-large-xlsr-53-french"
+processor = Wav2Vec2Processor.from_pretrained(model_name)
+model = Wav2Vec2ForCTC.from_pretrained(model_name)
+
+# Load the audio
+audio_file = "C:\\Users\\fkpamegan\\Downloads\\datasets_oreau2_m_sessp_07a01Pa.wav"
+y, sr = librosa.load(audio_file, sr=16000)
+
+# Preprocess the audio with the Wav2Vec 2.0 processor
+input_values = processor(y, return_tensors="pt").input_values
+
+# Get the prediction (logits)
+with torch.no_grad():
+    logits = model(input_values).logits
+
+# Get the predicted token IDs (transcription)
+predicted_ids = torch.argmax(logits, dim=-1)
+
+# Decode the IDs to obtain the transcribed text
+transcription = processor.decode(predicted_ids[0])
+
+print("Transcription:", transcription)
+
+
+# Extract the pitch and the intensity
+pitch, magnitudes = librosa.core.piptrack(y=y, sr=sr)
+intensity = librosa.feature.rms(y=y)  # Intensity (volume)
+
+# Compute the tempo (speech rate)
+tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
+
+# Plot the pitch
+plt.figure(figsize=(10, 6))
+librosa.display.specshow(pitch, x_axis='time', y_axis='log')
+plt.colorbar()
+plt.title("Pitch (Hauteur Tonale)")
+plt.show()
+
+# Plot the intensity
+plt.figure(figsize=(10, 6))
+librosa.display.specshow(intensity, x_axis='time')
+plt.colorbar()
+plt.title("Intensité")
+plt.show()
+
+# Fuse the transcription with the prosodic features (pitch, intensity, tempo)
+features = np.hstack([
+    np.mean(intensity, axis=1),  # Mean intensity
+    np.mean(pitch, axis=1),      # Mean pitch
+    tempo                        # Tempo
+])
+
+# Display the extracted features
+print("Caractéristiques combinées :")
+print(features)
src/model/train.py
CHANGED
@@ -1,62 +1,43 @@
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 import torch
-import
-import
-#
-#
-# Plot the intensity
-plt.figure(figsize=(10, 6))
-librosa.display.specshow(intensity, x_axis='time')
-plt.colorbar()
-plt.title("Intensité")
-plt.show()
-
-# Fuse the transcription with the prosodic features (pitch, intensity, tempo)
-features = np.hstack([
-    np.mean(intensity, axis=1),  # Mean intensity
-    np.mean(pitch, axis=1),      # Mean pitch
-    tempo                        # Tempo
-])
-
-# Display the extracted features
-print("Caractéristiques combinées :")
-print(features)
+import torch.optim as optim
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from transformers import Wav2Vec2Processor
+from emotion_dataset import EmotionDataset
+from model import Wav2Vec2EmotionClassifier
+
+# Load the processor and the dataset
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
+dataset = EmotionDataset("data/dataset.csv", processor)
+dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
+
+# Initialize the model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = Wav2Vec2EmotionClassifier().to(device)
+
+# Define the loss function and the optimizer
+criterion = nn.CrossEntropyLoss()
+optimizer = optim.AdamW(model.parameters(), lr=5e-5)
+
+# Train the model
+num_epochs = 10
+for epoch in range(num_epochs):
+    model.train()
+    total_loss = 0
+
+    for inputs, labels in dataloader:
+        inputs, labels = inputs.to(device), labels.to(device)
+
+        optimizer.zero_grad()
+        outputs = model(inputs)
+        loss = criterion(outputs, labels)
+        loss.backward()
+        optimizer.step()
+
+        total_loss += loss.item()
+
+    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
+
+# Save the model
+torch.save(model.state_dict(), "wav2vec2_emotion.pth")
+print("Modèle sauvegardé !")
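EmotionDataset returns one variable-length waveform per clip, so the default collate in DataLoader(dataset, batch_size=4, shuffle=True) only works when every clip happens to have the same length. If clip lengths vary, one option (a sketch, not part of the commit) is a padding collate function that zero-pads each batch to its longest clip:

import torch
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    # batch is a list of (input_values, label) pairs from EmotionDataset
    waveforms, labels = zip(*batch)
    # Zero-pad every 1-D waveform to the length of the longest clip in the batch
    padded = pad_sequence(waveforms, batch_first=True)   # shape: (batch, max_len)
    return padded, torch.stack(labels)

# dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=pad_collate)

Alternatively, Wav2Vec2Processor can pad a list of waveforms itself when called with padding=True, which also returns an attention mask.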