Marina Kpamegan committed
Commit 06c46fb · 1 Parent(s): 129ee9b

Reorganisation

.gitignore CHANGED
@@ -2,6 +2,7 @@
  __pycache__/
  *.py[cod]
  *$py.class
+ .idea/

  # C extensions
  *.so
@@ -182,3 +183,4 @@ data/*

  # Mac
  .DS_Store
+ *.pth
src/config.py ADDED
@@ -0,0 +1,20 @@
+ import os
+ import torch
+ from dotenv import load_dotenv
+
+ # Load the environment variables
+ load_dotenv()
+ HF_API_KEY = os.getenv("HF_API_KEY")
+
+ if not HF_API_KEY:
+     raise ValueError("Le token Hugging Face n'a pas été trouvé dans .env")
+
+ # Emotion labels
+ LABELS = {"colere": 0, "neutre": 1, "joie": 2}
+ NUM_LABELS = len(LABELS)
+
+ # Choose the device
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Wav2Vec2 model
+ MODEL_NAME = "facebook/wav2vec2-large-xlsr-53-french"
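For context, a minimal usage sketch (not part of the diff) of how the new shared constants are consumed; it assumes scripts run from inside src/ so that config is importable, and that a .env file providing HF_API_KEY exists:

    # Hypothetical consumer of the new config module.
    from config import DEVICE, NUM_LABELS, MODEL_NAME, LABELS

    print(f"Device: {DEVICE}, labels: {list(LABELS)} ({NUM_LABELS} classes)")
    print(f"Backbone checkpoint: {MODEL_NAME}")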
src/model/__init__.py ADDED
@@ -0,0 +1 @@
+
src/model/emotion_classifier.py CHANGED
@@ -1,17 +1,15 @@
- import torch
  import torch.nn as nn
- from transformers import Wav2Vec2Model

- class Wav2Vec2EmotionClassifier(nn.Module):
-
-     def __init__(self, model_name="facebook/wav2vec2-large-xlsr-53-french", num_labels=3):
-         super(Wav2Vec2EmotionClassifier, self).__init__()
-         self.wav2vec2 = Wav2Vec2Model.from_pretrained(model_name)
-         self.fc = nn.Linear(self.wav2vec2.config.hidden_size, num_labels)
-         self.softmax = nn.Softmax(dim=1)
-
-     def forward(self, input_values):
-         outputs = self.wav2vec2(input_values).last_hidden_state
-         pooled_output = torch.mean(outputs, dim=1)
-         logits = self.fc(pooled_output)
-         return self.softmax(logits)
+ class EmotionClassifier(nn.Module):
+     def __init__(self, feature_dim, num_labels):
+         super(EmotionClassifier, self).__init__()
+         self.fc1 = nn.Linear(feature_dim, 256)
+         self.relu = nn.ReLU()
+         self.dropout = nn.Dropout(0.3)
+         self.fc2 = nn.Linear(256, num_labels)
+
+     def forward(self, x):
+         x = self.fc1(x)
+         x = self.relu(x)
+         x = self.dropout(x)
+         return self.fc2(x)
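For context, a minimal sketch (not part of the diff, assuming it is run from inside src/) of the refactored head: it now takes already-pooled features of arbitrary dimension and returns raw logits, which pairs with the CrossEntropyLoss used in src/train.py. The 768-dimensional random batch is a stand-in for pooled Wav2Vec2 embeddings:

    import torch
    import torch.nn as nn
    from model.emotion_classifier import EmotionClassifier

    head = EmotionClassifier(feature_dim=768, num_labels=3)
    pooled = torch.randn(4, 768)               # fake pooled utterance embeddings
    logits = head(pooled)                      # raw logits, no softmax
    loss = nn.CrossEntropyLoss()(logits, torch.tensor([0, 1, 2, 0]))
    print(logits.shape, loss.item())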
 
 
 
 
 
 
src/model/emotion_dataset.py DELETED
@@ -1,29 +0,0 @@
- import librosa
- import torch
- import pandas as pd
- from torch.utils.data import Dataset
- import os
-
- class EmotionDataset(Dataset):
-     def __init__(self, csv_file, processor):
-         self.data = pd.read_csv(csv_file, sep=",", header=0)
-         # print(self.data.info())  # To inspect the first rows of the dataset
-         self.processor = processor
-         self.emotion_labels = {"joie": 0, "colere": 1, "neutre": 2}
-         # print(self.data["emotion"].unique())  # To inspect the exact label values
-
-
-     def __len__(self):
-         return len(self.data)
-
-     def __getitem__(self, idx):
-         base_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data"))
-         audio_file = self.data.iloc[idx, 0]
-         label = self.emotion_labels[self.data.iloc[idx, 1].strip()]
-
-         audio_path = os.path.join(base_path, audio_file)
-         waveform, _ = librosa.load(audio_path, sr=16000)  # Load the audio
-         input_values = self.processor(waveform, return_tensors="pt", sampling_rate=16000).input_values
-
-         return input_values.squeeze(0), torch.tensor(label, dtype=torch.long)
-
src/model/feature_extrator.py ADDED
@@ -0,0 +1,6 @@
+ import torch
+ from transformers import Wav2Vec2Model, Wav2Vec2Processor
+ from config import MODEL_NAME, DEVICE
+
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+ feature_extractor = Wav2Vec2Model.from_pretrained(MODEL_NAME).to(DEVICE)
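For context, a minimal sketch (not part of the diff, assuming it is run from inside src/) of how the shared processor and backbone are used downstream: train.py mean-pools last_hidden_state over time to get one embedding per utterance. The one-second silent waveform is a placeholder for a real recording:

    import numpy as np
    import torch
    from model.feature_extrator import feature_extractor, processor
    from config import DEVICE

    waveform = np.zeros(16000, dtype=np.float32)   # dummy 1 s of 16 kHz audio
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").input_values.to(DEVICE)
    with torch.no_grad():
        pooled = feature_extractor(inputs).last_hidden_state.mean(dim=1)   # (1, hidden_size)
    print(pooled.shape)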
src/model/test_wav2vec.py DELETED
@@ -1,62 +0,0 @@
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
- import torch
- import librosa
- import numpy as np
- import matplotlib.pyplot as plt
-
- # Load the Wav2Vec 2.0 model and processor
- model_name = "facebook/wav2vec2-large-xlsr-53-french"
- processor = Wav2Vec2Processor.from_pretrained(model_name)
- model = Wav2Vec2ForCTC.from_pretrained(model_name)
-
- # Load the audio
- audio_file = "C:\\Users\\fkpamegan\\Downloads\\datasets_oreau2_m_sessp_07a01Pa.wav"
- y, sr = librosa.load(audio_file, sr=16000)
-
- # Preprocess the audio with the Wav2Vec 2.0 processor
- input_values = processor(y, return_tensors="pt").input_values
-
- # Get the prediction (logits)
- with torch.no_grad():
-     logits = model(input_values).logits
-
- # Get the predicted token IDs (transcription)
- predicted_ids = torch.argmax(logits, dim=-1)
-
- # Decode the IDs to obtain the transcribed text
- transcription = processor.decode(predicted_ids[0])
-
- print("Transcription:", transcription)
-
-
- # Extract the pitch and the intensity
- pitch, magnitudes = librosa.core.piptrack(y=y, sr=sr)
- intensity = librosa.feature.rms(y=y)  # Intensity (volume)
-
- # Compute the tempo (speech rate)
- tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
-
- # Plot the pitch
- plt.figure(figsize=(10, 6))
- librosa.display.specshow(pitch, x_axis='time', y_axis='log')
- plt.colorbar()
- plt.title("Pitch (Hauteur Tonale)")
- plt.show()
-
- # Plot the intensity
- plt.figure(figsize=(10, 6))
- librosa.display.specshow(intensity, x_axis='time')
- plt.colorbar()
- plt.title("Intensité")
- plt.show()
-
- # Merge the transcription with the prosodic features (pitch, intensity, tempo)
- features = np.hstack([
-     np.mean(intensity, axis=1),  # Mean intensity
-     np.mean(pitch, axis=1),      # Mean pitch
-     tempo                        # Tempo
- ])
-
- # Print the extracted features
- print("Caractéristiques combinées :")
- print(features)
src/model/train.py DELETED
@@ -1,51 +0,0 @@
- import torch
- import torch.optim as optim
- import torch.nn as nn
- from torch.utils.data import DataLoader
- from transformers import Wav2Vec2Processor
- from emotion_dataset import EmotionDataset
- from emotion_classifier import Wav2Vec2EmotionClassifier
- import os
- from utils import collate_fn
-
-
- # Load the processor and the dataset
- processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53-french")
- data_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "data", "dataset.csv"))
- if not os.path.exists(data_path):
-     raise FileNotFoundError(f"Le fichier {data_path} est introuvable.")
-
- dataset = EmotionDataset(data_path, processor)
- dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)  # collate_fn added
-
-
- # Initialize the model
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model = Wav2Vec2EmotionClassifier().to(device)
-
- # Define the loss function and the optimizer
- criterion = nn.CrossEntropyLoss()
- optimizer = optim.AdamW(model.parameters(), lr=5e-5)
-
- # Train the model
- num_epochs = 10
- for epoch in range(num_epochs):
-     model.train()
-     total_loss = 0
-
-     for inputs, labels in dataloader:
-         inputs, labels = inputs.to(device), labels.to(device)
-
-         optimizer.zero_grad()
-         outputs = model(inputs)
-         loss = criterion(outputs, labels)
-         loss.backward()
-         optimizer.step()
-
-         total_loss += loss.item()
-
-     print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
-
- # Save the model
- torch.save(model.state_dict(), "wav2vec2_emotion.pth")
- print("Modèle sauvegardé !")
src/model/utils.py DELETED
@@ -1,8 +0,0 @@
- import torch
- from torch.nn.utils.rnn import pad_sequence
-
- def collate_fn(batch):
-     inputs, labels = zip(*batch)  # Split the features and the labels
-     inputs = pad_sequence(inputs, batch_first=True, padding_value=0)  # Pad the audio sequences
-     labels = torch.tensor(labels, dtype=torch.long)  # Convert to a tensor
-     return inputs, labels
src/speech2.py DELETED
@@ -1,201 +0,0 @@
- import os
- import torch
- import torch.nn as nn
- import torch.optim as optim
- import soundfile as sf
- import torchaudio
- import numpy as np
- from datasets import Dataset
- from transformers import (
-     Wav2Vec2Model,
-     Wav2Vec2Processor
- )
- from dotenv import load_dotenv
- from sklearn.metrics import accuracy_score
-
- # Load .env for the Hugging Face API key
- load_dotenv()
- HF_API_KEY = os.getenv("HF_API_KEY")
-
- if not HF_API_KEY:
-     raise ValueError("Le token Hugging Face n'a pas été trouvé dans .env")
-
- # Label definitions for emotion classification
- LABELS = {"colere": 0, "neutre": 1, "joie": 2}
- NUM_LABELS = len(LABELS)
-
- # Load the processor and the model for feature extraction
- model_name = "facebook/wav2vec2-large-xlsr-53-french"
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- processor = Wav2Vec2Processor.from_pretrained(model_name)
- feature_extractor = Wav2Vec2Model.from_pretrained(model_name).to(device)
-
- # Resampler to convert to 16 kHz
- resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)
-
- # Improved classifier definition
- class EmotionClassifier(nn.Module):
-     def __init__(self, feature_dim, num_labels):
-         super(EmotionClassifier, self).__init__()
-         self.fc1 = nn.Linear(feature_dim, 256)
-         self.relu = nn.ReLU()
-         self.dropout = nn.Dropout(0.3)
-         self.fc2 = nn.Linear(256, num_labels)
-
-     def forward(self, x):
-         x = self.fc1(x)
-         x = self.relu(x)
-         x = self.dropout(x)
-         return self.fc2(x)
-
- # Instantiate the classifier
- classifier = EmotionClassifier(feature_extractor.config.hidden_size, NUM_LABELS).to(device)
-
- # Load the audio files and their labels
- def load_audio_data(data_dir):
-     data = []
-     for label_name, label_id in LABELS.items():
-         label_dir = os.path.join(data_dir, label_name)
-         for file in os.listdir(label_dir):
-             if file.endswith(".wav"):
-                 file_path = os.path.join(label_dir, file)
-                 data.append({"path": file_path, "label": label_id})
-     return Dataset.from_list(data)
-
- # Load the dataset
- data_dir = "./dataset"
- ds = load_audio_data(data_dir)
-
- # Load the audio files with SoundFile and resample to 16 kHz
- def preprocess_audio(batch):
-     speech, sample_rate = sf.read(batch["path"], dtype="float32")
-
-     if sample_rate != 16000:
-         speech = torch.tensor(speech).unsqueeze(0)
-         speech = resampler(speech).squeeze(0).numpy()
-
-     batch["speech"] = speech.tolist()  # Convert to a list to avoid PyArrow errors
-     batch["sampling_rate"] = 16000
-     return batch
-
- ds = ds.map(preprocess_audio)
-
- # Check the distribution of audio file lengths
- lengths = [len(sample["speech"]) for sample in ds]
- max_length = int(np.percentile(lengths, 95))
-
- # Turn the audio into features usable by the model
- def prepare_features(batch):
-     features = processor(
-         batch["speech"],
-         sampling_rate=16000,
-         padding=True,
-         truncation=True,
-         max_length=max_length,
-         return_tensors="pt"
-     )
-     batch["input_values"] = features.input_values.squeeze(0)
-     batch["label"] = torch.tensor(batch["label"], dtype=torch.long)
-     return batch
-
- ds = ds.map(prepare_features)
-
- # Split the data into train and test
- ds = ds.train_test_split(test_size=0.2)
- train_ds = ds["train"]
- test_ds = ds["test"]
-
- # Evaluation function on the test data
- def evaluate(classifier, feature_extractor, test_ds):
-     classifier.eval()
-     correct = 0
-     total = 0
-
-     with torch.no_grad():
-         for batch in test_ds:
-             input_values = processor(
-                 batch["speech"],
-                 sampling_rate=16000,
-                 return_tensors="pt",
-                 padding=True,
-                 truncation=True,
-                 max_length=max_length
-             ).input_values.to(device)
-
-             features = feature_extractor(input_values).last_hidden_state.mean(dim=1)
-             logits = classifier(features)
-             predictions = logits.argmax(dim=-1)
-             labels = torch.tensor(batch["label"], dtype=torch.long, device=device)
-
-             correct += (predictions == labels).sum().item()
-             total += 1
-
-     return correct / total
-
- # Training function
- def train_classifier(feature_extractor, classifier, train_ds, test_ds, epochs=10, batch_size=16):
-     optimizer = optim.Adam(classifier.parameters(), lr=1e-4)
-     scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)
-     loss_fn = nn.CrossEntropyLoss()
-
-     best_accuracy = 0.0  # Track the best accuracy seen so far
-
-     for epoch in range(epochs):
-         classifier.train()
-         total_loss, correct = 0, 0
-         batch_count = 0
-
-         for i in range(0, len(train_ds), batch_size):
-             batch = train_ds[i: i + batch_size]
-             optimizer.zero_grad()
-
-             input_values = processor(
-                 batch["speech"],
-                 sampling_rate=16000,
-                 return_tensors="pt",
-                 padding=True,
-                 truncation=True,
-                 max_length=max_length
-             ).input_values.to(device)
-
-             with torch.no_grad():
-                 features = feature_extractor(input_values).last_hidden_state.mean(dim=1)
-                 features = (features - features.mean()) / features.std()  # Normalization
-
-             logits = classifier(features)
-             labels = torch.tensor(batch["label"], dtype=torch.long, device=device)
-
-             if labels.numel() == 0:
-                 continue
-
-             loss = loss_fn(logits, labels)
-             loss.backward()
-             optimizer.step()
-
-             total_loss += loss.item()
-             correct += (logits.argmax(dim=-1) == labels).sum().item()
-             batch_count += 1
-
-         train_acc = correct / len(train_ds)
-         test_acc = evaluate(classifier, feature_extractor, test_ds)
-         scheduler.step()
-
-         # Save only if the test accuracy is the best obtained so far
-         if test_acc > best_accuracy:
-             best_accuracy = test_acc
-             torch.save({
-                 "classifier_state_dict": classifier.state_dict(),
-                 "feature_extractor_state_dict": feature_extractor.state_dict(),
-                 "processor": processor
-             }, "best_emotion_model.pth")
-             print(f"✅ Nouveau meilleur modèle sauvegardé ! Accuracy Test: {best_accuracy:.4f}")
-
-         print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/batch_count:.4f} - Train Accuracy: {train_acc:.4f} - Test Accuracy: {test_acc:.4f}")
-
-     return classifier
-
- # Training
- trained_classifier = train_classifier(feature_extractor, classifier, train_ds, test_ds, epochs=10, batch_size=16)
-
- print("✅ Entraînement terminé, le meilleur modèle a été sauvegardé !")
src/train.py ADDED
@@ -0,0 +1,93 @@
+ import torch
+ import torch.optim as optim
+ import torch.nn as nn
+ import numpy as np
+ from sklearn.metrics import accuracy_score
+ from utils.dataset import load_audio_data
+ from utils.preprocessing import preprocess_audio, prepare_features
+ from model.emotion_classifier import EmotionClassifier
+ from model.feature_extrator import feature_extractor, processor
+ from config import DEVICE, NUM_LABELS
+ import os
+
+ # Load the data
+ data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "data"))
+ print(f"data dir {data_dir}")
+ ds = load_audio_data(data_dir)
+
+ # Preprocessing
+ ds = ds.map(preprocess_audio)
+
+ # Adjust the maximum length
+ lengths = [len(sample["speech"]) for sample in ds]
+ max_length = int(np.percentile(lengths, 95))
+
+ ds = ds.map(lambda batch: prepare_features(batch, max_length))
+
+ # Train/test split
+ ds = ds.train_test_split(test_size=0.2)
+ train_ds, test_ds = ds["train"], ds["test"]
+
+ # Instantiate the model
+ classifier = EmotionClassifier(feature_extractor.config.hidden_size, NUM_LABELS).to(DEVICE)
+
+ # Training function
+ def train_classifier(classifier, train_ds, test_ds, epochs=20, batch_size=8):
+     optimizer = optim.AdamW(classifier.parameters(), lr=2e-5, weight_decay=0.01)
+     loss_fn = nn.CrossEntropyLoss()
+     best_accuracy = 0.0
+
+     for epoch in range(epochs):
+         classifier.train()
+         total_loss, correct = 0, 0
+         batch_count = 0
+
+         for i in range(0, len(train_ds), batch_size):
+             batch = train_ds[i: i + batch_size]
+             optimizer.zero_grad()
+
+             input_values = processor(
+                 batch["speech"],
+                 sampling_rate=16000,
+                 return_tensors="pt",
+                 padding=True,
+                 truncation=True,
+                 max_length=max_length
+             ).input_values.to(DEVICE)
+
+             with torch.no_grad():
+                 features = feature_extractor(input_values).last_hidden_state.mean(dim=1)
+
+             logits = classifier(features)
+             labels = torch.tensor(batch["label"], dtype=torch.long, device=DEVICE)
+
+             if labels.numel() == 0:
+                 continue
+
+             loss = loss_fn(logits, labels)
+             loss.backward()
+             optimizer.step()
+
+             total_loss += loss.item()
+             correct += (logits.argmax(dim=-1) == labels).sum().item()
+             batch_count += 1
+
+         train_acc = correct / len(train_ds)
+
+         if train_acc > best_accuracy:
+             best_accuracy = train_acc
+             torch.save({
+                 "classifier_state_dict": classifier.state_dict(),
+                 "feature_extractor_state_dict": feature_extractor.state_dict(),
+                 "processor": processor
+             }, "acc_model.pth")
+             print(f"Nouveau meilleur modèle sauvegardé ! Accuracy: {best_accuracy:.4f}")
+
+         print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/batch_count:.4f} - Accuracy: {train_acc:.4f}")
+
+     return classifier
+
+ # Start training
+ trained_classifier = train_classifier(classifier, train_ds, test_ds, epochs=20, batch_size=8)
+
+ print("✅ Entraînement terminé, le meilleur modèle a été sauvegardé !")
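For context, a minimal sketch (not part of the diff, assuming it is run from inside src/) of reloading the checkpoint that train.py writes; the file name and dictionary keys come from the torch.save call above, everything else is an assumption:

    import torch
    from model.emotion_classifier import EmotionClassifier
    from model.feature_extrator import feature_extractor
    from config import DEVICE, NUM_LABELS

    # weights_only=False because the checkpoint also pickles the processor object.
    checkpoint = torch.load("acc_model.pth", map_location=DEVICE, weights_only=False)
    classifier = EmotionClassifier(feature_extractor.config.hidden_size, NUM_LABELS).to(DEVICE)
    classifier.load_state_dict(checkpoint["classifier_state_dict"])
    feature_extractor.load_state_dict(checkpoint["feature_extractor_state_dict"])
    classifier.eval()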
src/utils/__init__.py ADDED
@@ -0,0 +1 @@
+
src/utils/dataset.py ADDED
@@ -0,0 +1,13 @@
+ import os
+ from datasets import Dataset
+ from config import LABELS
+
+ def load_audio_data(data_dir):
+     data = []
+     for label_name, label_id in LABELS.items():
+         label_dir = os.path.join(data_dir, label_name)
+         for file in os.listdir(label_dir):
+             if file.endswith(".wav"):
+                 file_path = os.path.join(label_dir, file)
+                 data.append({"path": file_path, "label": label_id})
+     return Dataset.from_list(data)
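For context, a minimal sketch (not part of the diff, file names hypothetical) of what load_audio_data expects: one subfolder per key of config.LABELS containing .wav files, e.g. data/colere/, data/neutre/ and data/joie/:

    from utils.dataset import load_audio_data

    ds = load_audio_data("data")   # scans data/colere, data/neutre, data/joie
    print(ds)                      # Dataset with columns 'path' and 'label'
    print(ds[0])                   # e.g. {'path': 'data/colere/<file>.wav', 'label': 0}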
src/utils/preprocessing.py ADDED
@@ -0,0 +1,33 @@
+ import soundfile as sf
+ import torch
+ import torchaudio
+ import numpy as np
+ from model.feature_extrator import processor
+ from config import DEVICE
+
+ # Resampler
+ resampler = torchaudio.transforms.Resample(orig_freq=48_000, new_freq=16_000)
+
+ def preprocess_audio(batch):
+     speech, sample_rate = sf.read(batch["path"], dtype="float32")
+
+     if sample_rate != 16000:
+         speech = torch.tensor(speech).unsqueeze(0)
+         speech = resampler(speech).squeeze(0).numpy()
+
+     batch["speech"] = speech.tolist()
+     batch["sampling_rate"] = 16000
+     return batch
+
+ def prepare_features(batch, max_length):
+     features = processor(
+         batch["speech"],
+         sampling_rate=16000,
+         padding=True,
+         truncation=True,
+         max_length=max_length,
+         return_tensors="pt"
+     )
+     batch["input_values"] = features.input_values.squeeze(0)
+     batch["label"] = torch.tensor(batch["label"], dtype=torch.long)
+     return batch
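For context, a minimal end-to-end sketch (not part of the diff; path and max_length are hypothetical, run from inside src/) of pushing one labelled clip through the two helpers, mirroring the ds.map calls in src/train.py:

    from utils.preprocessing import preprocess_audio, prepare_features

    batch = {"path": "data/joie/example.wav", "label": 2}   # one record from load_audio_data
    batch = preprocess_audio(batch)                         # adds a 16 kHz "speech" list
    batch = prepare_features(batch, max_length=80000)       # adds padded/truncated "input_values"
    print(len(batch["speech"]), batch["input_values"].shape, batch["label"])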