import librosa
import numpy as np
import soundfile as sf
import torch
import torchaudio

from model.feature_extractor import processor
from config import DEVICE

TARGET_SAMPLE_RATE = 16_000


def preprocess_audio(batch):
    """Load one audio file as a mono float32 waveform resampled to 16 kHz."""
    speech, sample_rate = sf.read(batch["path"], dtype="float32")

    # Collapse multi-channel audio to mono; the MFCC extraction downstream
    # expects a 1-D waveform.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)
    speech = np.asarray(speech, dtype=np.float32)

    # Build the resampler from the file's actual sample rate instead of
    # assuming 48 kHz, and only resample when needed.
    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE
        )
        speech = resampler(torch.from_numpy(speech).unsqueeze(0)).squeeze(0).numpy()

    batch["speech"] = speech
    batch["sampling_rate"] = TARGET_SAMPLE_RATE
    return batch

def prepare_features(batch, max_length):
    """Compute 40 MFCCs and pad or truncate the time axis to max_length frames."""
    y = np.asarray(batch["speech"], dtype=np.float32)
    sr = batch["sampling_rate"]

    # librosa returns an (n_mfcc, n_frames) matrix.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)

    # Force a fixed number of frames: truncate long clips, zero-pad short ones.
    if mfcc.shape[1] > max_length:
        mfcc = mfcc[:, :max_length]
    else:
        pad_width = max_length - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode="constant")

    # Store as (time, n_mfcc); collate_fn pads along the first axis.
    batch["input_values"] = torch.tensor(mfcc.T, dtype=torch.float32)
    return batch

def collate_fn(batch):
    """Assemble a batch, padding every sample's features to the batch maximum."""
    inputs = [np.asarray(sample["input_values"], dtype=np.float32) for sample in batch]
    labels = torch.tensor([sample["label"] for sample in batch], dtype=torch.long)

    # Zero-pad each (time, n_mfcc) matrix along the time axis to the longest
    # sample in the batch.
    max_length = max(x.shape[0] for x in inputs)
    padded_inputs = [
        np.pad(x, ((0, max_length - x.shape[0]), (0, 0)), mode="constant")
        for x in inputs
    ]

    # Stack first, then convert: calling torch.tensor on a list of ndarrays
    # is slow and triggers a warning.
    inputs_tensor = torch.from_numpy(np.stack(padded_inputs))

    return inputs_tensor, labels
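

# Minimal usage sketch of the pipeline. The file name "sample.wav", the label
# value, and max_length=300 are illustrative placeholders, not values taken
# from the project's dataset or config.
if __name__ == "__main__":
    sample = preprocess_audio({"path": "sample.wav", "label": 0})
    sample = prepare_features(sample, max_length=300)

    # collate_fn takes a list of prepared samples, as a DataLoader would
    # pass it, and returns padded tensors.
    inputs, labels = collate_fn([sample, sample])
    print(inputs.shape)  # torch.Size([2, 300, 40]) -> (batch, time, n_mfcc)
    print(labels.shape)  # torch.Size([2])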