|
import soundfile as sf |
|
import torch |
|
import torchaudio |
|
import numpy as np |
|
from model.feature_extrator import processor |
|
from config import DEVICE |
|
|
|
|
|
# Shared transform that converts 48 kHz audio to 16 kHz. The input rate is
# fixed at construction time, so it is only valid for audio sampled at 48 kHz.
resampler = torchaudio.transforms.Resample(
    orig_freq=48_000,
    new_freq=16_000,
)
|
|
|
def preprocess_audio(batch):
    """Load the audio file at ``batch["path"]`` and store it as 16 kHz samples.

    Args:
        batch: Mapping with a ``"path"`` entry readable by ``soundfile``.

    Returns:
        The same ``batch`` object, with ``"speech"`` set to a list of float
        samples and ``"sampling_rate"`` set to 16000.
    """
    target_rate = 16000
    speech, sample_rate = sf.read(batch["path"], dtype="float32")

    # soundfile yields a (frames, channels) array for multi-channel files.
    # Average the channels so resampling runs along the time axis and the
    # stored signal is a flat list of samples.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    if sample_rate != target_rate:
        # Resample from the file's real rate; a transform with a fixed input
        # rate would mis-scale any file not recorded at that exact rate.
        waveform = torch.tensor(speech).unsqueeze(0)
        waveform = torchaudio.functional.resample(
            waveform,
            orig_freq=sample_rate,
            new_freq=target_rate,
        )
        speech = waveform.squeeze(0).numpy()

    batch["speech"] = speech.tolist()
    batch["sampling_rate"] = target_rate
    return batch
|
|
|
def prepare_features(batch, max_length):
    """Turn ``batch["speech"]`` into model inputs and tensorize the label.

    Args:
        batch: Mapping holding ``"speech"`` and ``"label"`` entries.
        max_length: Forwarded to ``processor`` as its ``max_length`` argument
            (truncation is enabled).

    Returns:
        The same ``batch`` object, with ``"input_values"`` added and
        ``"label"`` replaced by a ``torch.long`` tensor.
    """
    # NOTE(review): ``processor`` is imported from model.feature_extrator;
    # presumably a Hugging Face style feature extractor -- confirm there.
    processor_options = {
        "sampling_rate": 16000,
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    encoded = processor(batch["speech"], **processor_options)

    # squeeze(0) removes the leading dimension only when its size is 1.
    input_values = encoded.input_values
    batch["input_values"] = input_values.squeeze(0)

    batch["label"] = torch.tensor(batch["label"], dtype=torch.long)
    return batch
|
|