# Source: audio-electroma / tasks/utils/preprocess.py
# Last commit: "xgboost api (#5)" (6dacae5, verified) by electroma
import librosa
import numpy as np
# Function to resample the audio array
def resample_audio(array, orig_sr, target_sr):
    """
    Convert audio samples to a NumPy array and resample them if required.

    Args:
        array: Audio samples; anything accepted by ``np.array``.
        orig_sr: Sampling rate of the incoming samples.
        target_sr: Sampling rate the caller wants.

    Returns:
        np.ndarray: A NumPy copy of the input when both rates are equal,
        otherwise the output of ``librosa.resample`` at ``target_sr``.
    """
    samples = np.array(array)  # Always work on a NumPy copy of the input

    # Nothing to do when the audio is already at the requested rate.
    if orig_sr == target_sr:
        return samples

    return librosa.resample(samples, orig_sr=orig_sr, target_sr=target_sr)
def create_mel_spectrogram(waveform, sr, n_mels=128, n_fft=2048, hop_length=512):
    """
    Compute a decibel-scaled Mel spectrogram for an audio waveform.

    Args:
        waveform (np.ndarray): 1D NumPy array of the audio waveform.
        sr (int): Sampling rate of the waveform.
        n_mels (int): Number of Mel bands to generate.
        n_fft (int): Length of the FFT window.
        hop_length (int): Number of samples between successive frames.

    Returns:
        np.ndarray: 2D NumPy array of the Mel spectrogram in dB
        (shape: [n_mels, time]).
    """
    # Power Mel spectrogram straight from the time-domain signal.
    power_spec = librosa.feature.melspectrogram(
        y=waveform, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop_length
    )

    # Power -> dB, using the spectrogram's own maximum as the reference.
    # NOTE: no padding or truncation is applied here, so the result is not
    # forced to a fixed length.
    return librosa.power_to_db(power_spec, ref=np.max)