from typing import Callable, Optional
import warnings
import gin
import librosa
import numpy as np
from .upsampling import linear_interpolation
def compute_power_spectrogram(
audio: np.ndarray,
n_fft: int,
hop_length: int,
window: str,
epsilon: float,
):
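    """Compute a magnitude spectrogram of ``audio`` and convert it to dB (relative to peak)."""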
spectrogram = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length, window=window)
magnitude_spectrogram = np.abs(spectrogram)
power_spectrogram = librosa.amplitude_to_db(
magnitude_spectrogram, ref=np.max, amin=epsilon
)
return power_spectrogram
def perform_perceptual_weighting(
power_spectrogram_in_db: np.ndarray, sample_rate: float, n_fft: int
):
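    """Apply an A-weighting curve (in dB) across the frequency bins of a dB spectrogram."""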
    # pass these as keywords: recent librosa versions make them keyword-only
    centre_frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
# We know that we will get a log(0) warning here due to the DC component -- we can
# safely ignore as it is clipped to the default min dB value of -80.0 dB
with warnings.catch_warnings():
warnings.simplefilter("ignore")
weights = librosa.A_weighting(centre_frequencies)
weights = np.expand_dims(weights, axis=1)
    # apply the per-bin A-weighting; the (n_bins, 1) weights broadcast over the time axis
    weighted_spectrogram = power_spectrogram_in_db + weights
return weighted_spectrogram
@gin.configurable
def extract_perceptual_loudness(
audio: np.ndarray,
sample_rate: float = 16000,
n_fft: int = 2048,
hop_length: int = 512,
window: str = "hann",
epsilon: float = 1e-5,
interpolate_fn: Optional[Callable] = linear_interpolation,
normalise: bool = True,
):
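    """Extract a frame-wise, A-weighted loudness envelope from ``audio``.

    The loudness is in dB (mapped to [0, 1] when ``normalise`` is True) and is
    upsampled to one value per sample when ``interpolate_fn`` is provided.
    """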
power_spectrogram = compute_power_spectrogram(
audio, n_fft=n_fft, hop_length=hop_length, window=window, epsilon=epsilon
)
perceptually_weighted_spectrogram = perform_perceptual_weighting(
power_spectrogram, sample_rate=sample_rate, n_fft=n_fft
)
    # average over frequency bins to get a single loudness value per frame
    loudness = np.mean(perceptually_weighted_spectrogram, axis=0)
    # optionally upsample the frame-rate loudness back to one value per sample
    if interpolate_fn:
        loudness = interpolate_fn(
            loudness, n_fft, hop_length, original_length=audio.size
        )
    # map librosa's default dB range of [-80, 0] onto [0, 1]
    if normalise:
        loudness = (loudness + 80) / 80
return loudness
@gin.configurable
def extract_rms(
audio: np.ndarray,
window_size: int = 2048,
hop_length: int = 512,
sample_rate: Optional[float] = 16000.0,
interpolate_fn: Optional[Callable] = linear_interpolation,
):
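    """Extract a frame-wise RMS energy envelope from ``audio``.

    The envelope is upsampled to one value per sample when ``interpolate_fn`` is provided.
    """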
    # zero-pad so that each analysis frame is centred on its timestamp
    padded_audio = np.pad(audio, (window_size // 2, window_size // 2))
    frames = librosa.util.frame(
        padded_audio, frame_length=window_size, hop_length=hop_length
    )
    # root-mean-square energy of each frame
    squared = frames ** 2
    mean = np.mean(squared, axis=0)
    root = np.sqrt(mean)
    # optionally upsample the frame-rate envelope back to one value per sample
    if interpolate_fn:
        assert sample_rate is not None, "Must provide sample rate if upsampling"
        root = interpolate_fn(root, window_size, hop_length, original_length=audio.size)
return root
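

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module). It assumes the file is
    # run as part of its package (e.g. `python -m <package>.<this_module>`) so that the
    # relative import of `linear_interpolation` resolves. The audio is a synthesised
    # one-second 440 Hz sine at 16 kHz, so the printed values are only illustrative.
    sr = 16000
    t = np.linspace(0.0, 1.0, sr, endpoint=False)
    test_audio = 0.5 * np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

    loudness = extract_perceptual_loudness(test_audio, sample_rate=sr)
    rms = extract_rms(test_audio, sample_rate=sr)
    print("loudness:", np.shape(loudness), "rms:", np.shape(rms))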