File size: 2,742 Bytes
607ecc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from typing import Callable, Optional
import warnings

import gin
import librosa
import numpy as np

from .upsampling import linear_interpolation


def compute_power_spectrogram(
    audio: np.ndarray,
    n_fft: int,
    hop_length: int,
    window: str,
    epsilon: float,
):
    """Return the dB-scaled magnitude STFT of ``audio``.

    The short-time Fourier transform is computed with ``librosa.stft``, its
    magnitude is taken, and the result is converted to decibels with
    ``librosa.amplitude_to_db``.

    Args:
        audio: Signal passed straight to ``librosa.stft``.
        n_fft: FFT size forwarded to ``librosa.stft``.
        hop_length: Hop between successive frames, forwarded to
            ``librosa.stft``.
        window: Window specification forwarded to ``librosa.stft``.
        epsilon: Forwarded as ``amin`` to ``librosa.amplitude_to_db``, i.e.
            the floor applied to amplitudes before the log is taken.

    Returns:
        The spectrogram in dB, measured relative to its own maximum
        (``ref=np.max``).
    """
    magnitudes = np.abs(
        librosa.stft(audio, n_fft=n_fft, hop_length=hop_length, window=window)
    )
    # ref=np.max makes the loudest bin the 0 dB reference.
    return librosa.amplitude_to_db(magnitudes, ref=np.max, amin=epsilon)


def perform_perceptual_weighting(
    power_spectrogram_in_db: np.ndarray, sample_rate: float, n_fft: int
):
    """Return the spectrogram that feeds the loudness computation.

    A-weighting offsets are computed for the FFT bin centre frequencies, but
    they are currently NOT added to the spectrogram: the addition is disabled
    below, so the input is returned unchanged.

    Args:
        power_spectrogram_in_db: Spectrogram in dB. The weights are expanded
            on axis 1, so axis 0 is presumably the frequency-bin axis --
            confirm against the caller.
        sample_rate: Sample rate used to derive the FFT bin frequencies.
        n_fft: FFT size used to derive the FFT bin frequencies.

    Returns:
        ``power_spectrogram_in_db`` itself (the same object, unweighted).
    """
    # librosa >= 0.10 only accepts these arguments by keyword; the keyword
    # form is also accepted by earlier releases, so this works on both.
    centre_frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)

    # We know that we will get a log(0) warning here due to the DC component -- we can
    # safely ignore as it is clipped to the default min dB value of -80.0 dB
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        weights = librosa.A_weighting(centre_frequencies)

    # Column vector so the weights would broadcast across axis 1 if re-enabled.
    weights = np.expand_dims(weights, axis=1)
    # NOTE(review): the weighting is intentionally left disabled here to keep
    # the existing output unchanged; `weights` is computed but unused.
    weighted_spectrogram = power_spectrogram_in_db  # + weights
    return weighted_spectrogram


@gin.configurable
def extract_perceptual_loudness(
    audio: np.ndarray,
    sample_rate: float = 16000,
    n_fft: int = 2048,
    hop_length: int = 512,
    window: str = "hann",
    epsilon: float = 1e-5,
    interpolate_fn: Optional[Callable] = linear_interpolation,
    normalise: bool = True,
):
    """Extract a loudness curve from ``audio``.

    The dB spectrogram from ``compute_power_spectrogram`` is passed through
    ``perform_perceptual_weighting`` and averaged over axis 0 to give one
    value per frame. The curve is then optionally resampled and rescaled.

    Args:
        audio: Input signal; ``audio.size`` is handed to ``interpolate_fn``
            as ``original_length``.
        sample_rate: Sample rate forwarded to the weighting step.
        n_fft: FFT size for the spectrogram.
        hop_length: Hop between frames for the spectrogram.
        window: Window specification for the spectrogram.
        epsilon: Amplitude floor forwarded to the spectrogram computation.
        interpolate_fn: Optional callable invoked as
            ``interpolate_fn(loudness, n_fft, hop_length, original_length=...)``.
            Skipped when falsy.
        normalise: When true, the result is mapped through ``(x + 80) / 80``.

    Returns:
        The loudness curve, after interpolation and normalisation if enabled.
    """
    db_spectrogram = compute_power_spectrogram(
        audio, n_fft=n_fft, hop_length=hop_length, window=window, epsilon=epsilon
    )
    weighted_db = perform_perceptual_weighting(
        db_spectrogram, sample_rate=sample_rate, n_fft=n_fft
    )

    # One value per frame: average over axis 0 of the spectrogram.
    frame_loudness = np.mean(weighted_db, axis=0)

    if interpolate_fn:
        frame_loudness = interpolate_fn(
            frame_loudness, n_fft, hop_length, original_length=audio.size
        )

    # NOTE(review): the 80 presumably matches librosa's default 80 dB dynamic
    # range, mapping [-80, 0] dB onto [0, 1] -- confirm.
    return (frame_loudness + 80) / 80 if normalise else frame_loudness


@gin.configurable
def extract_rms(
    audio: np.ndarray,
    window_size: int = 2048,
    hop_length: int = 512,
    sample_rate: Optional[float] = 16000.0,
    interpolate_fn: Optional[Callable] = linear_interpolation,
):
    """Compute the frame-wise root-mean-square level of ``audio``.

    The signal is padded by half a window on each side so frames are centred,
    sliced into overlapping frames, and the RMS of each frame is taken.

    Args:
        audio: Input signal; ``audio.size`` is handed to ``interpolate_fn``
            as ``original_length``.
        window_size: Number of samples per frame.
        hop_length: Number of samples between the starts of successive frames.
        sample_rate: Only checked for ``None`` when ``interpolate_fn`` is
            set; it is not otherwise used by this function.
        interpolate_fn: Optional callable invoked as
            ``interpolate_fn(rms, window_size, hop_length, original_length=...)``.
            Skipped when falsy.

    Returns:
        The RMS curve, passed through ``interpolate_fn`` when one is given.

    Raises:
        AssertionError: If ``interpolate_fn`` is set and ``sample_rate`` is
            ``None``.
    """
    # pad audio to centre frames
    half_window = window_size // 2
    padded_audio = np.pad(audio, (half_window, half_window))

    # librosa >= 0.10 only accepts frame_length/hop_length by keyword; the
    # keyword form is also accepted by earlier releases.
    frames = librosa.util.frame(
        padded_audio, frame_length=window_size, hop_length=hop_length
    )

    # Mean over axis 0 (samples within a frame) leaves one value per frame.
    root = np.sqrt(np.mean(frames ** 2, axis=0))

    if interpolate_fn:
        assert sample_rate is not None, "Must provide sample rate if upsampling"
        root = interpolate_fn(root, window_size, hop_length, original_length=audio.size)

    return root