from functools import partial
from typing import Callable, Sequence, Union
import gin
import librosa
import numpy as np
import resampy
import scipy.io.wavfile as wavfile
from .f0_extraction import extract_f0_with_crepe, extract_f0_with_pyin
from .loudness_extraction import extract_perceptual_loudness, extract_rms
from .mfcc_extraction import extract_mfcc
from ...utils import apply, apply_unpack, unzip


def read_audio_files(files: list):
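    """Read a list of WAV files and return their sample rates and audio arrays."""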
rates_and_audios = apply(wavfile.read, files)
return unzip(rates_and_audios)


def convert_to_float32_audio(audio: np.ndarray):
    """Convert audio to float32, scaling integer PCM into [-1, 1]."""
    if np.issubdtype(audio.dtype, np.floating):
        # np.iinfo only accepts integer dtypes, so pass floating-point audio
        # straight through rather than crashing on e.g. float64 input.
        return audio.astype(np.float32)
    max_sample_value = np.iinfo(audio.dtype).max
    floating_point_audio = audio / max_sample_value
    return floating_point_audio.astype(np.float32)


def make_monophonic(audio: np.ndarray, strategy: str = "keep_left"):
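    """Mix `audio` down to a single channel.

    Mono input is returned unchanged. Stereo input is reduced according to
    `strategy`: "keep_left", "keep_right", "sum" (the channel mean), or
    "diff" (left minus right).
    """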
# deal with non stereo array formats
if len(audio.shape) == 1:
return audio
elif len(audio.shape) != 2:
raise ValueError("Unknown audio array format.")
# deal with single audio channel
if audio.shape[0] == 1:
return audio[0]
elif audio.shape[1] == 1:
return audio[:, 0]
# deal with more than two channels
elif audio.shape[0] != 2 and audio.shape[1] != 2:
raise ValueError("Expected stereo input audio but got too many channels.")
# put channel first
if audio.shape[1] == 2:
audio = audio.T
# make stereo audio monophonic
if strategy == "keep_left":
return audio[0]
elif strategy == "keep_right":
return audio[1]
elif strategy == "sum":
return np.mean(audio, axis=0)
elif strategy == "diff":
return audio[0] - audio[1]


def normalise_signal(audio: np.ndarray, factor: float):
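    """Scale `audio` down by a fixed normalisation factor."""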
return audio / factor


def resample_audio(audio: np.ndarray, original_sr: float, target_sr: float):
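    """Resample `audio` from `original_sr` to `target_sr` with resampy."""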
return resampy.resample(audio, original_sr, target_sr)


def segment_signal(
signal: np.ndarray,
sample_rate: float,
segment_length_in_seconds: float,
hop_length_in_seconds: float,
):
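    """Slice `signal` into overlapping segments.

    Segment length and hop are converted from seconds to samples, and the
    result follows librosa.util.frame's layout, with segments indexed along
    the final axis.
    """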
segment_length_in_samples = int(sample_rate * segment_length_in_seconds)
hop_length_in_samples = int(sample_rate * hop_length_in_seconds)
    # Recent versions of librosa make frame_length and hop_length keyword-only.
    segments = librosa.util.frame(
        signal,
        frame_length=segment_length_in_samples,
        hop_length=hop_length_in_samples,
    )
return segments


def filter_segments(
threshold: float,
key_segments: np.ndarray,
segments: Sequence[np.ndarray],
):
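    """Keep only segments whose mean key value exceeds `threshold`.

    `key_segments` (typically pitch-confidence frames) is averaged along its
    frame axis; the resulting boolean mask is applied along the final,
    segment-indexing axis of every array in `segments`.
    """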
mean_keys = key_segments.mean(axis=0)
mask = mean_keys > threshold
filtered_segments = apply(
lambda x: x[:, mask] if len(x.shape) == 2 else x[:, :, mask], segments
)
return filtered_segments


def preprocess_single_audio_file(
file: str,
control_decimation_factor: float,
target_sr: float = 16000.0,
segment_length_in_seconds: float = 4.0,
hop_length_in_seconds: float = 2.0,
confidence_threshold: float = 0.85,
f0_extractor: Callable = extract_f0_with_crepe,
loudness_extractor: Callable = extract_perceptual_loudness,
mfcc_extractor: Callable = extract_mfcc,
normalisation_factor: Union[float, None] = None,
):
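    """Preprocess one audio file into training-ready segments.

    The file is converted to float32, mixed down to mono, optionally
    normalised, and resampled to `target_sr`. f0, pitch confidence, loudness,
    and MFCC control signals are then extracted, all signals are segmented,
    and segments whose mean pitch confidence does not exceed
    `confidence_threshold` are discarded. Returns five lists of per-segment
    arrays: audio, f0, confidence, loudness, and MFCCs.
    """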
print("Loading audio file: %s..." % file)
original_sr, audio = wavfile.read(file)
audio = convert_to_float32_audio(audio)
audio = make_monophonic(audio)
if normalisation_factor:
audio = normalise_signal(audio, normalisation_factor)
print("Resampling audio file: %s..." % file)
audio = resample_audio(audio, original_sr, target_sr)
print("Extracting f0 with extractor '%s': %s..." % (f0_extractor.__name__, file))
f0, confidence = f0_extractor(audio)
print(
"Extracting loudness with extractor '%s': %s..."
% (loudness_extractor.__name__, file)
)
loudness = loudness_extractor(audio)
print(
"Extracting MFCC with extractor '%s': %s..." % (mfcc_extractor.__name__, file)
)
mfcc = mfcc_extractor(audio)
print("Segmenting audio file: %s..." % file)
segmented_audio = segment_signal(
audio, target_sr, segment_length_in_seconds, hop_length_in_seconds
)
print("Segmenting control signals: %s..." % file)
segmented_f0 = segment_signal(
f0,
target_sr / (control_decimation_factor or 1),
segment_length_in_seconds,
hop_length_in_seconds,
)
segmented_confidence = segment_signal(
confidence,
target_sr / (control_decimation_factor or 1),
segment_length_in_seconds,
hop_length_in_seconds,
)
segmented_loudness = segment_signal(
loudness,
target_sr / (control_decimation_factor or 1),
segment_length_in_seconds,
hop_length_in_seconds,
)
segmented_mfcc = segment_signal(
mfcc,
target_sr / (control_decimation_factor or 1),
segment_length_in_seconds,
hop_length_in_seconds,
)
(
filtered_audio,
filtered_f0,
filtered_confidence,
filtered_loudness,
filtered_mfcc,
) = filter_segments(
confidence_threshold,
segmented_confidence,
(
segmented_audio,
segmented_f0,
segmented_confidence,
segmented_loudness,
segmented_mfcc,
),
)
if filtered_audio.shape[-1] == 0:
print("No segments exceeding confidence threshold...")
audio_split, f0_split, confidence_split, loudness_split, mfcc_split = (
[],
[],
[],
[],
[],
)
    else:
        # Split the trailing segments axis into a list of per-segment arrays.
        split = lambda x: [e.squeeze() for e in np.split(x, x.shape[-1], -1)]
audio_split = split(filtered_audio)
f0_split = split(filtered_f0)
confidence_split = split(filtered_confidence)
loudness_split = split(filtered_loudness)
mfcc_split = split(filtered_mfcc)
return audio_split, f0_split, confidence_split, loudness_split, mfcc_split


@gin.configurable
def preprocess_audio(
files: list,
control_decimation_factor: float,
target_sr: float = 16000,
segment_length_in_seconds: float = 4.0,
hop_length_in_seconds: float = 2.0,
confidence_threshold: float = 0.85,
f0_extractor: Callable = extract_f0_with_crepe,
loudness_extractor: Callable = extract_perceptual_loudness,
normalise_audio: bool = False,
):
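    """Preprocess a list of audio files.

    Yields one (audio, f0, confidence, loudness, mfcc) tuple of segment lists
    per file. When `normalise_audio` is set, every file is divided by the
    peak absolute sample value found across the whole corpus, preserving
    relative levels between files.
    """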
if normalise_audio:
print("Finding normalisation factor...")
normalisation_factor = 0
for file in files:
_, audio = wavfile.read(file)
audio = convert_to_float32_audio(audio)
audio = make_monophonic(audio)
            # Track the peak absolute sample value across the corpus.
            max_value = np.abs(audio).max()
            normalisation_factor = max(normalisation_factor, max_value)
    processor = partial(
        preprocess_single_audio_file,
        control_decimation_factor=control_decimation_factor,
        target_sr=target_sr,
        segment_length_in_seconds=segment_length_in_seconds,
        hop_length_in_seconds=hop_length_in_seconds,
        confidence_threshold=confidence_threshold,
        f0_extractor=f0_extractor,
        loudness_extractor=loudness_extractor,
        normalisation_factor=normalisation_factor if normalise_audio else None,
    )
for file in files:
yield processor(file)
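

if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline itself (run with
    # `python -m` so the relative imports above resolve). The WAV paths are
    # hypothetical placeholders, and control_decimation_factor=128 is an
    # assumed hop size: one control frame per 128 audio samples.
    for audio, f0, confidence, loudness, mfcc in preprocess_audio(
        ["example_0.wav", "example_1.wav"], control_decimation_factor=128
    ):
        print("Kept %d segments above the confidence threshold." % len(audio))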