from functools import partial
from typing import Callable, Sequence, Union
import gin
import librosa
import numpy as np
import resampy
import scipy.io.wavfile as wavfile
from .f0_extraction import extract_f0_with_crepe, extract_f0_with_pyin
from .loudness_extraction import extract_perceptual_loudness, extract_rms
from .mfcc_extraction import extract_mfcc
from ...utils import apply, apply_unpack, unzip


def read_audio_files(files: list):
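    """Read a list of WAV files and return their sample rates and audio arrays."""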
rates_and_audios = apply(wavfile.read, files)
return unzip(rates_and_audios)


def convert_to_float32_audio(audio: np.ndarray):
    """Convert audio to float32, scaling integer PCM into [-1, 1]."""
    if np.issubdtype(audio.dtype, np.floating):
        # np.iinfo only accepts integer dtypes, so pass floating-point audio
        # straight through rather than crashing on e.g. float64 input.
        return audio.astype(np.float32)
    max_sample_value = np.iinfo(audio.dtype).max
    floating_point_audio = audio / max_sample_value
    return floating_point_audio.astype(np.float32)


def make_monophonic(audio: np.ndarray, strategy: str = "keep_left"):
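    """Mix `audio` down to a single channel.

    Mono input is returned unchanged. Stereo input is reduced according to
    `strategy`: "keep_left", "keep_right", "sum" (the channel mean), or
    "diff" (left minus right).
    """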
# deal with non stereo array formats
if len(audio.shape) == 1:
return audio
elif len(audio.shape) != 2:
raise ValueError("Unknown audio array format.")
# deal with single audio channel
if audio.shape[0] == 1:
return audio[0]
elif audio.shape[1] == 1:
return audio[:, 0]
# deal with more than two channels
elif audio.shape[0] != 2 and audio.shape[1] != 2:
raise ValueError("Expected stereo input audio but got too many channels.")
# put channel first
if audio.shape[1] == 2:
audio = audio.T
# make stereo audio monophonic
if strategy == "keep_left":
return audio[0]
elif strategy == "keep_right":
return audio[1]
elif strategy == "sum":
return np.mean(audio, axis=0)
elif strategy == "diff":
return audio[0] - audio[1]


def normalise_signal(audio: np.ndarray, factor: float):
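    """Scale `audio` down by a fixed normalisation factor."""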
return audio / factor


def resample_audio(audio: np.ndarray, original_sr: float, target_sr: float):
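    """Resample `audio` from `original_sr` to `target_sr` with resampy."""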
return resampy.resample(audio, original_sr, target_sr)


def segment_signal(
signal: np.ndarray,
sample_rate: float,
segment_length_in_seconds: float,
hop_length_in_seconds: float,
):
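    """Slice `signal` into overlapping segments.

    Segment length and hop are converted from seconds to samples, and the
    result follows librosa.util.frame's layout, with segments indexed along
    the final axis.
    """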
segment_length_in_samples = int(sample_rate * segment_length_in_seconds)
hop_length_in_samples = int(sample_rate * hop_length_in_seconds)
    # Recent versions of librosa make frame_length and hop_length keyword-only.
    segments = librosa.util.frame(
        signal,
        frame_length=segment_length_in_samples,
        hop_length=hop_length_in_samples,
    )
return segments


def filter_segments(
threshold: float,
key_segments: np.ndarray,
segments: Sequence[np.ndarray],
):
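    """Keep only segments whose mean key value exceeds `threshold`.

    `key_segments` (typically pitch-confidence frames) is averaged along its
    frame axis; the resulting boolean mask is applied along the final,
    segment-indexing axis of every array in `segments`.
    """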
mean_keys = key_segments.mean(axis=0)
mask = mean_keys > threshold
filtered_segments = apply(
lambda x: x[:, mask] if len(x.shape) == 2 else x[:, :, mask], segments
)
return filtered_segments


def preprocess_single_audio_file(
file: str,
control_decimation_factor: float,
target_sr: float = 16000.0,
segment_length_in_seconds: float = 4.0,
hop_length_in_seconds: float = 2.0,
confidence_threshold: float = 0.85,
f0_extractor: Callable = extract_f0_with_crepe,
loudness_extractor: Callable = extract_perceptual_loudness,
mfcc_extractor: Callable = extract_mfcc,
normalisation_factor: Union[float, None] = None,
):
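    """Preprocess one audio file into training-ready segments.

    The file is converted to float32, mixed down to mono, optionally
    normalised, and resampled to `target_sr`. f0, pitch confidence, loudness,
    and MFCC control signals are then extracted, all signals are segmented,
    and segments whose mean pitch confidence does not exceed
    `confidence_threshold` are discarded. Returns five lists of per-segment
    arrays: audio, f0, confidence, loudness, and MFCCs.
    """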
print("Loading audio file: %s..." % file)
original_sr, audio = wavfile.read(file)
audio = convert_to_float32_audio(audio)
audio = make_monophonic(audio)
if normalisation_factor:
audio = normalise_signal(audio, normalisation_factor)
print("Resampling audio file: %s..." % file)
audio = resample_audio(audio, original_sr, target_sr)
print("Extracting f0 with extractor '%s': %s..." % (f0_extractor.__name__, file))
f0, confidence = f0_extractor(audio)
print(
"Extracting loudness with extractor '%s': %s..."
% (loudness_extractor.__name__, file)
)
loudness = loudness_extractor(audio)
print(
"Extracting MFCC with extractor '%s': %s..." % (mfcc_extractor.__name__, file)
)
mfcc = mfcc_extractor(audio)
print("Segmenting audio file: %s..." % file)
segmented_audio = segment_signal(
audio, target_sr, segment_length_in_seconds, hop_length_in_seconds
)
print("Segmenting control signals: %s..." % file)
segmented_f0 = segment_signal(
f0,
target_sr / (control_decimation_factor or 1),
segment_length_in_seconds,
hop_length_in_seconds,
)
segmented_confidence = segment_signal(
confidence,
target_sr / (control_decimation_factor or 1),
segment_length_in_seconds,
hop_length_in_seconds,
)
segmented_loudness = segment_signal(
loudness,
target_sr / (control_decimation_factor or 1),
segment_length_in_seconds,
hop_length_in_seconds,
)
segmented_mfcc = segment_signal(
mfcc,
target_sr / (control_decimation_factor or 1),
segment_length_in_seconds,
hop_length_in_seconds,
)
(
filtered_audio,
filtered_f0,
filtered_confidence,
filtered_loudness,
filtered_mfcc,
) = filter_segments(
confidence_threshold,
segmented_confidence,
(
segmented_audio,
segmented_f0,
segmented_confidence,
segmented_loudness,
segmented_mfcc,
),
)
if filtered_audio.shape[-1] == 0:
print("No segments exceeding confidence threshold...")
audio_split, f0_split, confidence_split, loudness_split, mfcc_split = (
[],
[],
[],
[],
[],
)
    else:
        # Split the trailing segments axis into a list of per-segment arrays.
        split = lambda x: [e.squeeze() for e in np.split(x, x.shape[-1], -1)]
audio_split = split(filtered_audio)
f0_split = split(filtered_f0)
confidence_split = split(filtered_confidence)
loudness_split = split(filtered_loudness)
mfcc_split = split(filtered_mfcc)
return audio_split, f0_split, confidence_split, loudness_split, mfcc_split


@gin.configurable
def preprocess_audio(
files: list,
control_decimation_factor: float,
target_sr: float = 16000,
segment_length_in_seconds: float = 4.0,
hop_length_in_seconds: float = 2.0,
confidence_threshold: float = 0.85,
f0_extractor: Callable = extract_f0_with_crepe,
loudness_extractor: Callable = extract_perceptual_loudness,
normalise_audio: bool = False,
):
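    """Preprocess a list of audio files.

    Yields one (audio, f0, confidence, loudness, mfcc) tuple of segment lists
    per file. When `normalise_audio` is set, every file is divided by the
    peak absolute sample value found across the whole corpus, preserving
    relative levels between files.
    """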
if normalise_audio:
print("Finding normalisation factor...")
normalisation_factor = 0
for file in files:
_, audio = wavfile.read(file)
audio = convert_to_float32_audio(audio)
audio = make_monophonic(audio)
            # Track the peak absolute sample value across the corpus.
            max_value = np.abs(audio).max()
            normalisation_factor = max(normalisation_factor, max_value)
    processor = partial(
        preprocess_single_audio_file,
        control_decimation_factor=control_decimation_factor,
        target_sr=target_sr,
        segment_length_in_seconds=segment_length_in_seconds,
        hop_length_in_seconds=hop_length_in_seconds,
        confidence_threshold=confidence_threshold,
        f0_extractor=f0_extractor,
        loudness_extractor=loudness_extractor,
        normalisation_factor=normalisation_factor if normalise_audio else None,
    )
for file in files:
yield processor(file)
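

if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline itself (run with
    # `python -m` so the relative imports above resolve). The WAV paths are
    # hypothetical placeholders, and control_decimation_factor=128 is an
    # assumed hop size: one control frame per 128 audio samples.
    for audio, f0, confidence, loudness, mfcc in preprocess_audio(
        ["example_0.wav", "example_1.wav"], control_decimation_factor=128
    ):
        print("Kept %d segments above the confidence threshold." % len(audio))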