# NOTE(review): removed a non-code paste/extraction artifact that preceded the
# imports ("Spaces:" header, runs of stray tabs, and two "Runtime error" lines).
from functools import partial
from typing import Callable, Sequence, Union

import gin
import librosa
import numpy as np
import resampy
import scipy.io.wavfile as wavfile

from .f0_extraction import extract_f0_with_crepe, extract_f0_with_pyin
from .loudness_extraction import extract_perceptual_loudness, extract_rms
from .mfcc_extraction import extract_mfcc
from ...utils import apply, apply_unpack, unzip
def read_audio_files(files: list):
    """Read several WAV files at once.

    Args:
        files: Paths readable by ``scipy.io.wavfile.read``.

    Returns:
        The result of ``unzip`` over the ``(rate, audio)`` pairs —
        presumably a pair of sequences ``(rates, audios)``; verify
        against ``...utils.unzip``.
    """
    return unzip(apply(wavfile.read, files))
def convert_to_float32_audio(audio: np.ndarray):
    """Convert a PCM audio array to ``float32`` roughly in ``[-1, 1]``.

    Args:
        audio: Samples as returned by ``scipy.io.wavfile.read`` — any
            integer PCM dtype, or an already floating-point array.

    Returns:
        A ``float32`` array. ``float32`` input is returned unchanged;
        other float widths are cast without rescaling; signed integers
        are divided by the dtype maximum; unsigned integers (8-bit WAV
        is offset-binary, silence at the midpoint) are re-centred around
        zero before scaling.
    """
    if audio.dtype == np.float32:
        return audio
    # Other float widths (e.g. float64) only need a cast — np.iinfo would
    # raise on them, which the original code did.
    if np.issubdtype(audio.dtype, np.floating):
        return audio.astype(np.float32)
    info = np.iinfo(audio.dtype)
    if info.min == 0:
        # Unsigned PCM: shift to zero-centre first, otherwise the result
        # lives in [0, 1] with a DC offset instead of [-1, 1].
        midpoint = (info.max + 1) // 2
        return ((audio.astype(np.float32) - midpoint) / midpoint).astype(np.float32)
    return (audio / info.max).astype(np.float32)
def make_monophonic(audio: np.ndarray, strategy: str = "keep_left"):
    """Collapse a possibly-stereo audio array to a single channel.

    Args:
        audio: 1-D mono audio, or a 2-D array with the channel axis first
            or last (at most two channels).
        strategy: How to merge stereo — ``"keep_left"``, ``"keep_right"``,
            ``"sum"`` (channel mean) or ``"diff"`` (left minus right).

    Returns:
        A 1-D audio array.

    Raises:
        ValueError: On arrays that are not 1-D/2-D, on more than two
            channels, or on an unknown *strategy*.
    """
    # deal with non stereo array formats
    if len(audio.shape) == 1:
        return audio
    elif len(audio.shape) != 2:
        raise ValueError("Unknown audio array format.")
    # deal with single audio channel
    if audio.shape[0] == 1:
        return audio[0]
    elif audio.shape[1] == 1:
        return audio[:, 0]
    # deal with more than two channels
    elif audio.shape[0] != 2 and audio.shape[1] != 2:
        raise ValueError("Expected stereo input audio but got too many channels.")
    # put channel first
    if audio.shape[1] == 2:
        audio = audio.T
    # make stereo audio monophonic
    if strategy == "keep_left":
        return audio[0]
    elif strategy == "keep_right":
        return audio[1]
    elif strategy == "sum":
        return np.mean(audio, axis=0)
    elif strategy == "diff":
        return audio[0] - audio[1]
    # BUG FIX: an unrecognised strategy used to fall through and silently
    # return None; fail loudly instead.
    raise ValueError("Unknown monophonic strategy: %r" % (strategy,))
def normalise_signal(audio: np.ndarray, factor: float):
    """Scale *audio* by dividing every sample by *factor*."""
    scaled = np.divide(audio, factor)
    return scaled
def resample_audio(audio: np.ndarray, original_sr: float, target_sr: float):
    """Resample *audio* from *original_sr* to *target_sr* via resampy."""
    resampled = resampy.resample(audio, original_sr, target_sr)
    return resampled
def segment_signal(
    signal: np.ndarray,
    sample_rate: float,
    segment_length_in_seconds: float,
    hop_length_in_seconds: float,
):
    """Slice *signal* into overlapping frames along its last axis.

    Args:
        signal: 1-D (or 2-D feature) array sampled at *sample_rate*.
        sample_rate: Samples per second of *signal*.
        segment_length_in_seconds: Length of each frame, in seconds.
        hop_length_in_seconds: Hop between frame starts, in seconds.

    Returns:
        The framed view produced by ``librosa.util.frame`` (frames stacked
        on a new trailing axis).
    """
    frame_length = int(sample_rate * segment_length_in_seconds)
    hop_length = int(sample_rate * hop_length_in_seconds)
    # BUG FIX: librosa >= 0.10 made these parameters keyword-only, so the
    # previous positional call raises TypeError on current librosa.
    # Keywords work on both old and new versions.
    segments = librosa.util.frame(
        signal, frame_length=frame_length, hop_length=hop_length
    )
    return segments
def filter_segments(
    threshold: float,
    key_segments: np.ndarray,
    segments: Sequence[np.ndarray],
):
    """Keep only the segments whose mean key value exceeds *threshold*.

    Args:
        threshold: Minimum per-segment mean of *key_segments* to keep.
        key_segments: Framed key signal (e.g. pitch confidence) with the
            segment index on the last axis.
        segments: Framed arrays to filter in lockstep; segments are the
            last axis (2-D: samples x segments, otherwise assumed
            feature x frames x segments).

    Returns:
        The result of ``apply`` over *segments* with the boolean segment
        mask applied to each array's last axis.
    """
    keep = key_segments.mean(axis=0) > threshold

    def _select(segment_array):
        if len(segment_array.shape) == 2:
            return segment_array[:, keep]
        return segment_array[:, :, keep]

    return apply(_select, segments)
def preprocess_single_audio_file(
    file: str,
    control_decimation_factor: float,
    target_sr: float = 16000.0,
    segment_length_in_seconds: float = 4.0,
    hop_length_in_seconds: float = 2.0,
    confidence_threshold: float = 0.85,
    f0_extractor: Callable = extract_f0_with_crepe,
    loudness_extractor: Callable = extract_perceptual_loudness,
    mfcc_extractor: Callable = extract_mfcc,
    normalisation_factor: Union[float, None] = None,
):
    """Load one WAV file and produce confidence-filtered training segments.

    Pipeline: read -> float32 -> mono -> (optional) normalise -> resample
    to *target_sr* -> extract f0/confidence, loudness and MFCC controls ->
    segment audio and controls -> drop segments whose mean pitch confidence
    does not exceed *confidence_threshold* -> split into per-segment lists.

    Args:
        file: Path to a WAV file readable by ``scipy.io.wavfile``.
        control_decimation_factor: Ratio of the audio rate to the control
            signal rate; falsy values are treated as 1 (no decimation).
        target_sr: Sample rate the audio is resampled to.
        segment_length_in_seconds: Length of each analysis segment.
        hop_length_in_seconds: Hop between consecutive segments.
        confidence_threshold: Minimum mean pitch confidence per segment.
        f0_extractor: Callable returning ``(f0, confidence)`` for audio.
        loudness_extractor: Callable returning a loudness envelope.
        mfcc_extractor: Callable returning an MFCC matrix.
        normalisation_factor: If truthy, audio is divided by this value
            before resampling.

    Returns:
        Five lists ``(audio, f0, confidence, loudness, mfcc)``, one entry
        per retained segment; all empty when nothing passes the threshold.
    """
    print("Loading audio file: %s..." % file)
    original_sr, audio = wavfile.read(file)
    audio = convert_to_float32_audio(audio)
    audio = make_monophonic(audio)
    if normalisation_factor:
        audio = normalise_signal(audio, normalisation_factor)
    print("Resampling audio file: %s..." % file)
    audio = resample_audio(audio, original_sr, target_sr)
    print("Extracting f0 with extractor '%s': %s..." % (f0_extractor.__name__, file))
    f0, confidence = f0_extractor(audio)
    print(
        "Extracting loudness with extractor '%s': %s..."
        % (loudness_extractor.__name__, file)
    )
    loudness = loudness_extractor(audio)
    print(
        "Extracting MFCC with extractor '%s': %s..." % (mfcc_extractor.__name__, file)
    )
    mfcc = mfcc_extractor(audio)
    print("Segmenting audio file: %s..." % file)
    segmented_audio = segment_signal(
        audio, target_sr, segment_length_in_seconds, hop_length_in_seconds
    )
    print("Segmenting control signals: %s..." % file)
    # Control signals run at the (decimated) control rate, not the audio
    # rate — compute it once instead of in four copy-pasted call sites.
    control_sr = target_sr / (control_decimation_factor or 1)

    def _segment_control(control_signal):
        # Frame a control signal with the same window/hop (in seconds).
        return segment_signal(
            control_signal,
            control_sr,
            segment_length_in_seconds,
            hop_length_in_seconds,
        )

    segmented_f0 = _segment_control(f0)
    segmented_confidence = _segment_control(confidence)
    segmented_loudness = _segment_control(loudness)
    segmented_mfcc = _segment_control(mfcc)
    (
        filtered_audio,
        filtered_f0,
        filtered_confidence,
        filtered_loudness,
        filtered_mfcc,
    ) = filter_segments(
        confidence_threshold,
        segmented_confidence,
        (
            segmented_audio,
            segmented_f0,
            segmented_confidence,
            segmented_loudness,
            segmented_mfcc,
        ),
    )
    if filtered_audio.shape[-1] == 0:
        print("No segments exceeding confidence threshold...")
        return [], [], [], [], []
    # Split the trailing segment axis into a list of individual segments.
    split = lambda x: [e.squeeze() for e in np.split(x, x.shape[-1], -1)]
    return (
        split(filtered_audio),
        split(filtered_f0),
        split(filtered_confidence),
        split(filtered_loudness),
        split(filtered_mfcc),
    )
def preprocess_audio(
    files: list,
    control_decimation_factor: float,
    target_sr: float = 16000,
    segment_length_in_seconds: float = 4.0,
    hop_length_in_seconds: float = 2.0,
    confidence_threshold: float = 0.85,
    f0_extractor: Callable = extract_f0_with_crepe,
    loudness_extractor: Callable = extract_perceptual_loudness,
    normalise_audio: bool = False,
):
    """Lazily preprocess WAV files, yielding one segment-tuple per file.

    Args:
        files: Paths to WAV files readable by ``scipy.io.wavfile``.
        control_decimation_factor: Forwarded to
            ``preprocess_single_audio_file``.
        target_sr: Sample rate the audio is resampled to.
        segment_length_in_seconds: Length of each analysis segment.
        hop_length_in_seconds: Hop between consecutive segments.
        confidence_threshold: Minimum mean pitch confidence per segment.
        f0_extractor: Callable returning ``(f0, confidence)`` for audio.
        loudness_extractor: Callable returning a loudness envelope.
        normalise_audio: If True, divide every file by the global peak
            amplitude found across all *files* so scaling is consistent.

    Yields:
        The five-list result of ``preprocess_single_audio_file`` per file.
    """
    normalisation_factor = None
    if normalise_audio:
        print("Finding normalisation factor...")
        # Global peak across ALL files, so every file shares one scale.
        normalisation_factor = 0
        for file in files:
            _, audio = wavfile.read(file)
            audio = convert_to_float32_audio(audio)
            audio = make_monophonic(audio)
            normalisation_factor = max(normalisation_factor, np.abs(audio).max())
    processor = partial(
        preprocess_single_audio_file,
        control_decimation_factor=control_decimation_factor,
        target_sr=target_sr,
        segment_length_in_seconds=segment_length_in_seconds,
        hop_length_in_seconds=hop_length_in_seconds,
        # BUG FIX: confidence_threshold was accepted by this function but
        # never forwarded, so callers' thresholds were silently ignored
        # and the downstream default was always used.
        confidence_threshold=confidence_threshold,
        f0_extractor=f0_extractor,
        loudness_extractor=loudness_extractor,
        normalisation_factor=normalisation_factor,
    )
    for file in files:
        yield processor(file)
