# Paired input/target audio dataset used for DeepAFx-ST style-transfer training.
import csv
import glob
import os
import random
import sys
from typing import Any, List

import torch
from tqdm import tqdm

import deepafx_st.utils as utils
import deepafx_st.data.augmentations as augmentations
from deepafx_st.data.audio import AudioFile
class AudioDataset(torch.utils.data.Dataset):
    """Audio dataset which returns an input and target file.

    Args:
        audio_dir (str): Path to the top level of the audio dataset.
        input_dirs (List[str], optional): List of paths to the directories containing input audio files. Default: ["cleanraw"]
        subset (str, optional): Dataset subset. One of ["train", "val", "test"]. Default: "train"
        length (int, optional): Number of samples to load for each example. Default: 65536
        train_frac (float, optional): Fraction of the files to use for training subset. Default: 0.8
        val_per (float, optional): Fraction of the files to use for validation subset. Default: 0.1
        buffer_size_gb (float, optional): Size of audio to read into RAM in GB at any given time. Default: 1.0
            Note: This is the buffer size PER DataLoader worker. So total RAM = buffer_size_gb * num_workers
        buffer_reload_rate (int, optional): Number of items to generate before loading next chunk of dataset. Default: 1000
        half (bool, optional): Store audio samples as float16. Default: False
        num_examples_per_epoch (int, optional): Define an epoch as certain number of audio examples. Default: 10000
        random_scale_input (bool, optional): Apply random gain scaling to input utterances. Default: False
        random_scale_target (bool, optional): Apply same random gain scaling to target utterances. Default: False
        augmentations (dict, optional): Augmentation types (and their parameters) to apply to inputs. Default: {}
        freq_corrupt (bool, optional): Apply bad EQ filters. Default: False
        drc_corrupt (bool, optional): Apply an expander to corrupt dynamic range. Default: False
        ext (str, optional): Expected audio file extension. Default: "wav"
    """

    def __init__(
        self,
        audio_dir,
        input_dirs: List[str] = None,
        subset: str = "train",
        length: int = 65536,
        train_frac: float = 0.8,
        val_per: float = 0.1,
        buffer_size_gb: float = 1.0,
        buffer_reload_rate: float = 1000,
        half: bool = False,
        num_examples_per_epoch: int = 10000,
        random_scale_input: bool = False,
        random_scale_target: bool = False,
        augmentations: dict = None,
        freq_corrupt: bool = False,
        drc_corrupt: bool = False,
        ext: str = "wav",
    ):
        super().__init__()
        # None sentinels avoid the shared-mutable-default pitfall; the
        # effective defaults are unchanged (["cleanraw"] and {}).
        if input_dirs is None:
            input_dirs = ["cleanraw"]
        if augmentations is None:
            augmentations = {}

        self.audio_dir = audio_dir
        self.dataset_name = os.path.basename(audio_dir)
        self.input_dirs = input_dirs
        self.subset = subset
        self.length = length
        self.train_frac = train_frac
        self.val_per = val_per
        self.buffer_size_gb = buffer_size_gb
        self.buffer_reload_rate = buffer_reload_rate
        self.half = half
        self.num_examples_per_epoch = num_examples_per_epoch
        self.random_scale_input = random_scale_input
        self.random_scale_target = random_scale_target
        self.augmentations = augmentations
        self.freq_corrupt = freq_corrupt
        self.drc_corrupt = drc_corrupt
        self.ext = ext

        # locate all candidate audio files across the input directories
        self.input_filepaths = []
        for input_dir in input_dirs:
            search_path = os.path.join(audio_dir, input_dir, f"*.{ext}")
            self.input_filepaths += glob.glob(search_path)
        self.input_filepaths = sorted(self.input_filepaths)

        # create dataset split based on subset
        self.input_filepaths = utils.split_dataset(
            self.input_filepaths,
            subset,
            train_frac,
        )

        # get details about input audio files (metadata only, no preload)
        input_files = {}
        input_dur_frames = 0
        for input_filepath in tqdm(self.input_filepaths, ncols=80):
            file_id = os.path.basename(input_filepath)
            audio_file = AudioFile(
                input_filepath,
                preload=False,
                half=half,
            )
            # skip files too short to yield one input/target patch pair
            if audio_file.num_frames < (self.length * 2):
                continue
            input_files[file_id] = audio_file
            input_dur_frames += input_files[file_id].num_frames

        if not input_files:
            raise RuntimeError(f"No files found in {search_path}.")

        input_dur_hr = (input_dur_frames / input_files[file_id].sample_rate) / 3600
        print(
            f"\nLoaded {len(input_files)} files for {subset} = {input_dur_hr:0.2f} hours."
        )

        # assumes all files in the dataset share one sample rate — TODO confirm
        self.sample_rate = input_files[file_id].sample_rate

        # save a csv file with details about the train and test split
        splits_dir = os.path.join("configs", "splits")
        if not os.path.isdir(splits_dir):
            os.makedirs(splits_dir)
        csv_filepath = os.path.join(splits_dir, f"{self.dataset_name}_{self.subset}_set.csv")

        with open(csv_filepath, "w") as fp:
            dw = csv.DictWriter(fp, ["file_id", "filepath", "type", "subset"])
            dw.writeheader()

            for input_filepath in self.input_filepaths:
                dw.writerow(
                    {
                        "file_id": self.get_file_id(input_filepath),
                        "filepath": input_filepath,
                        "type": "input",
                        "subset": self.subset,
                    }
                )

        # some setup for iterable loading of the dataset into RAM;
        # starting at the reload rate forces a buffer load on first access
        self.items_since_load = self.buffer_reload_rate

    def __len__(self):
        # an "epoch" is a fixed number of generated examples, not file count
        return self.num_examples_per_epoch

    def load_audio_buffer(self):
        """Fill the RAM buffer with (up to buffer_size_gb) preloaded files."""
        self.input_files_loaded = {}  # clear audio buffer
        self.items_since_load = 0  # reset iteration counter
        nbytes_loaded = 0  # counter for data in RAM

        # shuffle so each DataLoader worker sees a different subset
        random.shuffle(self.input_filepaths)

        # load files into RAM
        for input_filepath in self.input_filepaths:
            file_id = os.path.basename(input_filepath)
            audio_file = AudioFile(
                input_filepath,
                preload=True,
                half=self.half,
            )

            if audio_file.num_frames < (self.length * 2):
                continue

            self.input_files_loaded[file_id] = audio_file

            nbytes = audio_file.audio.element_size() * audio_file.audio.nelement()
            nbytes_loaded += nbytes

            # check the size of loaded data
            if nbytes_loaded > self.buffer_size_gb * 1e9:
                break

    def generate_pair(self):
        """Draw a random loaded file and produce one (input, target) pair.

        Both signals start from the same randomly selected (and optionally
        augmented) patch; each then receives its own corruption chain so the
        pair differs in EQ/dynamics style.

        Returns:
            input_audio_corrupt (Tensor): Corrupted input, shape (1, length*2).
            target_audio_corrupt (Tensor): Corrupted target, shape (1, length*2).
        """
        # ------------------------ Input audio ----------------------
        rand_input_file_id = None
        input_file = None
        start_idx = None
        stop_idx = None
        while True:
            rand_input_file_id = self.get_random_file_id(self.input_files_loaded.keys())

            # use this random key to retrieve an input file
            input_file = self.input_files_loaded[rand_input_file_id]

            # the buffer should only contain preloaded files
            if not input_file.loaded:
                raise RuntimeError("Audio not loaded.")

            # get a random patch of size `self.length` x 2
            start_idx, stop_idx = self.get_random_patch(
                input_file, int(self.length * 2)
            )
            # (-1, -1) signals a silent file; retry with another file
            if start_idx >= 0:
                break

        input_audio = input_file.audio[:, start_idx:stop_idx].clone().detach()
        input_audio = input_audio.view(1, -1)

        if self.half:
            input_audio = input_audio.float()

        # peak normalize to -12 dBFS
        input_audio /= input_audio.abs().max()
        input_audio *= 10 ** (-12.0 / 20)  # with min 3 dBFS headroom

        # apply configured augmentations to ~50% of examples
        if self.augmentations:
            if torch.rand(1).sum() < 0.5:
                input_audio_aug = augmentations.apply(
                    [input_audio],
                    self.sample_rate,
                    self.augmentations,
                )[0]
            else:
                input_audio_aug = input_audio.clone()
        else:
            input_audio_aug = input_audio.clone()

        input_audio_corrupt = input_audio_aug.clone()
        # apply frequency and dynamic range corruption (expander)
        if self.freq_corrupt and torch.rand(1).sum() < 0.75:
            input_audio_corrupt = augmentations.frequency_corruption(
                [input_audio_corrupt], self.sample_rate
            )[0]

        # peak normalize again before passing through dynamic range expander
        input_audio_corrupt /= input_audio_corrupt.abs().max()
        input_audio_corrupt *= 10 ** (-12.0 / 20)  # with min 3 dBFS headroom

        if self.drc_corrupt and torch.rand(1).sum() < 0.10:
            input_audio_corrupt = augmentations.dynamic_range_corruption(
                [input_audio_corrupt], self.sample_rate
            )[0]

        # ------------------------ Target audio ----------------------
        # use the same augmented audio clip, add different random EQ and compressor
        target_audio_corrupt = input_audio_aug.clone()
        # apply frequency and dynamic range corruption (compressor)
        if self.freq_corrupt and torch.rand(1).sum() < 0.75:
            target_audio_corrupt = augmentations.frequency_corruption(
                [target_audio_corrupt], self.sample_rate
            )[0]

        # peak normalize again before passing through dynamic range compressor
        # (bug fix: this previously re-normalized input_audio_corrupt instead)
        target_audio_corrupt /= target_audio_corrupt.abs().max()
        target_audio_corrupt *= 10 ** (-12.0 / 20)  # with min 3 dBFS headroom

        if self.drc_corrupt and torch.rand(1).sum() < 0.75:
            target_audio_corrupt = augmentations.dynamic_range_compression(
                [target_audio_corrupt], self.sample_rate
            )[0]

        return input_audio_corrupt, target_audio_corrupt

    def __getitem__(self, _):
        """Generate one (input, target) training pair; the index is ignored."""
        # increment counter
        self.items_since_load += 1

        # load next chunk into buffer if needed
        if self.items_since_load > self.buffer_reload_rate:
            self.load_audio_buffer()

        # generate pairs for style training
        input_audio, target_audio = self.generate_pair()

        # ------------------------ Conform length of files -------------------
        input_audio = utils.conform_length(input_audio, int(self.length * 2))
        target_audio = utils.conform_length(target_audio, int(self.length * 2))

        # ------------------------ Apply fade in and fade out -----------------
        input_audio = utils.linear_fade(input_audio, sample_rate=self.sample_rate)
        target_audio = utils.linear_fade(target_audio, sample_rate=self.sample_rate)

        # ------------------------ Final normalization ----------------------
        # always peak normalize final input to -12 dBFS
        input_audio /= input_audio.abs().max()
        input_audio *= 10 ** (-12.0 / 20.0)

        # always peak normalize the target to -12 dBFS
        target_audio /= target_audio.abs().max()
        target_audio *= 10 ** (-12.0 / 20.0)

        return input_audio, target_audio

    @staticmethod
    def get_random_file_id(keys):
        """Return a uniformly random file_id from `keys`.

        Marked @staticmethod (bug fix): it takes no `self` but is invoked as
        `self.get_random_file_id(...)`, which previously raised a TypeError.
        """
        # high bound is exclusive, so use len(keys) to include the last
        # element and to support a buffer holding a single file
        rand_input_idx = torch.randint(0, len(keys), [1])[0]
        # find the key (file_id) corresponding to the random index
        rand_input_file_id = list(keys)[rand_input_idx]
        return rand_input_file_id

    @staticmethod
    def get_random_patch(audio_file, length, check_silence=True):
        """Pick a random non-silent patch of `length` frames.

        Marked @staticmethod (bug fix): it takes no `self` but is invoked as
        `self.get_random_patch(...)`, which previously raised a TypeError.

        Args:
            audio_file (AudioFile): Loaded file with `.audio` and `.num_frames`.
            length (int): Number of frames in the patch.
            check_silence (bool, optional): Reject patches whose halves are
                near-silent (mean power <= 1e-5). Default: True

        Returns:
            start_idx (int): Patch start frame, or -1 if no patch was found.
            stop_idx (int): Patch stop frame, or -1 if no patch was found.
        """
        silent = True
        count = 0
        while silent:
            count += 1
            # +1 makes the final valid offset reachable and avoids an empty
            # randint range when num_frames == length
            start_idx = torch.randint(0, audio_file.num_frames - length + 1, [1])[0]
            stop_idx = start_idx + length
            patch = audio_file.audio[:, start_idx:stop_idx].clone().detach()

            length = patch.shape[-1]
            # require energy in both halves so input AND target regions are usable
            first_patch = patch[..., : length // 2]
            second_patch = patch[..., length // 2 :]

            if (
                (first_patch**2).mean() > 1e-5 and (second_patch**2).mean() > 1e-5
            ) or not check_silence:
                silent = False

            if count > 100:
                print("get_random_patch count", count)
                return -1, -1

        return start_idx, stop_idx

    def get_file_id(self, filepath):
        """Given a filepath extract the DAPS file id.

        Args:
            filepath (str): Path to an audio file in the DAPS dataset.

        Returns:
            file_id (str): DAPS file id of the form <participant_id>_<script_id>
        """
        file_id = os.path.basename(filepath).split("_")[:2]
        file_id = "_".join(file_id)
        return file_id

    def get_file_set(self, filepath):
        """Given a filepath extract the DAPS file set name.

        Args:
            filepath (str): Path to an audio file in the DAPS dataset.

        Returns:
            file_set (str): The DAPS set to which the file belongs.
        """
        file_set = os.path.basename(filepath).split("_")[2:]
        file_set = "_".join(file_set)
        file_set = file_set.replace(f".{self.ext}", "")
        return file_set