import copy
from contextlib import contextmanager
from inspect import signature
from typing import List

import numpy as np
import torch
from flatten_dict import flatten
from flatten_dict import unflatten
from numpy.random import RandomState

from .. import ml
from ..core import AudioSignal
from ..core import util
from .datasets import AudioLoader

tt = torch.tensor
"""Shorthand for converting things to torch.tensor."""


class BaseTransform:
    """This is the base class for all transforms that are implemented
    in this library. Transforms have two main operations: ``transform``
    and ``instantiate``.

    ``instantiate`` sets the parameters randomly
    from distribution tuples for each parameter. For example, for the
    ``BackgroundNoise`` transform, the signal-to-noise ratio (``snr``)
    is chosen randomly by instantiate. By default, it is chosen uniformly
    between 10.0 and 30.0 (the tuple is set to ``("uniform", 10.0, 30.0)``).

    ``transform`` applies the transform using the instantiated parameters.
    A simple example is as follows:

    >>> seed = 0
    >>> signal = ...
    >>> transform = transforms.NoiseFloor(db=("uniform", -50.0, -30.0))
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)

    By breaking apart the instantiation of parameters from the actual audio
    processing of the transform, we can make things more reproducible, while
    also applying the transform on batches of data efficiently on GPU,
    rather than on individual audio samples.

    .. note::
        We call ``signal.clone()`` for the input to the ``transform`` function
        because signals are modified in-place! If you don't clone the signal,
        you will lose the original data.

    Parameters
    ----------
    keys : list, optional
        Keys that the transform looks for when
        calling ``self.transform``, by default []. In general this is
        set automatically, and you won't need to manipulate this argument.
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0

    Examples
    --------
    >>> seed = 0
    >>>
    >>> audio_path = "tests/audio/spk/f10_script4_produced.wav"
    >>> signal = AudioSignal(audio_path, offset=10, duration=2)
    >>> transform = tfm.Compose(
    >>>     [
    >>>         tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
    >>>         tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
    >>>     ],
    >>> )
    >>>
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal, **kwargs)
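
    New transforms subclass ``BaseTransform`` and override ``_instantiate``
    and ``_transform``. A minimal sketch (the ``VolumeJitter`` name is
    hypothetical, not part of the library):

    >>> class VolumeJitter(BaseTransform):
    >>>     def __init__(self, db=("uniform", -6.0, 0.0), name=None, prob=1.0):
    >>>         super().__init__(name=name, prob=prob)
    >>>         self.db = db
    >>>     def _instantiate(self, state):
    >>>         # Sample a concrete dB value from the distribution tuple.
    >>>         return {"db": util.sample_from_dist(self.db, state)}
    >>>     def _transform(self, signal, db):
    >>>         return signal.volume_change(db)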
| """ | |
| def __init__(self, keys: list = [], name: str = None, prob: float = 1.0): | |
| # Get keys from the _transform signature. | |
| tfm_keys = list(signature(self._transform).parameters.keys()) | |
| # Filter out signal and kwargs keys. | |
| ignore_keys = ["signal", "kwargs"] | |
| tfm_keys = [k for k in tfm_keys if k not in ignore_keys] | |
| # Combine keys specified by the child class, the keys found in | |
| # _transform signature, and the mask key. | |
| self.keys = keys + tfm_keys + ["mask"] | |
| self.prob = prob | |
| if name is None: | |
| name = self.__class__.__name__ | |
| self.name = name | |
| def _prepare(self, batch: dict): | |
| sub_batch = batch[self.name] | |
| for k in self.keys: | |
| assert k in sub_batch.keys(), f"{k} not in batch" | |
| return sub_batch | |
| def _transform(self, signal): | |
| return signal | |
| def _instantiate(self, state: RandomState, signal: AudioSignal = None): | |
| return {} | |

    @staticmethod
    def apply_mask(batch: dict, mask: torch.Tensor):
        """Applies a mask to the batch.

        Parameters
        ----------
        batch : dict
            Batch whose values will be masked in the ``transform`` pass.
        mask : torch.Tensor
            Mask to apply to batch.

        Returns
        -------
        dict
            A dictionary that contains values only where ``mask = True``.
        """
        masked_batch = {k: v[mask] for k, v in flatten(batch).items()}
        return unflatten(masked_batch)

    def transform(self, signal: AudioSignal, **kwargs):
        """Apply the transform to the audio signal,
        with given keyword arguments.

        Parameters
        ----------
        signal : AudioSignal
            Signal that will be modified by the transforms in-place.
        kwargs : dict
            Keyword arguments to the specific transform's ``self._transform``
            function.

        Returns
        -------
        AudioSignal
            Transformed AudioSignal.

        Examples
        --------
        >>> for seed in range(10):
        >>>     kwargs = transform.instantiate(seed, signal)
        >>>     output = transform(signal.clone(), **kwargs)
        """
        tfm_kwargs = self._prepare(kwargs)
        mask = tfm_kwargs["mask"]

        if torch.any(mask):
            tfm_kwargs = self.apply_mask(tfm_kwargs, mask)
            tfm_kwargs = {k: v for k, v in tfm_kwargs.items() if k != "mask"}
            signal[mask] = self._transform(signal[mask], **tfm_kwargs)

        return signal

    def __call__(self, *args, **kwargs):
        return self.transform(*args, **kwargs)

    def instantiate(
        self,
        state: RandomState = None,
        signal: AudioSignal = None,
    ):
        """Instantiates parameters for the transform.

        Parameters
        ----------
        state : RandomState, optional
            Seed or random state used to sample the parameters, by default None
        signal : AudioSignal, optional
            AudioSignal to pass to ``self._instantiate`` if the transform
            needs it, by default None

        Returns
        -------
        dict
            Dictionary containing instantiated arguments for every keyword
            argument to ``self._transform``.

        Examples
        --------
        >>> for seed in range(10):
        >>>     kwargs = transform.instantiate(seed, signal)
        >>>     output = transform(signal.clone(), **kwargs)
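
        The returned dictionary is nested under the transform's name and
        always carries a ``mask`` entry. A sketch of its shape (the ``db``
        value shown is illustrative, not exact):

        >>> VolumeChange().instantiate(0)
        {'VolumeChange': {'db': tensor(-5.41), 'mask': tensor(True)}}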
| """ | |
| state = util.random_state(state) | |
| # Not all instantiates need the signal. Check if signal | |
| # is needed before passing it in, so that the end-user | |
| # doesn't need to have variables they're not using flowing | |
| # into their function. | |
| needs_signal = "signal" in set(signature(self._instantiate).parameters.keys()) | |
| kwargs = {} | |
| if needs_signal: | |
| kwargs = {"signal": signal} | |
| # Instantiate the parameters for the transform. | |
| params = self._instantiate(state, **kwargs) | |
| for k in list(params.keys()): | |
| v = params[k] | |
| if isinstance(v, (AudioSignal, torch.Tensor, dict)): | |
| params[k] = v | |
| else: | |
| params[k] = tt(v) | |
| mask = state.rand() <= self.prob | |
| params[f"mask"] = tt(mask) | |
| # Put the params into a nested dictionary that will be | |
| # used later when calling the transform. This is to avoid | |
| # collisions in the dictionary. | |
| params = {self.name: params} | |
| return params | |

    def batch_instantiate(
        self,
        states: list = None,
        signal: AudioSignal = None,
    ):
        """Instantiates arguments for every item in a batch,
        given a list of states. Each state in the list
        corresponds to one item in the batch.

        Parameters
        ----------
        states : list, optional
            List of states, by default None
        signal : AudioSignal, optional
            AudioSignal to pass to the ``self.instantiate`` section
            if it is needed for this transform, by default None

        Returns
        -------
        dict
            Collated dictionary of arguments.

        Examples
        --------
        >>> batch_size = 4
        >>> signal = AudioSignal(audio_path, offset=10, duration=2)
        >>> signal_batch = AudioSignal.batch([signal.clone() for _ in range(batch_size)])
        >>>
        >>> states = [seed + idx for idx in list(range(batch_size))]
        >>> kwargs = transform.batch_instantiate(states, signal_batch)
        >>> batch_output = transform(signal_batch, **kwargs)
        """
        kwargs = []
        for state in states:
            kwargs.append(self.instantiate(state, signal))
        kwargs = util.collate(kwargs)
        return kwargs


class Identity(BaseTransform):
    """This transform just returns the original signal."""

    pass


class SpectralTransform(BaseTransform):
    """Spectral transforms require STFT data to exist, since they
    manipulate the spectrogram directly. This class calls ``stft`` before
    the transform is applied, and calls ``istft`` afterward so that the
    modified spectrogram is written back to the audio data.
    """

    def transform(self, signal, **kwargs):
        signal.stft()
        super().transform(signal, **kwargs)
        signal.istft()
        return signal


class Compose(BaseTransform):
    """Compose applies transforms in sequence, one after the other. The
    transforms are passed in as positional arguments or as a list like so:

    >>> transform = tfm.Compose(
    >>>     [
    >>>         tfm.RoomImpulseResponse(sources=["tests/audio/irs.csv"]),
    >>>         tfm.BackgroundNoise(sources=["tests/audio/noises.csv"]),
    >>>     ],
    >>> )

    This will convolve the signal with a room impulse response, and then
    add background noise to the signal. ``instantiate`` instantiates
    all the parameters for every transform in the transform list so the
    interface for using the Compose transform is the same as everything
    else:

    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)

    Under the hood, Compose maps each transform to a unique name
    of the form ``{position}.{name}``, where ``position``
    is the index of the transform in the list. ``Compose`` can nest
    within other ``Compose`` transforms, like so:

    >>> preprocess = transforms.Compose(
    >>>     tfm.GlobalVolumeNorm(),
    >>>     tfm.CrossTalk(),
    >>>     name="preprocess",
    >>> )
    >>> augment = transforms.Compose(
    >>>     tfm.RoomImpulseResponse(),
    >>>     tfm.BackgroundNoise(),
    >>>     name="augment",
    >>> )
    >>> postprocess = transforms.Compose(
    >>>     tfm.VolumeChange(),
    >>>     tfm.RescaleAudio(),
    >>>     tfm.ShiftPhase(),
    >>>     name="postprocess",
    >>> )
    >>> transform = transforms.Compose(preprocess, augment, postprocess)

    This defines 3 composed transforms, and then composes them in sequence
    with one another.
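
    Given the names above, the instantiated dictionary is keyed by each
    sub-transform's positional name, roughly like so (shape illustrative):

    >>> kwargs = transform.instantiate(seed, signal)
    >>> list(kwargs["Compose"].keys())
    ['0.preprocess', '1.augment', '2.postprocess', 'mask']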

    Parameters
    ----------
    *transforms : list
        List of transforms to apply
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(self, *transforms: list, name: str = None, prob: float = 1.0):
        if isinstance(transforms[0], list):
            transforms = transforms[0]

        for i, tfm in enumerate(transforms):
            tfm.name = f"{i}.{tfm.name}"

        keys = [tfm.name for tfm in transforms]
        super().__init__(keys=keys, name=name, prob=prob)

        self.transforms = transforms
        self.transforms_to_apply = keys

    @contextmanager
    def filter(self, *names: list):
        """This can be used to skip transforms entirely when applying
        the sequence of transforms to a signal. For example, take
        the following transforms with the names ``preprocess, augment, postprocess``.

        >>> preprocess = transforms.Compose(
        >>>     tfm.GlobalVolumeNorm(),
        >>>     tfm.CrossTalk(),
        >>>     name="preprocess",
        >>> )
        >>> augment = transforms.Compose(
        >>>     tfm.RoomImpulseResponse(),
        >>>     tfm.BackgroundNoise(),
        >>>     name="augment",
        >>> )
        >>> postprocess = transforms.Compose(
        >>>     tfm.VolumeChange(),
        >>>     tfm.RescaleAudio(),
        >>>     tfm.ShiftPhase(),
        >>>     name="postprocess",
        >>> )
        >>> transform = transforms.Compose(preprocess, augment, postprocess)

        If we wanted to apply all 3 to a signal, we do:

        >>> kwargs = transform.instantiate(seed, signal)
        >>> output = transform(signal.clone(), **kwargs)

        But if we only wanted to apply the ``preprocess`` and ``postprocess``
        transforms to the signal, we do:

        >>> with transform.filter("preprocess", "postprocess"):
        >>>     output = transform(signal.clone(), **kwargs)

        Parameters
        ----------
        *names : list
            List of transforms, identified by name, to apply to signal.
        """
        old_transforms = self.transforms_to_apply
        self.transforms_to_apply = names
        yield
        self.transforms_to_apply = old_transforms

    def _transform(self, signal, **kwargs):
        for transform in self.transforms:
            if any([x in transform.name for x in self.transforms_to_apply]):
                signal = transform(signal, **kwargs)
        return signal

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        parameters = {}
        for transform in self.transforms:
            parameters.update(transform.instantiate(state, signal=signal))
        return parameters

    def __getitem__(self, idx):
        return self.transforms[idx]

    def __len__(self):
        return len(self.transforms)

    def __iter__(self):
        for transform in self.transforms:
            yield transform


class Choose(Compose):
    """The logic of Choose is the same as :py:func:`audiotools.data.transforms.Compose`,
    but instead of applying all the transforms in sequence, it applies just a single
    transform, which is chosen for each item in the batch.

    Parameters
    ----------
    *transforms : list
        List of transforms to apply
    weights : list
        Probability of choosing any specific transform.
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0

    Examples
    --------
    >>> transforms.Choose(tfm.LowPass(), tfm.HighPass())
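
    A fuller sketch, weighting the low-pass branch more heavily (the
    70/30 split is illustrative):

    >>> transform = transforms.Choose(
    >>>     tfm.LowPass(),
    >>>     tfm.HighPass(),
    >>>     weights=[0.7, 0.3],
    >>> )
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)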
| """ | |
| def __init__( | |
| self, | |
| *transforms: list, | |
| weights: list = None, | |
| name: str = None, | |
| prob: float = 1.0, | |
| ): | |
| super().__init__(*transforms, name=name, prob=prob) | |
| if weights is None: | |
| _len = len(self.transforms) | |
| weights = [1 / _len for _ in range(_len)] | |
| self.weights = np.array(weights) | |
| def _instantiate(self, state: RandomState, signal: AudioSignal = None): | |
| kwargs = super()._instantiate(state, signal) | |
| tfm_idx = list(range(len(self.transforms))) | |
| tfm_idx = state.choice(tfm_idx, p=self.weights) | |
| one_hot = [] | |
| for i, t in enumerate(self.transforms): | |
| mask = kwargs[t.name]["mask"] | |
| if mask.item(): | |
| kwargs[t.name]["mask"] = tt(i == tfm_idx) | |
| one_hot.append(kwargs[t.name]["mask"]) | |
| kwargs["one_hot"] = one_hot | |
| return kwargs | |


class Repeat(Compose):
    """Repeatedly applies a given transform ``n_repeat`` times.

    Parameters
    ----------
    transform : BaseTransform
        Transform to repeat.
    n_repeat : int, optional
        Number of times to repeat transform, by default 1
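
    Examples
    --------
    A sketch: apply clipping distortion twice in a row.

    >>> transform = Repeat(tfm.ClippingDistortion(), n_repeat=2)
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)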
| """ | |
| def __init__( | |
| self, | |
| transform, | |
| n_repeat: int = 1, | |
| name: str = None, | |
| prob: float = 1.0, | |
| ): | |
| transforms = [copy.copy(transform) for _ in range(n_repeat)] | |
| super().__init__(transforms, name=name, prob=prob) | |
| self.n_repeat = n_repeat | |


class RepeatUpTo(Choose):
    """Repeatedly applies a given transform up to ``max_repeat`` times.

    Parameters
    ----------
    transform : BaseTransform
        Transform to repeat.
    max_repeat : int, optional
        Max number of times to repeat transform, by default 5
    weights : list
        Probability of choosing any specific number up to ``max_repeat``.
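
    Examples
    --------
    A sketch: repeat clipping distortion between one and four times
    (``range(1, max_repeat)`` excludes ``max_repeat`` itself).

    >>> transform = RepeatUpTo(tfm.ClippingDistortion(), max_repeat=5)
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)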
| """ | |
| def __init__( | |
| self, | |
| transform, | |
| max_repeat: int = 5, | |
| weights: list = None, | |
| name: str = None, | |
| prob: float = 1.0, | |
| ): | |
| transforms = [] | |
| for n in range(1, max_repeat): | |
| transforms.append(Repeat(transform, n_repeat=n)) | |
| super().__init__(transforms, name=name, prob=prob, weights=weights) | |
| self.max_repeat = max_repeat | |


class ClippingDistortion(BaseTransform):
    """Adds clipping distortion to signal. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.clip_distortion`.

    Parameters
    ----------
    perc : tuple, optional
        Clipping percentile. Values are between 0.0 and 1.0.
        Typical values are 0.1 or below, by default ("uniform", 0.0, 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        perc: tuple = ("uniform", 0.0, 0.1),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.perc = perc

    def _instantiate(self, state: RandomState):
        return {"perc": util.sample_from_dist(self.perc, state)}

    def _transform(self, signal, perc):
        return signal.clip_distortion(perc)


class Equalizer(BaseTransform):
    """Applies an equalization curve to the audio signal. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.equalizer`.

    Parameters
    ----------
    eq_amount : tuple, optional
        The maximum dB cut to apply to the audio in any band,
        by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in EQ, by default 6
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        eq_amount: tuple = ("const", 1.0),
        n_bands: int = 6,
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.eq_amount = eq_amount
        self.n_bands = n_bands

    def _instantiate(self, state: RandomState):
        eq_amount = util.sample_from_dist(self.eq_amount, state)
        # Random dB cuts in [-eq_amount, 0] for each band.
        eq = -eq_amount * state.rand(self.n_bands)
        return {"eq": eq}

    def _transform(self, signal, eq):
        return signal.equalizer(eq)


class Quantization(BaseTransform):
    """Applies quantization to the input waveform. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.quantization`.

    Parameters
    ----------
    channels : tuple, optional
        Number of evenly spaced quantization channels to quantize
        to, by default ("choice", [8, 32, 128, 256, 1024])
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.channels = channels

    def _instantiate(self, state: RandomState):
        return {"channels": util.sample_from_dist(self.channels, state)}

    def _transform(self, signal, channels):
        return signal.quantization(channels)


class MuLawQuantization(BaseTransform):
    """Applies mu-law quantization to the input waveform. Corresponds
    to :py:func:`audiotools.core.effects.EffectMixin.mulaw_quantization`.

    Parameters
    ----------
    channels : tuple, optional
        Number of mu-law spaced quantization channels to quantize
        to, by default ("choice", [8, 32, 128, 256, 1024])
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        channels: tuple = ("choice", [8, 32, 128, 256, 1024]),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.channels = channels

    def _instantiate(self, state: RandomState):
        return {"channels": util.sample_from_dist(self.channels, state)}

    def _transform(self, signal, channels):
        return signal.mulaw_quantization(channels)


class NoiseFloor(BaseTransform):
    """Adds a noise floor of Gaussian noise to the signal at a specified
    dB.

    Parameters
    ----------
    db : tuple, optional
        Level of noise to add to signal, by default ("const", -50.0)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
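
    Examples
    --------
    A sketch: add a noise floor drawn uniformly between -50 and -30 dB.

    >>> transform = NoiseFloor(db=("uniform", -50.0, -30.0))
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)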
| """ | |
| def __init__( | |
| self, | |
| db: tuple = ("const", -50.0), | |
| name: str = None, | |
| prob: float = 1.0, | |
| ): | |
| super().__init__(name=name, prob=prob) | |
| self.db = db | |
| def _instantiate(self, state: RandomState, signal: AudioSignal): | |
| db = util.sample_from_dist(self.db, state) | |
| audio_data = state.randn(signal.num_channels, signal.signal_length) | |
| nz_signal = AudioSignal(audio_data, signal.sample_rate) | |
| nz_signal.normalize(db) | |
| return {"nz_signal": nz_signal} | |
| def _transform(self, signal, nz_signal): | |
| # Clone bg_signal so that transform can be repeatedly applied | |
| # to different signals with the same effect. | |
| return signal + nz_signal | |


class BackgroundNoise(BaseTransform):
    """Adds background noise from audio specified by a set of CSV files.
    A valid CSV file looks like, and is typically generated by
    :py:func:`audiotools.data.preprocess.create_csv`:

    .. csv-table::
        :header: path

        room_tone/m6_script2_clean.wav
        room_tone/m6_script2_cleanraw.wav
        room_tone/m6_script2_ipad_balcony1.wav
        room_tone/m6_script2_ipad_bedroom1.wav
        room_tone/m6_script2_ipad_confroom1.wav
        room_tone/m6_script2_ipad_confroom2.wav
        room_tone/m6_script2_ipad_livingroom1.wav
        room_tone/m6_script2_ipad_office1.wav

    .. note::
        All paths are relative to an environment variable called ``PATH_TO_DATA``,
        so that CSV files are portable across machines where data may be
        located in different places.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
    and :py:func:`audiotools.core.effects.EffectMixin.equalizer` under the
    hood.

    Parameters
    ----------
    snr : tuple, optional
        Signal-to-noise ratio, by default ("uniform", 10.0, 30.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    eq_amount : tuple, optional
        Amount of equalization to apply, by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in equalizer, by default 3
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    loudness_cutoff : float, optional
        Loudness cutoff when loading from audio files, by default None
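
    Examples
    --------
    A sketch, assuming a CSV of noise file paths exists at the path below:

    >>> transform = BackgroundNoise(sources=["tests/audio/noises.csv"])
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)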
| """ | |
| def __init__( | |
| self, | |
| snr: tuple = ("uniform", 10.0, 30.0), | |
| sources: List[str] = None, | |
| weights: List[float] = None, | |
| eq_amount: tuple = ("const", 1.0), | |
| n_bands: int = 3, | |
| name: str = None, | |
| prob: float = 1.0, | |
| loudness_cutoff: float = None, | |
| ): | |
| super().__init__(name=name, prob=prob) | |
| self.snr = snr | |
| self.eq_amount = eq_amount | |
| self.n_bands = n_bands | |
| self.loader = AudioLoader(sources, weights) | |
| self.loudness_cutoff = loudness_cutoff | |
| def _instantiate(self, state: RandomState, signal: AudioSignal): | |
| eq_amount = util.sample_from_dist(self.eq_amount, state) | |
| eq = -eq_amount * state.rand(self.n_bands) | |
| snr = util.sample_from_dist(self.snr, state) | |
| bg_signal = self.loader( | |
| state, | |
| signal.sample_rate, | |
| duration=signal.signal_duration, | |
| loudness_cutoff=self.loudness_cutoff, | |
| num_channels=signal.num_channels, | |
| )["signal"] | |
| return {"eq": eq, "bg_signal": bg_signal, "snr": snr} | |
| def _transform(self, signal, bg_signal, snr, eq): | |
| # Clone bg_signal so that transform can be repeatedly applied | |
| # to different signals with the same effect. | |
| return signal.mix(bg_signal.clone(), snr, eq) | |


class CrossTalk(BaseTransform):
    """Adds crosstalk between speakers, whose audio is drawn from a CSV file
    that was produced via :py:func:`audiotools.data.preprocess.create_csv`.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.mix`
    under the hood.

    Parameters
    ----------
    snr : tuple, optional
        How loud cross-talk speaker is relative to original signal in dB,
        by default ("uniform", 0.0, 10.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    loudness_cutoff : float, optional
        Loudness cutoff when loading from audio files, by default -40
    """

    def __init__(
        self,
        snr: tuple = ("uniform", 0.0, 10.0),
        sources: List[str] = None,
        weights: List[float] = None,
        name: str = None,
        prob: float = 1.0,
        loudness_cutoff: float = -40,
    ):
        super().__init__(name=name, prob=prob)

        self.snr = snr
        self.loader = AudioLoader(sources, weights)
        self.loudness_cutoff = loudness_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        snr = util.sample_from_dist(self.snr, state)
        crosstalk_signal = self.loader(
            state,
            signal.sample_rate,
            duration=signal.signal_duration,
            loudness_cutoff=self.loudness_cutoff,
            num_channels=signal.num_channels,
        )["signal"]

        return {"crosstalk_signal": crosstalk_signal, "snr": snr}

    def _transform(self, signal, crosstalk_signal, snr):
        # Clone crosstalk_signal so that transform can be repeatedly applied
        # to different signals with the same effect. The original loudness
        # is restored after mixing so crosstalk doesn't change the level.
        loudness = signal.loudness()
        mix = signal.mix(crosstalk_signal.clone(), snr)
        mix.normalize(loudness)
        return mix


class RoomImpulseResponse(BaseTransform):
    """Convolves signal with a room impulse response, at a specified
    direct-to-reverberant ratio, with equalization applied. Room impulse
    response data is drawn from a CSV file that was produced via
    :py:func:`audiotools.data.preprocess.create_csv`.

    This transform calls :py:func:`audiotools.core.effects.EffectMixin.apply_ir`
    under the hood.

    Parameters
    ----------
    drr : tuple, optional
        Direct-to-reverberant ratio in dB, by default ("uniform", 0.0, 30.0)
    sources : List[str], optional
        Sources containing folders, or CSVs with paths to audio files,
        by default None
    weights : List[float], optional
        Weights to sample audio files from each source, by default None
    eq_amount : tuple, optional
        Amount of equalization to apply, by default ("const", 1.0)
    n_bands : int, optional
        Number of bands in equalizer, by default 6
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    use_original_phase : bool, optional
        Whether or not to use the original phase, by default False
    offset : float, optional
        Offset from each impulse response file to use, by default 0.0
    duration : float, optional
        Duration of each impulse response, by default 1.0
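
    Examples
    --------
    A sketch, assuming a CSV of impulse-response paths exists at the
    path below:

    >>> transform = RoomImpulseResponse(sources=["tests/audio/irs.csv"])
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)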
| """ | |
| def __init__( | |
| self, | |
| drr: tuple = ("uniform", 0.0, 30.0), | |
| sources: List[str] = None, | |
| weights: List[float] = None, | |
| eq_amount: tuple = ("const", 1.0), | |
| n_bands: int = 6, | |
| name: str = None, | |
| prob: float = 1.0, | |
| use_original_phase: bool = False, | |
| offset: float = 0.0, | |
| duration: float = 1.0, | |
| ): | |
| super().__init__(name=name, prob=prob) | |
| self.drr = drr | |
| self.eq_amount = eq_amount | |
| self.n_bands = n_bands | |
| self.use_original_phase = use_original_phase | |
| self.loader = AudioLoader(sources, weights) | |
| self.offset = offset | |
| self.duration = duration | |
| def _instantiate(self, state: RandomState, signal: AudioSignal = None): | |
| eq_amount = util.sample_from_dist(self.eq_amount, state) | |
| eq = -eq_amount * state.rand(self.n_bands) | |
| drr = util.sample_from_dist(self.drr, state) | |
| ir_signal = self.loader( | |
| state, | |
| signal.sample_rate, | |
| offset=self.offset, | |
| duration=self.duration, | |
| loudness_cutoff=None, | |
| num_channels=signal.num_channels, | |
| )["signal"] | |
| ir_signal.zero_pad_to(signal.sample_rate) | |
| return {"eq": eq, "ir_signal": ir_signal, "drr": drr} | |
| def _transform(self, signal, ir_signal, drr, eq): | |
| # Clone ir_signal so that transform can be repeatedly applied | |
| # to different signals with the same effect. | |
| return signal.apply_ir( | |
| ir_signal.clone(), drr, eq, use_original_phase=self.use_original_phase | |
| ) | |


class VolumeChange(BaseTransform):
    """Changes the volume of the input signal.

    Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.

    Parameters
    ----------
    db : tuple, optional
        Change in volume in decibels, by default ("uniform", -12.0, 0.0)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("uniform", -12.0, 0.0),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)
        self.db = db

    def _instantiate(self, state: RandomState):
        return {"db": util.sample_from_dist(self.db, state)}

    def _transform(self, signal, db):
        return signal.volume_change(db)


class VolumeNorm(BaseTransform):
    """Normalizes the volume of the excerpt to a specified decibel.

    Uses :py:func:`audiotools.core.effects.EffectMixin.normalize`.

    Parameters
    ----------
    db : tuple, optional
        dB to normalize signal to, by default ("const", -24)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db: tuple = ("const", -24),
        name: str = None,
        prob: float = 1.0,
    ):
        super().__init__(name=name, prob=prob)

        self.db = db

    def _instantiate(self, state: RandomState):
        return {"db": util.sample_from_dist(self.db, state)}

    def _transform(self, signal, db):
        return signal.normalize(db)


class GlobalVolumeNorm(BaseTransform):
    """Similar to :py:func:`audiotools.data.transforms.VolumeNorm`, this
    transform also normalizes the volume of a signal, but it uses
    the volume of the entire audio file the loaded excerpt comes from,
    rather than the volume of just the excerpt. The volume of the
    entire audio file is expected in ``signal.metadata["loudness"]``.

    If the audio is loaded from a CSV generated by
    :py:func:`audiotools.data.preprocess.create_csv`
    with ``loudness = True``, like the following:

    .. csv-table::
        :header: path,loudness

        daps/produced/f1_script1_produced.wav,-16.299999237060547
        daps/produced/f1_script2_produced.wav,-16.600000381469727
        daps/produced/f1_script3_produced.wav,-17.299999237060547
        daps/produced/f1_script4_produced.wav,-16.100000381469727
        daps/produced/f1_script5_produced.wav,-16.700000762939453
        daps/produced/f3_script1_produced.wav,-16.5

    then the ``AudioLoader`` will automatically load the loudness column into
    the metadata of the signal.

    Uses :py:func:`audiotools.core.effects.EffectMixin.volume_change`.

    Parameters
    ----------
    db : tuple, optional
        dB to normalize signal to, by default ("const", -24)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
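
    Examples
    --------
    A sketch, assuming the signal carries a ``loudness`` entry in its
    metadata (set manually here for illustration):

    >>> signal.metadata["loudness"] = -16.5
    >>> transform = GlobalVolumeNorm(db=("const", -24))
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)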
| """ | |
| def __init__( | |
| self, | |
| db: tuple = ("const", -24), | |
| name: str = None, | |
| prob: float = 1.0, | |
| ): | |
| super().__init__(name=name, prob=prob) | |
| self.db = db | |
| def _instantiate(self, state: RandomState, signal: AudioSignal): | |
| if "loudness" not in signal.metadata: | |
| db_change = 0.0 | |
| elif float(signal.metadata["loudness"]) == float("-inf"): | |
| db_change = 0.0 | |
| else: | |
| db = util.sample_from_dist(self.db, state) | |
| db_change = db - float(signal.metadata["loudness"]) | |
| return {"db": db_change} | |
| def _transform(self, signal, db): | |
| return signal.volume_change(db) | |


class Silence(BaseTransform):
    """Zeros out the signal with some probability.

    Parameters
    ----------
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 0.1
    """

    def __init__(self, name: str = None, prob: float = 0.1):
        super().__init__(name=name, prob=prob)

    def _transform(self, signal):
        _loudness = signal._loudness
        signal = AudioSignal(
            torch.zeros_like(signal.audio_data),
            sample_rate=signal.sample_rate,
            stft_params=signal.stft_params,
        )
        # So that the amount of noise added is as if it wasn't silenced.
        # TODO: improve this hack
        signal._loudness = _loudness

        return signal


class LowPass(BaseTransform):
    """Applies a LowPass filter.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.low_pass`.

    Parameters
    ----------
    cutoff : tuple, optional
        Cutoff frequency distribution,
        by default ``("choice", [4000, 8000, 16000])``
    zeros : int, optional
        Number of zero-crossings in filter, argument to
        ``julius.LowPassFilters``, by default 51
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        cutoff: tuple = ("choice", [4000, 8000, 16000]),
        zeros: int = 51,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.cutoff = cutoff
        self.zeros = zeros

    def _instantiate(self, state: RandomState):
        return {"cutoff": util.sample_from_dist(self.cutoff, state)}

    def _transform(self, signal, cutoff):
        return signal.low_pass(cutoff, zeros=self.zeros)


class HighPass(BaseTransform):
    """Applies a HighPass filter.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.high_pass`.

    Parameters
    ----------
    cutoff : tuple, optional
        Cutoff frequency distribution,
        by default ``("choice", [50, 100, 250, 500, 1000])``
    zeros : int, optional
        Number of zero-crossings in filter, argument to
        ``julius.LowPassFilters``, by default 51
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        cutoff: tuple = ("choice", [50, 100, 250, 500, 1000]),
        zeros: int = 51,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.cutoff = cutoff
        self.zeros = zeros

    def _instantiate(self, state: RandomState):
        return {"cutoff": util.sample_from_dist(self.cutoff, state)}

    def _transform(self, signal, cutoff):
        return signal.high_pass(cutoff, zeros=self.zeros)


class RescaleAudio(BaseTransform):
    """Rescales the audio so it is in between ``-val`` and ``val``
    only if the original audio exceeds those bounds. Useful if
    transforms have caused the audio to clip.

    Uses :py:func:`audiotools.core.effects.EffectMixin.ensure_max_of_audio`.

    Parameters
    ----------
    val : float, optional
        Max absolute value of signal, by default 1.0
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(self, val: float = 1.0, name: str = None, prob: float = 1):
        super().__init__(name=name, prob=prob)
        self.val = val

    def _transform(self, signal):
        return signal.ensure_max_of_audio(self.val)


class ShiftPhase(SpectralTransform):
    """Shifts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.

    Parameters
    ----------
    shift : tuple, optional
        How much to shift phase by, by default ("uniform", -np.pi, np.pi)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        shift: tuple = ("uniform", -np.pi, np.pi),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.shift = shift

    def _instantiate(self, state: RandomState):
        return {"shift": util.sample_from_dist(self.shift, state)}

    def _transform(self, signal, shift):
        return signal.shift_phase(shift)


class InvertPhase(ShiftPhase):
    """Inverts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.shift_phase`.

    Parameters
    ----------
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(self, name: str = None, prob: float = 1):
        super().__init__(shift=("const", np.pi), name=name, prob=prob)


class CorruptPhase(SpectralTransform):
    """Corrupts the phase of the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.corrupt_phase`.

    Parameters
    ----------
    scale : tuple, optional
        How much to corrupt phase by, by default ("uniform", 0, np.pi)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self, scale: tuple = ("uniform", 0, np.pi), name: str = None, prob: float = 1
    ):
        super().__init__(name=name, prob=prob)
        self.scale = scale

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        scale = util.sample_from_dist(self.scale, state)
        corruption = state.normal(scale=scale, size=signal.phase.shape[1:])
        return {"corruption": corruption.astype("float32")}

    def _transform(self, signal, corruption):
        return signal.shift_phase(shift=corruption)


class FrequencyMask(SpectralTransform):
    """Masks a band of frequencies at a center frequency
    from the audio.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_frequencies`.

    Parameters
    ----------
    f_center : tuple, optional
        Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
    f_width : tuple, optional
        Width of zero'd out band, by default ("const", 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
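
    Examples
    --------
    A sketch: zero out a band one tenth of Nyquist wide, centered at
    half of Nyquist.

    >>> transform = FrequencyMask(f_center=("const", 0.5), f_width=("const", 0.1))
    >>> kwargs = transform.instantiate(seed, signal)
    >>> output = transform(signal.clone(), **kwargs)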
| """ | |
| def __init__( | |
| self, | |
| f_center: tuple = ("uniform", 0.0, 1.0), | |
| f_width: tuple = ("const", 0.1), | |
| name: str = None, | |
| prob: float = 1, | |
| ): | |
| super().__init__(name=name, prob=prob) | |
| self.f_center = f_center | |
| self.f_width = f_width | |
| def _instantiate(self, state: RandomState, signal: AudioSignal): | |
| f_center = util.sample_from_dist(self.f_center, state) | |
| f_width = util.sample_from_dist(self.f_width, state) | |
| fmin = max(f_center - (f_width / 2), 0.0) | |
| fmax = min(f_center + (f_width / 2), 1.0) | |
| fmin_hz = (signal.sample_rate / 2) * fmin | |
| fmax_hz = (signal.sample_rate / 2) * fmax | |
| return {"fmin_hz": fmin_hz, "fmax_hz": fmax_hz} | |
| def _transform(self, signal, fmin_hz: float, fmax_hz: float): | |
| return signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz) | |


class TimeMask(SpectralTransform):
    """Masks out contiguous time-steps from signal.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_timesteps`.

    Parameters
    ----------
    t_center : tuple, optional
        Center time in terms of 0.0 and 1.0 (duration of signal),
        by default ("uniform", 0.0, 1.0)
    t_width : tuple, optional
        Width of dropped out portion, by default ("const", 0.025)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        t_center: tuple = ("uniform", 0.0, 1.0),
        t_width: tuple = ("const", 0.025),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.t_center = t_center
        self.t_width = t_width

    def _instantiate(self, state: RandomState, signal: AudioSignal):
        t_center = util.sample_from_dist(self.t_center, state)
        t_width = util.sample_from_dist(self.t_width, state)

        tmin = max(t_center - (t_width / 2), 0.0)
        tmax = min(t_center + (t_width / 2), 1.0)

        # Convert from fractions of the signal's duration to seconds.
        tmin_s = signal.signal_duration * tmin
        tmax_s = signal.signal_duration * tmax
        return {"tmin_s": tmin_s, "tmax_s": tmax_s}

    def _transform(self, signal, tmin_s: float, tmax_s: float):
        return signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s)


class MaskLowMagnitudes(SpectralTransform):
    """Masks low magnitude regions out of signal.

    Uses :py:func:`audiotools.core.dsp.DSPMixin.mask_low_magnitudes`.

    Parameters
    ----------
    db_cutoff : tuple, optional
        Decibel threshold below which magnitudes are masked away,
        by default ("uniform", -10, 10)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        db_cutoff: tuple = ("uniform", -10, 10),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.db_cutoff = db_cutoff

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        return {"db_cutoff": util.sample_from_dist(self.db_cutoff, state)}

    def _transform(self, signal, db_cutoff: float):
        return signal.mask_low_magnitudes(db_cutoff)


class Smoothing(BaseTransform):
    """Convolves the signal with a smoothing window.

    Uses :py:func:`audiotools.core.effects.EffectMixin.convolve`.

    Parameters
    ----------
    window_type : tuple, optional
        Type of window to use, by default ("const", "average")
    window_length : tuple, optional
        Length of smoothing window, by
        default ("choice", [8, 16, 32, 64, 128, 256, 512])
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        window_type: tuple = ("const", "average"),
        window_length: tuple = ("choice", [8, 16, 32, 64, 128, 256, 512]),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(name=name, prob=prob)
        self.window_type = window_type
        self.window_length = window_length

    def _instantiate(self, state: RandomState, signal: AudioSignal = None):
        window_type = util.sample_from_dist(self.window_type, state)
        window_length = util.sample_from_dist(self.window_length, state)
        window = signal.get_window(
            window_type=window_type, window_length=window_length, device="cpu"
        )
        return {"window": AudioSignal(window, signal.sample_rate)}

    def _transform(self, signal, window):
        # Record the per-channel peak level so the smoothed output can be
        # rescaled back to the original level after convolution.
        sscale = signal.audio_data.abs().max(dim=-1, keepdim=True).values
        sscale[sscale == 0.0] = 1.0

        out = signal.convolve(window)

        oscale = out.audio_data.abs().max(dim=-1, keepdim=True).values
        oscale[oscale == 0.0] = 1.0

        out = out * (sscale / oscale)
        return out


class TimeNoise(TimeMask):
    """Similar to :py:func:`audiotools.data.transforms.TimeMask`, but
    replaces with noise instead of zeros.

    Parameters
    ----------
    t_center : tuple, optional
        Center time in terms of 0.0 and 1.0 (duration of signal),
        by default ("uniform", 0.0, 1.0)
    t_width : tuple, optional
        Width of dropped out portion, by default ("const", 0.025)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        t_center: tuple = ("uniform", 0.0, 1.0),
        t_width: tuple = ("const", 0.025),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(t_center=t_center, t_width=t_width, name=name, prob=prob)

    def _transform(self, signal, tmin_s: float, tmax_s: float):
        signal = signal.mask_timesteps(tmin_s=tmin_s, tmax_s=tmax_s, val=0.0)
        mag, phase = signal.magnitude, signal.phase

        # Find the time-frequency bins that were zeroed out by the mask
        # and fill them with random noise.
        mag_r, phase_r = torch.randn_like(mag), torch.randn_like(phase)
        mask = (mag == 0.0) * (phase == 0.0)

        mag[mask] = mag_r[mask]
        phase[mask] = phase_r[mask]

        signal.magnitude = mag
        signal.phase = phase
        return signal


class FrequencyNoise(FrequencyMask):
    """Similar to :py:func:`audiotools.data.transforms.FrequencyMask`, but
    replaces with noise instead of zeros.

    Parameters
    ----------
    f_center : tuple, optional
        Center frequency between 0.0 and 1.0 (Nyquist), by default ("uniform", 0.0, 1.0)
    f_width : tuple, optional
        Width of zero'd out band, by default ("const", 0.1)
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        f_center: tuple = ("uniform", 0.0, 1.0),
        f_width: tuple = ("const", 0.1),
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(f_center=f_center, f_width=f_width, name=name, prob=prob)

    def _transform(self, signal, fmin_hz: float, fmax_hz: float):
        signal = signal.mask_frequencies(fmin_hz=fmin_hz, fmax_hz=fmax_hz)
        mag, phase = signal.magnitude, signal.phase

        # Find the frequency bins that were zeroed out by the mask and
        # fill them with random noise.
        mag_r, phase_r = torch.randn_like(mag), torch.randn_like(phase)
        mask = (mag == 0.0) * (phase == 0.0)

        mag[mask] = mag_r[mask]
        phase[mask] = phase_r[mask]

        signal.magnitude = mag
        signal.phase = phase
        return signal


class SpectralDenoising(Equalizer):
    """Applies denoising algorithm detailed in
    :py:func:`audiotools.ml.layers.spectral_gate.SpectralGate`,
    using a randomly generated noise signal for denoising.

    Parameters
    ----------
    eq_amount : tuple, optional
        Amount of eq to apply to noise signal, by default ("const", 1.0)
    denoise_amount : tuple, optional
        Amount to denoise by, by default ("uniform", 0.8, 1.0)
    nz_volume : float, optional
        Volume of noise to denoise with, by default -40
    n_bands : int, optional
        Number of bands in equalizer, by default 6
    n_freq : int, optional
        Number of frequency bins to smooth by, by default 3
    n_time : int, optional
        Number of time bins to smooth by, by default 5
    name : str, optional
        Name of this transform, used to identify it in the dictionary
        produced by ``self.instantiate``, by default None
    prob : float, optional
        Probability of applying this transform, by default 1.0
    """

    def __init__(
        self,
        eq_amount: tuple = ("const", 1.0),
        denoise_amount: tuple = ("uniform", 0.8, 1.0),
        nz_volume: float = -40,
        n_bands: int = 6,
        n_freq: int = 3,
        n_time: int = 5,
        name: str = None,
        prob: float = 1,
    ):
        super().__init__(eq_amount=eq_amount, n_bands=n_bands, name=name, prob=prob)

        self.nz_volume = nz_volume
        self.denoise_amount = denoise_amount
        self.spectral_gate = ml.layers.SpectralGate(n_freq, n_time)

    def _transform(self, signal, nz, eq, denoise_amount):
        nz = nz.normalize(self.nz_volume).equalizer(eq)
        self.spectral_gate = self.spectral_gate.to(signal.device)
        signal = self.spectral_gate(signal, nz, denoise_amount)
        return signal

    def _instantiate(self, state: RandomState):
        kwargs = super()._instantiate(state)
        kwargs["denoise_amount"] = util.sample_from_dist(self.denoise_amount, state)
        # Half a second of Gaussian noise at 44.1 kHz, used as the noise
        # reference for the spectral gate.
        kwargs["nz"] = AudioSignal(state.randn(22050), 44100)
        return kwargs