Merge pull request #23 from mhrice/metric-collection
Files changed:
- README.md +31 -27
- config.yaml → cfg/config.yaml +31 -5
- cfg/effects/all.yaml +70 -0
- cfg/effects/chorus.yaml +20 -0
- cfg/effects/compression.yaml +22 -0
- cfg/effects/distortion.yaml +14 -0
- cfg/effects/reverb.yaml +26 -0
- cfg/exp/demucs_all.yaml +4 -0
- cfg/exp/demucs_chorus.yaml +4 -0
- cfg/exp/demucs_compression.yaml +4 -0
- cfg/exp/demucs_distortion.yaml +4 -0
- cfg/exp/demucs_reverb.yaml +4 -0
- cfg/exp/umx_all.yaml +4 -0
- cfg/exp/umx_chorus.yaml +4 -0
- cfg/exp/umx_compression.yaml +4 -0
- cfg/exp/umx_distortion.yaml +4 -0
- cfg/exp/umx_reverb.yaml +4 -0
- {exp → cfg/model}/audio_diffusion.yaml +0 -0
- {exp → cfg/model}/demucs.yaml +1 -8
- {exp → cfg/model}/umx.yaml +1 -8
- config_guitfx.yaml +0 -52
- remfx/datasets.py +90 -192
- remfx/effects.py +1 -1
- remfx/models.py +70 -47
- remfx/utils.py +71 -1
- scripts/test.py +55 -0
- scripts/train.py +1 -2
- setup.py +1 -0
- shell_vars.sh +1 -1
README.md
CHANGED
@@ -6,36 +6,40 @@
 4. `git submodule update --init --recursive`
 5. `pip install -e umx`

-## Download [
+## Download [VocalSet Dataset](https://zenodo.org/record/1193957)
+1. `wget https://zenodo.org/record/1193957/files/VocalSet.zip?download=1`
+2. `mv VocalSet.zip?download=1 VocalSet.zip`
+3. `unzip VocalSet.zip`
+4. Manually split singers into train, val, test directories

 ## Train model
-1. Change Wandb variables in `shell_vars.sh` and `source shell_vars.sh`
-2. `python scripts/train.py exp=
+1. Change Wandb and data root variables in `shell_vars.sh` and `source shell_vars.sh`
+2. `python scripts/train.py +exp=umx_distortion`
 or
-2. `python scripts/train.py exp=
+2. `python scripts/train.py +exp=demucs_distortion`
+See cfg for more options. Generally they are `+exp={model}_{effect}`
+Models and effects detailed below.

 To add gpu, add `trainer.accelerator='gpu' trainer.devices=-1` to the command-line

-Ex. `python train.py exp=
+Ex. `python scripts/train.py +exp=umx_distortion trainer.accelerator='gpu' trainer.devices=-1`
-###
+
+### Current Models
+- `umx`
+- `demucs`
+
+### Current Effects
+- `chorus`
+- `compressor`
+- `distortion`
+- `reverb`
+- `all` (choose random effect to apply to each file)
+
+### Testing
+Experiment dictates data, ckpt dictates model
+`python scripts/test.py +exp=umx_distortion.yaml +ckpt_path=test_ckpts/umx_dist.ckpt`
+
+## Misc.
+By default, files are rendered to `input_dir / processed / train/val/test`.
+To skip rendering files (use previously rendered), add `render_files=False` to the command-line (added to test by default).
+To change the rendered location, add `render_root={path/to/dir}` to the command-line (use this for train and test)
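One usage note on the two new flags, since they interact: to reuse previously rendered files from a custom location, combine them, e.g. `python scripts/train.py +exp=umx_distortion render_root=/path/to/dir render_files=False` (the path here is illustrative; both flags are documented in the new Misc. section above).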
config.yaml → cfg/config.yaml
RENAMED
@@ -1,11 +1,15 @@
 defaults:
   - _self_
-  - exp: null
+  - model: null
+  - effects: null
+
 seed: 12345
 train: True
 sample_rate: 48000
 logs_dir: "./logs"
 log_every_n_steps: 1000
+render_files: True
+render_root: "./data/processed"

 callbacks:
   model_checkpoint:
@@ -19,13 +23,35 @@ callbacks:
     filename: '{epoch:02d}-{valid_loss:.3f}'

 datamodule:
-  _target_: remfx.datasets.Datamodule
-  dataset:
-    _target_: remfx.datasets.GuitarFXDataset
+  _target_: remfx.datasets.VocalSetDatamodule
+  train_dataset:
+    _target_: remfx.datasets.VocalSet
+    sample_rate: ${sample_rate}
+    root: ${oc.env:DATASET_ROOT}
+    chunk_size_in_sec: 6
+    mode: "train"
+    effect_types: ${effects.train_effects}
+    render_files: ${render_files}
+    render_root: ${render_root}
+  val_dataset:
+    _target_: remfx.datasets.VocalSet
     sample_rate: ${sample_rate}
     root: ${oc.env:DATASET_ROOT}
     chunk_size_in_sec: 6
-    val_split: 0.2
+    mode: "val"
+    effect_types: ${effects.val_effects}
+    render_files: ${render_files}
+    render_root: ${render_root}
+  test_dataset:
+    _target_: remfx.datasets.VocalSet
+    sample_rate: ${sample_rate}
+    root: ${oc.env:DATASET_ROOT}
+    chunk_size_in_sec: 6
+    mode: "test"
+    effect_types: ${effects.val_effects}
+    render_files: ${render_files}
+    render_root: ${render_root}
+
   batch_size: 16
   num_workers: 8
   pin_memory: True
cfg/effects/all.yaml
ADDED
@@ -0,0 +1,70 @@
+# @package _global_
+effects:
+  train_effects:
+    Chorus:
+      _target_: remfx.effects.RandomPedalboardChorus
+      sample_rate: ${sample_rate}
+    Distortion:
+      _target_: remfx.effects.RandomPedalboardDistortion
+      sample_rate: ${sample_rate}
+      min_drive_db: -10
+      max_drive_db: 50
+    Compressor:
+      _target_: remfx.effects.RandomPedalboardCompressor
+      sample_rate: ${sample_rate}
+      min_threshold_db: -42.0
+      max_threshold_db: -20.0
+      min_ratio: 1.5
+      max_ratio: 6.0
+    Reverb:
+      _target_: remfx.effects.RandomPedalboardReverb
+      sample_rate: ${sample_rate}
+      min_room_size: 0.3
+      max_room_size: 1.0
+      min_damping: 0.2
+      max_damping: 1.0
+      min_wet_dry: 0.2
+      max_wet_dry: 0.8
+      min_width: 0.2
+      max_width: 1.0
+  val_effects:
+    Chorus:
+      _target_: remfx.effects.RandomPedalboardChorus
+      sample_rate: ${sample_rate}
+      min_rate_hz: 1.0
+      max_rate_hz: 1.0
+      min_depth: 0.3
+      max_depth: 0.3
+      min_centre_delay_ms: 7.5
+      max_centre_delay_ms: 7.5
+      min_feedback: 0.4
+      max_feedback: 0.4
+      min_mix: 0.4
+      max_mix: 0.4
+    Distortion:
+      _target_: remfx.effects.RandomPedalboardDistortion
+      sample_rate: ${sample_rate}
+      min_drive_db: 30
+      max_drive_db: 30
+    Compressor:
+      _target_: remfx.effects.RandomPedalboardCompressor
+      sample_rate: ${sample_rate}
+      min_threshold_db: -32
+      max_threshold_db: -32
+      min_ratio: 3.0
+      max_ratio: 3.0
+      min_attack_ms: 10.0
+      max_attack_ms: 10.0
+      min_release_ms: 40.0
+      max_release_ms: 40.0
+    Reverb:
+      _target_: remfx.effects.RandomPedalboardReverb
+      sample_rate: ${sample_rate}
+      min_room_size: 0.5
+      max_room_size: 0.5
+      min_damping: 0.5
+      max_damping: 0.5
+      min_wet_dry: 0.4
+      max_wet_dry: 0.4
+      min_width: 0.5
+      max_width: 0.5
cfg/effects/chorus.yaml
ADDED
@@ -0,0 +1,20 @@
+# @package _global_
+effects:
+  train_effects:
+    Chorus:
+      _target_: remfx.effects.RandomPedalboardChorus
+      sample_rate: ${sample_rate}
+  val_effects:
+    Chorus:
+      _target_: remfx.effects.RandomPedalboardChorus
+      sample_rate: ${sample_rate}
+      min_rate_hz: 1.0
+      max_rate_hz: 1.0
+      min_depth: 0.3
+      max_depth: 0.3
+      min_centre_delay_ms: 7.5
+      max_centre_delay_ms: 7.5
+      min_feedback: 0.4
+      max_feedback: 0.4
+      min_mix: 0.4
+      max_mix: 0.4
cfg/effects/compression.yaml
ADDED
@@ -0,0 +1,22 @@
+# @package _global_
+effects:
+  train_effects:
+    Compressor:
+      _target_: remfx.effects.RandomPedalboardCompressor
+      sample_rate: ${sample_rate}
+      min_threshold_db: -42.0
+      max_threshold_db: -20.0
+      min_ratio: 1.5
+      max_ratio: 6.0
+  val_effects:
+    Compressor:
+      _target_: remfx.effects.RandomPedalboardCompressor
+      sample_rate: ${sample_rate}
+      min_threshold_db: -32
+      max_threshold_db: -32
+      min_ratio: 3.0
+      max_ratio: 3.0
+      min_attack_ms: 10.0
+      max_attack_ms: 10.0
+      min_release_ms: 40.0
+      max_release_ms: 40.0
cfg/effects/distortion.yaml
ADDED
@@ -0,0 +1,14 @@
+# @package _global_
+effects:
+  train_effects:
+    Distortion:
+      _target_: remfx.effects.RandomPedalboardDistortion
+      sample_rate: ${sample_rate}
+      min_drive_db: -10
+      max_drive_db: 50
+  val_effects:
+    Distortion:
+      _target_: remfx.effects.RandomPedalboardDistortion
+      sample_rate: ${sample_rate}
+      min_drive_db: 30
+      max_drive_db: 30
cfg/effects/reverb.yaml
ADDED
@@ -0,0 +1,26 @@
+# @package _global_
+effects:
+  train_effects:
+    Reverb:
+      _target_: remfx.effects.RandomPedalboardReverb
+      sample_rate: ${sample_rate}
+      min_room_size: 0.3
+      max_room_size: 1.0
+      min_damping: 0.2
+      max_damping: 1.0
+      min_wet_dry: 0.2
+      max_wet_dry: 0.8
+      min_width: 0.2
+      max_width: 1.0
+  val_effects:
+    Reverb:
+      _target_: remfx.effects.RandomPedalboardReverb
+      sample_rate: ${sample_rate}
+      min_room_size: 0.5
+      max_room_size: 0.5
+      min_damping: 0.5
+      max_damping: 0.5
+      min_wet_dry: 0.4
+      max_wet_dry: 0.4
+      min_width: 0.5
+      max_width: 0.5
cfg/exp/demucs_all.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package _global_
+defaults:
+  - override /model: demucs
+  - override /effects: all

cfg/exp/demucs_chorus.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package _global_
+defaults:
+  - override /model: demucs
+  - override /effects: chorus

cfg/exp/demucs_compression.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package _global_
+defaults:
+  - override /model: demucs
+  - override /effects: compression

cfg/exp/demucs_distortion.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package _global_
+defaults:
+  - override /model: demucs
+  - override /effects: distortion

cfg/exp/demucs_reverb.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package _global_
+defaults:
+  - override /model: demucs
+  - override /effects: reverb

cfg/exp/umx_all.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package _global_
+defaults:
+  - override /model: umx
+  - override /effects: all

cfg/exp/umx_chorus.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package _global_
+defaults:
+  - override /model: umx
+  - override /effects: chorus

cfg/exp/umx_compression.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package _global_
+defaults:
+  - override /model: umx
+  - override /effects: compression

cfg/exp/umx_distortion.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package _global_
+defaults:
+  - override /model: umx
+  - override /effects: distortion

cfg/exp/umx_reverb.yaml
ADDED
@@ -0,0 +1,4 @@
+# @package _global_
+defaults:
+  - override /model: umx
+  - override /effects: reverb
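For reviewers unfamiliar with Hydra config groups: each of these four-line experiment files just swaps in one model config and one effects config. A minimal sketch of how the composition resolves, using Hydra's standard `compose` API (the printed keys are an assumption based on the configs above, not output from this PR):

```python
# Sketch: resolve the composed config for `+exp=umx_distortion` without training.
# Assumes it runs from the scripts/ directory, the same working dir as scripts/train.py.
from hydra import initialize, compose

with initialize(version_base=None, config_path="../cfg"):
    cfg = compose(config_name="config.yaml", overrides=["+exp=umx_distortion"])
    # cfg.effects.train_effects should now hold the Distortion entry from
    # cfg/effects/distortion.yaml, and cfg.model the contents of cfg/model/umx.yaml.
    print(list(cfg.effects.train_effects.keys()))  # expected: ["Distortion"]
```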
{exp → cfg/model}/audio_diffusion.yaml
RENAMED
File without changes
{exp → cfg/model}/demucs.yaml
RENAMED
@@ -13,11 +13,4 @@ model:
   audio_channels: 1
   nfft: 4096
   sample_rate: ${sample_rate}
-
-dataset:
-  effect_types:
-    Distortion:
-      _target_: remfx.effects.RandomPedalboardDistortion
-      sample_rate: ${sample_rate}
-      min_drive_db: -10
-      max_drive_db: 50
+
{exp → cfg/model}/umx.yaml
RENAMED
@@ -14,11 +14,4 @@ model:
   n_channels: 1
   alpha: 0.3
   sample_rate: ${sample_rate}
-
-dataset:
-  effect_types:
-    Distortion:
-      _target_: remfx.effects.RandomPedalboardDistortion
-      sample_rate: ${sample_rate}
-      min_drive_db: -10
-      max_drive_db: 50
+
config_guitfx.yaml
DELETED
@@ -1,52 +0,0 @@
-defaults:
-  - _self_
-  - exp: null
-seed: 12345
-train: True
-sample_rate: 48000
-logs_dir: "./logs"
-log_every_n_steps: 1000
-
-callbacks:
-  model_checkpoint:
-    _target_: pytorch_lightning.callbacks.ModelCheckpoint
-    monitor: "valid_loss" # name of the logged metric which determines when model is improving
-    save_top_k: 1 # save k best models (determined by above metric)
-    save_last: True # additionaly always save model from last epoch
-    mode: "min" # can be "max" or "min"
-    verbose: False
-    dirpath: ${logs_dir}/ckpts/${now:%Y-%m-%d-%H-%M-%S}
-    filename: '{epoch:02d}-{valid_loss:.3f}'
-
-datamodule:
-  _target_: remfx.datasets.Datamodule
-  dataset:
-    _target_: remfx.datasets.GuitarFXDataset
-    sample_rate: ${sample_rate}
-    root: ${oc.env:DATASET_ROOT}
-    chunk_size_in_sec: 6
-    val_split: 0.2
-  batch_size: 16
-  num_workers: 8
-  pin_memory: True
-  persistent_workers: True
-
-logger:
-  _target_: pytorch_lightning.loggers.WandbLogger
-  project: ${oc.env:WANDB_PROJECT}
-  entity: ${oc.env:WANDB_ENTITY}
-  # offline: False # set True to store all logs only locally
-  job_type: "train"
-  group: ""
-  save_dir: "."
-
-trainer:
-  _target_: pytorch_lightning.Trainer
-  precision: 32 # Precision used for tensors, default `32`
-  min_epochs: 0
-  max_epochs: -1
-  enable_model_summary: False
-  log_every_n_steps: 1 # Logs metrics every N batches
-  accumulate_grad_batches: 1
-  accelerator: null
-  devices: 1
remfx/datasets.py
CHANGED
@@ -1,240 +1,129 @@
 import torch
-from torch.utils.data import Dataset, DataLoader, random_split
+from torch.utils.data import Dataset, DataLoader
 import torchaudio
-import torchaudio.transforms as T
 import torch.nn.functional as F
 from pathlib import Path
 import pytorch_lightning as pl
-from typing import Any, List, Tuple
+from typing import Any, List
 from remfx import effects
-from pedalboard import (
-    Pedalboard,
-    Chorus,
-    Compressor,
-    Phaser,
-    Delay,
-    Reverb,
-    Distortion,
-    Limiter,
-)
-
-# https://zenodo.org/record/7044411/ -> GuitarFX
-# https://zenodo.org/record/3371780 -> GuitarSet
-
-deterministic_effects = {
-    "Distortion": Pedalboard([Distortion()]),
-    "Compressor": Pedalboard([Compressor()]),
-    "Chorus": Pedalboard([Chorus()]),
-    "Phaser": Pedalboard([Phaser()]),
-    "Delay": Pedalboard([Delay()]),
-    "Reverb": Pedalboard([Reverb()]),
-    "Limiter": Pedalboard([Limiter()]),
-}
-
-
-class GuitarFXDataset(Dataset):
+from tqdm import tqdm
+from remfx.utils import create_sequential_chunks
+
+# https://zenodo.org/record/1193957 -> VocalSet
+
+
+class VocalSet(Dataset):
     def __init__(
         self,
         root: str,
         sample_rate: int,
         chunk_size_in_sec: int = 3,
-        effect_types: List[
+        effect_types: List[torch.nn.Module] = None,
+        render_files: bool = True,
+        render_root: str = None,
+        mode: str = "train",
     ):
         super().__init__()
-        self.wet_files = []
-        self.dry_files = []
         self.chunks = []
-        self.labels = []
         self.song_idx = []
         self.root = Path(root)
+        self.render_root = Path(render_root)
         self.chunk_size_in_sec = chunk_size_in_sec
         self.sample_rate = sample_rate
+        self.mode = mode
+
+        mode_path = self.root / self.mode
+        self.files = sorted(list(mode_path.glob("./**/*.wav")))
+        self.normalize = effects.LoudnessNormalize(sample_rate, target_lufs_db=-20)
+        self.effect_types = effect_types

+        self.processed_root = self.render_root / "processed" / self.mode
+
+        self.num_chunks = 0
+        print("Total files:", len(self.files))
+        print("Processing files...")
+        if render_files:
+            # Split audio file into chunks, resample, then apply random effects
+            self.processed_root.mkdir(parents=True, exist_ok=True)
+            for audio_file in tqdm(self.files, total=len(self.files)):
+                chunks, orig_sr = create_sequential_chunks(
+                    audio_file, self.chunk_size_in_sec
+                )
+                for chunk in chunks:
+                    resampled_chunk = torchaudio.functional.resample(
+                        chunk, orig_sr, sample_rate
+                    )
+                    chunk_size_in_samples = self.chunk_size_in_sec * self.sample_rate
+                    if resampled_chunk.shape[-1] < chunk_size_in_samples:
+                        resampled_chunk = F.pad(
+                            resampled_chunk,
+                            (0, chunk_size_in_samples - resampled_chunk.shape[1]),
+                        )
+                    # Apply effect
+                    effect_idx = torch.rand(1).item() * len(self.effect_types.keys())
+                    effect_name = list(self.effect_types.keys())[int(effect_idx)]
+                    effect = self.effect_types[effect_name]
+                    effected_input = effect(resampled_chunk)
+                    # Normalize
+                    normalized_input = self.normalize(effected_input)
+                    normalized_target = self.normalize(resampled_chunk)
+
+                    output_dir = self.processed_root / str(self.num_chunks)
+                    output_dir.mkdir(exist_ok=True)
+                    torchaudio.save(
+                        output_dir / "input.wav", normalized_input, self.sample_rate
+                    )
+                    torchaudio.save(
+                        output_dir / "target.wav", normalized_target, self.sample_rate
+                    )
+                    torch.save(effect_name, output_dir / "effect_name.pt")
+                    self.num_chunks += 1
+        else:
+            self.num_chunks = len(list(self.processed_root.iterdir()))
+
         print(
-            f"Found {len(self.
-            f"Total chunks: {
+            f"Found {len(self.files)} {self.mode} files .\n"
+            f"Total chunks: {self.num_chunks}"
         )
-        self.resampler = T.Resample(orig_sr, sample_rate)

     def __len__(self):
-        return len(self.chunks)
+        return self.num_chunks

     def __getitem__(self, idx):
-        chunk_start = self.chunks[idx]
-        chunk_size_in_samples = self.chunk_size_in_sec * sr
-        x = x[:, chunk_start : chunk_start + chunk_size_in_samples]
-        y = y[:, chunk_start : chunk_start + chunk_size_in_samples]
-
-        resampled_x = self.resampler(x)
-        resampled_y = self.resampler(y)
-        # Reset chunk size to be new sample rate
-        chunk_size_in_samples = self.chunk_size_in_sec * self.sample_rate
-        # Pad to chunk_size if needed
-        if resampled_x.shape[-1] < chunk_size_in_samples:
-            resampled_x = F.pad(
-                resampled_x, (0, chunk_size_in_samples - resampled_x.shape[1])
-            )
-        if resampled_y.shape[-1] < chunk_size_in_samples:
-            resampled_y = F.pad(
-                resampled_y, (0, chunk_size_in_samples - resampled_y.shape[1])
-            )
-        return (resampled_x, resampled_y, effect_label)
-
-
-class GuitarSet(Dataset):
-    def __init__(
-        self,
-        root: str,
-        sample_rate: int,
-        chunk_size_in_sec: int = 3,
-        effect_types: List[torch.nn.Module] = None,
-    ):
-        super().__init__()
-        self.chunks = []
-        self.song_idx = []
-        self.root = Path(root)
-        self.chunk_size_in_sec = chunk_size_in_sec
-        self.files = sorted(list(self.root.glob("./**/*.wav")))
-        self.sample_rate = sample_rate
-        for i, audio_file in enumerate(self.files):
-            chunk_starts, orig_sr = create_sequential_chunks(
-                audio_file, self.chunk_size_in_sec
-            )
-            self.chunks += chunk_starts
-            self.song_idx += [i] * len(chunk_starts)
-        print(f"Found {len(self.files)} files .\n" f"Total chunks: {len(self.chunks)}")
-        self.resampler = T.Resample(orig_sr, sample_rate)
-        self.effect_types = effect_types
-        self.normalize = effects.LoudnessNormalize(sample_rate, target_lufs_db=-20)
-        self.mode = "train"
-
-    def __len__(self):
-        return len(self.chunks)
-
-    def __getitem__(self, idx):
-        # Load and effect audio
-        song_idx = self.song_idx[idx]
-        x, sr = torchaudio.load(self.files[song_idx])
-        chunk_start = self.chunks[idx]
-        chunk_size_in_samples = self.chunk_size_in_sec * sr
-        x = x[:, chunk_start : chunk_start + chunk_size_in_samples]
-        resampled_x = self.resampler(x)
-        # Reset chunk size to be new sample rate
-        chunk_size_in_samples = self.chunk_size_in_sec * self.sample_rate
-        # Pad to chunk_size if needed
-        if resampled_x.shape[-1] < chunk_size_in_samples:
-            resampled_x = F.pad(
-                resampled_x, (0, chunk_size_in_samples - resampled_x.shape[1])
-            )
-
-        # Add random effect if train
-        if self.mode == "train":
-            random_effect_idx = torch.rand(1).item() * len(self.effect_types.keys())
-            effect_name = list(self.effect_types.keys())[int(random_effect_idx)]
-            effect = self.effect_types[effect_name]
-            effected_input = effect(resampled_x)
-        else:
-            # deterministic static effect for eval
-            effect_idx = idx % len(self.effect_types.keys())
-            effect_name = list(self.effect_types.keys())[effect_idx]
-            effect = deterministic_effects[effect_name]
-            effected_input = torch.from_numpy(
-                effect(resampled_x.numpy(), self.sample_rate)
-            )
-        normalized_input = self.normalize(effected_input)
-        normalized_target = self.normalize(resampled_x)
-        return (normalized_input, normalized_target, effect_name)
-
-
-def create_random_chunks(
-    audio_file: str, chunk_size: int, num_chunks: int
-) -> Tuple[List[Tuple[int, int]], int]:
-    """Create num_chunks random chunks of size chunk_size (seconds)
-    from an audio file.
-    Return sample_index of start of each chunk and original sr
-    """
-    audio, sr = torchaudio.load(audio_file)
-    chunk_size_in_samples = chunk_size * sr
-    if chunk_size_in_samples >= audio.shape[-1]:
-        chunk_size_in_samples = audio.shape[-1] - 1
-    chunks = []
-    for i in range(num_chunks):
-        start = torch.randint(0, audio.shape[-1] - chunk_size_in_samples, (1,)).item()
-        chunks.append(start)
-    return chunks, sr
-
-
-def create_sequential_chunks(
-    audio_file: str, chunk_size: int
-) -> Tuple[List[Tuple[int, int]], int]:
-    """Create sequential chunks of size chunk_size (seconds) from an audio file.
-    Return sample_index of start of each chunk and original sr
-    """
-    audio, sr = torchaudio.load(audio_file)
-    chunk_size_in_samples = chunk_size * sr
-    chunk_starts = torch.arange(0, audio.shape[-1], chunk_size_in_samples)
-    return chunk_starts, sr
+        input_file = self.processed_root / str(idx) / "input.wav"
+        target_file = self.processed_root / str(idx) / "target.wav"
+        effect_name = torch.load(self.processed_root / str(idx) / "effect_name.pt")
+        input, sr = torchaudio.load(input_file)
+        target, sr = torchaudio.load(target_file)
+        return (input, target, effect_name)


-class Datamodule(pl.LightningDataModule):
+class VocalSetDatamodule(pl.LightningDataModule):
     def __init__(
         self,
-        dataset,
+        train_dataset,
+        val_dataset,
+        test_dataset,
         *,
-        val_split: float,
         batch_size: int,
         num_workers: int,
         pin_memory: bool = False,
         **kwargs: int,
     ) -> None:
         super().__init__()
-        self.dataset = dataset
-        self.val_split = val_split
+        self.train_dataset = train_dataset
+        self.val_dataset = val_dataset
+        self.test_dataset = test_dataset
         self.batch_size = batch_size
         self.num_workers = num_workers
         self.pin_memory = pin_memory
-        self.data_train: Any = None
-        self.data_val: Any = None

     def setup(self, stage: Any = None) -> None:
-        train_size = round(split[0] * len(self.dataset))
-        val_size = round(split[1] * len(self.dataset))
-        self.data_train, self.data_val = random_split(
-            self.dataset, [train_size, val_size]
-        )
-        self.data_val.dataset.mode = "val"
+        pass

     def train_dataloader(self) -> DataLoader:
         return DataLoader(
-            dataset=self.data_train,
+            dataset=self.train_dataset,
             batch_size=self.batch_size,
             num_workers=self.num_workers,
             pin_memory=self.pin_memory,
@@ -243,7 +132,16 @@ class Datamodule(pl.LightningDataModule):

     def val_dataloader(self) -> DataLoader:
         return DataLoader(
-            dataset=self.data_val,
+            dataset=self.val_dataset,
+            batch_size=self.batch_size,
+            num_workers=self.num_workers,
+            pin_memory=self.pin_memory,
+            shuffle=False,
+        )
+
+    def test_dataloader(self) -> DataLoader:
+        return DataLoader(
+            dataset=self.test_dataset,
             batch_size=self.batch_size,
             num_workers=self.num_workers,
             pin_memory=self.pin_memory,
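A hedged usage sketch of the new dataset API, for review purposes. The directory layout and the single-effect dict below are assumptions consistent with cfg/config.yaml and cfg/effects/distortion.yaml, not code from this PR:

```python
# Instantiate VocalSet roughly the way cfg/config.yaml's train_dataset block would.
# Assumes ./data/VocalSet/train already contains the manually split singers.
from remfx.datasets import VocalSet
from remfx.effects import RandomPedalboardDistortion

effect_types = {
    "Distortion": RandomPedalboardDistortion(
        sample_rate=48000, min_drive_db=-10, max_drive_db=50
    )
}
dataset = VocalSet(
    root="./data/VocalSet",
    sample_rate=48000,
    chunk_size_in_sec=6,
    effect_types=effect_types,
    render_files=True,            # render chunks to disk on the first run
    render_root="./data/processed",
    mode="train",
)
x, y, effect_name = dataset[0]    # (effected input, clean target, effect label)
```

Note the design change this diff makes: effects are now applied once at render time and chunks are read back from disk in `__getitem__`, rather than being applied on the fly per batch.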
remfx/effects.py
CHANGED
@@ -574,7 +574,7 @@ class RandomSoxReverb(torch.nn.Module):
         return (x * (1 - wet_dry)) + (y * wet_dry)


-class
+class RandomPedalboardReverb(torch.nn.Module):
     def __init__(
         self,
         sample_rate: float,
remfx/models.py
CHANGED
@@ -5,8 +5,8 @@ from einops import rearrange
 import wandb
 from audio_diffusion_pytorch import DiffusionModel
 from auraloss.time import SISDRLoss
 from auraloss.freq import MultiResolutionSTFTLoss
-from
+from remfx.utils import FADLoss

 from umx.openunmix.model import OpenUnmix, Separator
 from torchaudio.models import HDemucs
@@ -34,12 +34,11 @@ class RemFXModel(pl.LightningModule):
         self.metrics = torch.nn.ModuleDict(
             {
                 "SISDR": SISDRLoss(),
-                "STFT":
-                "
+                "STFT": MultiResolutionSTFTLoss(),
+                "FAD": FADLoss(sample_rate=sample_rate),
             }
         )
         # Log first batch metrics input vs output only once
-        self.log_first_metrics = True
         self.log_train_audio = True

     @property
@@ -64,30 +63,39 @@ class RemFXModel(pl.LightningModule):
         loss = self.common_step(batch, batch_idx, mode="valid")
         return loss

+    def test_step(self, batch, batch_idx):
+        loss = self.common_step(batch, batch_idx, mode="test")
+        return loss
+
     def common_step(self, batch, batch_idx, mode: str = "train"):
         loss, output = self.model(batch)
         self.log(f"{mode}_loss", loss)
         x, y, label = batch
         # Metric logging
+        with torch.no_grad():
+            for metric in self.metrics:
+                # SISDR returns negative values, so negate them
+                if metric == "SISDR":
+                    negate = -1
+                else:
+                    negate = 1
+                # Only Log FAD on test set
+                if metric == "FAD" and mode != "test":
+                    continue
+                self.log(
+                    f"{mode}_{metric}",
+                    negate * self.metrics[metric](output, y),
+                    on_step=False,
+                    on_epoch=True,
+                    logger=True,
+                    prog_bar=True,
+                    sync_dist=True,
+                )

         return loss

     def on_train_batch_start(self, batch, batch_idx):
+        # Log initial audio
         if self.log_train_audio:
             x, y, label = batch
             # Concat samples together for easier viewing in dashboard
@@ -110,29 +118,29 @@ class RemFXModel(pl.LightningModule):
             )
             self.log_train_audio = False

-    def on_validation_epoch_start(self):
-        self.log_next = True
-
     def on_validation_batch_start(self, batch, batch_idx, dataloader_idx):
+        x, target, label = batch
+        # Log Input Metrics
+        for metric in self.metrics:
+            # SISDR returns negative values, so negate them
+            if metric == "SISDR":
+                negate = -1
+            else:
+                negate = 1
+            # Only Log FAD on test set
+            if metric == "FAD":
+                continue
+            self.log(
+                f"Input_{metric}",
+                negate * self.metrics[metric](x, target),
+                on_step=False,
+                on_epoch=True,
+                logger=True,
+                prog_bar=True,
+                sync_dist=True,
+            )
+        # Only run on first batch
+        if batch_idx == 0:
             self.model.eval()
             with torch.no_grad():
                 y = self.model.sample(x)
@@ -150,9 +158,22 @@ class RemFXModel(pl.LightningModule):
                 sampling_rate=self.sample_rate,
                 caption=f"Epoch {self.current_epoch}",
             )
-        self.log_next = False
         self.model.train()

+    def on_test_batch_start(self, batch, batch_idx, dataloader_idx):
+        self.on_validation_batch_start(batch, batch_idx, dataloader_idx)
+        # Log FAD
+        x, target, label = batch
+        self.log(
+            "Input_FAD",
+            self.metrics["FAD"](x, target),
+            on_step=False,
+            on_epoch=True,
+            logger=True,
+            prog_bar=True,
+            sync_dist=True,
+        )
+

 class OpenUnmixModel(torch.nn.Module):
     def __init__(
@@ -184,16 +205,17 @@ class OpenUnmixModel(torch.nn.Module):
             n_fft=self.n_fft,
             n_hop=self.hop_length,
         )
-        self.
+        self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=self.sample_rate
         )
+        self.l1loss = torch.nn.L1Loss()

     def forward(self, batch):
         x, target, label = batch
         X = spectrogram(x, self.window, self.n_fft, self.hop_length, self.alpha)
         Y = self.model(X)
         sep_out = self.separator(x).squeeze(1)
-        loss = self.
+        loss = self.mrstftloss(sep_out, target) + self.l1loss(sep_out, target)

         return loss, sep_out

@@ -206,14 +228,15 @@ class DemucsModel(torch.nn.Module):
         super().__init__()
         self.model = HDemucs(**kwargs)
         self.num_bins = kwargs["nfft"] // 2 + 1
-        self.
+        self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=sample_rate
         )
+        self.l1loss = torch.nn.L1Loss()

     def forward(self, batch):
         x, target, label = batch
         output = self.model(x).squeeze(1)
-        loss = self.
+        loss = self.mrstftloss(output, target) + self.l1loss(output, target)
         return loss, output

     def sample(self, x: Tensor) -> Tensor:
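A short note on the `negate` logic above: auraloss's `SISDRLoss` returns the negative SI-SDR so that minimizing it maximizes the metric, hence the sign flip before logging. A minimal sketch with random tensors, purely illustrative:

```python
import torch
from auraloss.time import SISDRLoss

sisdr = SISDRLoss()
pred, target = torch.randn(1, 1, 48000), torch.randn(1, 1, 48000)
loss = sisdr(pred, target)  # negative SI-SDR, suitable as a training loss
metric = -1 * loss          # what gets logged as "{mode}_SISDR": higher is better
```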
remfx/utils.py
CHANGED
@@ -1,8 +1,12 @@
 import logging
-from typing import List
+from typing import List, Tuple
 import pytorch_lightning as pl
 from omegaconf import DictConfig
 from pytorch_lightning.utilities import rank_zero_only
+from frechet_audio_distance import FrechetAudioDistance
+import numpy as np
+import torch
+import torchaudio


 def get_logger(name=__name__) -> logging.Logger:
@@ -69,3 +73,69 @@ def log_hyperparameters(
     hparams["callbacks"] = config["callbacks"]

     logger.experiment.config.update(hparams)
+
+
+class FADLoss(torch.nn.Module):
+    def __init__(self, sample_rate: float):
+        super().__init__()
+        self.fad = FrechetAudioDistance(
+            use_pca=False, use_activation=False, verbose=False
+        )
+        self.fad.model = self.fad.model.to("cpu")
+        self.sr = sample_rate
+
+    def forward(self, audio_background, audio_eval):
+        embds_background = []
+        embds_eval = []
+        for sample in audio_background:
+            embd = self.fad.model.forward(sample.T.cpu().detach().numpy(), self.sr)
+            embds_background.append(embd.cpu().detach().numpy())
+        for sample in audio_eval:
+            embd = self.fad.model.forward(sample.T.cpu().detach().numpy(), self.sr)
+            embds_eval.append(embd.cpu().detach().numpy())
+        embds_background = np.concatenate(embds_background, axis=0)
+        embds_eval = np.concatenate(embds_eval, axis=0)
+        mu_background, sigma_background = self.fad.calculate_embd_statistics(
+            embds_background
+        )
+        mu_eval, sigma_eval = self.fad.calculate_embd_statistics(embds_eval)
+
+        fad_score = self.fad.calculate_frechet_distance(
+            mu_background, sigma_background, mu_eval, sigma_eval
+        )
+        return fad_score
+
+
+def create_random_chunks(
+    audio_file: str, chunk_size: int, num_chunks: int
+) -> Tuple[List[Tuple[int, int]], int]:
+    """Create num_chunks random chunks of size chunk_size (seconds)
+    from an audio file.
+    Return sample_index of start of each chunk and original sr
+    """
+    audio, sr = torchaudio.load(audio_file)
+    chunk_size_in_samples = chunk_size * sr
+    if chunk_size_in_samples >= audio.shape[-1]:
+        chunk_size_in_samples = audio.shape[-1] - 1
+    chunks = []
+    for i in range(num_chunks):
+        start = torch.randint(0, audio.shape[-1] - chunk_size_in_samples, (1,)).item()
+        chunks.append(start)
+    return chunks, sr
+
+
+def create_sequential_chunks(
+    audio_file: str, chunk_size: int
+) -> Tuple[List[Tuple[int, int]], int]:
+    """Create sequential chunks of size chunk_size (seconds) from an audio file.
+    Return sample_index of start of each chunk and original sr
+    """
+    chunks = []
+    audio, sr = torchaudio.load(audio_file)
+    chunk_size_in_samples = chunk_size * sr
+    chunk_starts = torch.arange(0, audio.shape[-1], chunk_size_in_samples)
+    for start in chunk_starts:
+        if start + chunk_size_in_samples > audio.shape[-1]:
+            break
+        chunks.append(audio[:, start : start + chunk_size_in_samples])
+    return chunks, sr
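A hedged usage sketch for the new `FADLoss` wrapper. The batch shape is an assumption (batched mono audio, `(batch, channels, samples)`); note the embedding model is pinned to CPU in `__init__`, so evaluation is CPU-bound regardless of input device:

```python
import torch
from remfx.utils import FADLoss

fad = FADLoss(sample_rate=48000)       # matches sample_rate in cfg/config.yaml
target = torch.randn(4, 1, 48000 * 6)  # clean reference chunks
output = torch.randn(4, 1, 48000 * 6)  # model outputs to evaluate
score = fad(target, output)            # scalar Frechet Audio Distance; lower is better
```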
scripts/test.py
ADDED
@@ -0,0 +1,55 @@
+import pytorch_lightning as pl
+import hydra
+from omegaconf import DictConfig
+import remfx.utils as utils
+from pytorch_lightning.utilities.model_summary import ModelSummary
+from remfx.models import RemFXModel
+import torch
+
+log = utils.get_logger(__name__)
+
+
+@hydra.main(version_base=None, config_path="../cfg", config_name="config.yaml")
+def main(cfg: DictConfig):
+    # Apply seed for reproducibility
+    if cfg.seed:
+        pl.seed_everything(cfg.seed)
+    cfg.render_files = False
+    log.info(f"Instantiating datamodule <{cfg.datamodule._target_}>.")
+    datamodule = hydra.utils.instantiate(cfg.datamodule, _convert_="partial")
+    log.info(f"Instantiating model <{cfg.model._target_}>.")
+    model = hydra.utils.instantiate(cfg.model, _convert_="partial")
+    state_dict = torch.load(cfg.ckpt_path, map_location=torch.device("cpu"))[
+        "state_dict"
+    ]
+    model.load_state_dict(state_dict)
+
+    # Init all callbacks
+    callbacks = []
+    if "callbacks" in cfg:
+        for _, cb_conf in cfg["callbacks"].items():
+            if "_target_" in cb_conf:
+                log.info(f"Instantiating callback <{cb_conf._target_}>.")
+                callbacks.append(hydra.utils.instantiate(cb_conf, _convert_="partial"))
+
+    logger = hydra.utils.instantiate(cfg.logger, _convert_="partial")
+    log.info(f"Instantiating trainer <{cfg.trainer._target_}>.")
+    trainer = hydra.utils.instantiate(
+        cfg.trainer, callbacks=callbacks, logger=logger, _convert_="partial"
+    )
+    log.info("Logging hyperparameters!")
+    utils.log_hyperparameters(
+        config=cfg,
+        model=model,
+        datamodule=datamodule,
+        trainer=trainer,
+        callbacks=callbacks,
+        logger=logger,
+    )
+    summary = ModelSummary(model)
+    print(summary)
+    trainer.test(model=model, datamodule=datamodule)
+
+
+if __name__ == "__main__":
+    main()
scripts/train.py
CHANGED
@@ -7,12 +7,11 @@ from pytorch_lightning.utilities.model_summary import ModelSummary
 log = utils.get_logger(__name__)


-@hydra.main(version_base=None, config_path="../", config_name="config.yaml")
+@hydra.main(version_base=None, config_path="../cfg", config_name="config.yaml")
 def main(cfg: DictConfig):
     # Apply seed for reproducibility
     if cfg.seed:
         pl.seed_everything(cfg.seed)
-
     log.info(f"Instantiating datamodule <{cfg.datamodule._target_}>.")
     datamodule = hydra.utils.instantiate(cfg.datamodule, _convert_="partial")
     log.info(f"Instantiating model <{cfg.model._target_}>.")
setup.py
CHANGED
@@ -46,6 +46,7 @@ setup(
         "auraloss",
         "pyloudnorm",
         "pedalboard",
+        "frechet_audio_distance",
     ],
     include_package_data=True,
     license="Apache License 2.0",
shell_vars.sh
CHANGED
@@ -1,3 +1,3 @@
-export DATASET_ROOT="./data/
+export DATASET_ROOT="./data/VocalSet"
 export WANDB_PROJECT="RemFX"
 export WANDB_ENTITY="mattricesound"