Spaces:

mattricesound
/

RemFx

Runtime error

App Files Files Community

mattricesound committed on Mar 13, 2023

Commit

9a9a2c9

1 Parent(s): 7d6f241

Update callbacks, debug new models

Browse files

Files changed (12) hide show

cfg/config.yaml +5 -1
cfg/model/audio_diffusion.yaml +2 -2
cfg/model/dcunet.yaml +5 -3
cfg/model/demucs.yaml +1 -2
cfg/model/dptnet.yaml +3 -1
cfg/model/umx.yaml +1 -2
remfx/callbacks.py +128 -0
remfx/datasets.py +0 -1
remfx/dcunet.py +2 -2
remfx/dptnet.py +1 -2
remfx/models.py +40 -121
remfx/utils.py +3 -29

cfg/config.yaml CHANGED Viewed

@@ -41,6 +41,11 @@ callbacks:
   learning_rate_monitor:
     _target_: pytorch_lightning.callbacks.LearningRateMonitor
     logging_interval: "step"
 datamodule:
   _target_: remfx.datasets.VocalSetDatamodule
@@ -116,4 +121,3 @@ trainer:
   devices: 1
   gradient_clip_val: 10.0
   max_steps: 50000

   learning_rate_monitor:
     _target_: pytorch_lightning.callbacks.LearningRateMonitor
     logging_interval: "step"
+  audio_logging:
+    _target_: remfx.callbacks.AudioCallback
+    sample_rate: ${sample_rate}
+  metric_logging:
+    _target_: remfx.callbacks.MetricCallback
 datamodule:
   _target_: remfx.datasets.VocalSetDatamodule
   devices: 1
   gradient_clip_val: 10.0
   max_steps: 50000

cfg/model/audio_diffusion.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
 model:
-  _target_: remfx.models.RemFx
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
@@ -13,4 +13,4 @@ model:
 datamodule:
   dataset:
     effect_types: ["Clean"]
-  batch_size: 2

 # @package _global_
 model:
+  _target_: remfx.models.RemFX
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
 datamodule:
   dataset:
     effect_types: ["Clean"]
+  batch_size: 2

cfg/model/dcunet.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
 model:
-  _target_: remfx.models.RemFx
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
@@ -9,7 +9,7 @@ model:
   sample_rate: ${sample_rate}
   network:
     _target_: remfx.models.DCUNetModel
-    spec_dim: 256 + 1
     hidden_dim: 768
     filter_len: 512
     hop_len: 64
@@ -19,4 +19,6 @@ model:
     refine_layers: 1
     is_mask: True
     norm: 'ins'
-    act: 'comp'

 # @package _global_
 model:
+  _target_: remfx.models.RemFX
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
   sample_rate: ${sample_rate}
   network:
     _target_: remfx.models.DCUNetModel
+    spec_dim: 257
     hidden_dim: 768
     filter_len: 512
     hop_len: 64
     refine_layers: 1
     is_mask: True
     norm: 'ins'
+    act: 'comp'
+    sample_rate: ${sample_rate}
+    num_bins: 1025

cfg/model/demucs.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
 model:
-  _target_: remfx.models.RemFx
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
@@ -13,4 +13,3 @@ model:
     audio_channels: 1
     nfft: 4096
     sample_rate: ${sample_rate}

 # @package _global_
 model:
+  _target_: remfx.models.RemFX
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
     audio_channels: 1
     nfft: 4096
     sample_rate: ${sample_rate}

cfg/model/dptnet.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
 model:
-  _target_: remfx.models.RemFx
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
@@ -16,3 +16,5 @@ model:
     segment_size: 250
     nspk: 1
     win_len: 2

 # @package _global_
 model:
+  _target_: remfx.models.RemFX
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
     segment_size: 250
     nspk: 1
     win_len: 2
+    sample_rate: ${sample_rate}
+    num_bins: 1025

cfg/model/umx.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
 model:
-  _target_: remfx.models.RemFx
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
@@ -14,4 +14,3 @@ model:
     n_channels: 1
     alpha: 0.3
     sample_rate: ${sample_rate}

 # @package _global_
 model:
+  _target_: remfx.models.RemFX
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
     n_channels: 1
     alpha: 0.3
     sample_rate: ${sample_rate}

remfx/callbacks.py ADDED Viewed

	@@ -0,0 +1,128 @@

+from pytorch_lightning.callbacks import Callback
+import pytorch_lightning as pl
+from einops import rearrange
+import torch
+import wandb
+from torch import Tensor
+class AudioCallback(Callback):
+    """Log audio examples through the trainer's logger.
+
+    The first training batch is logged once (effected input and target).
+    Whenever a validation or test batch has ``batch_idx == 0``, the model's
+    prediction, the input and the target are joined along the last axis and
+    logged together.
+
+    Args:
+        sample_rate: Sample rate handed to the audio logger; also used to size
+            the silence gap placed between the joined clips.
+    """
+    def __init__(self, sample_rate, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # One-shot flag: cleared once the first training batch has been logged.
+        self.log_train_audio = True
+        self.sample_rate = sample_rate
+    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
+        """Log input and target audio of the first training batch only."""
+        if not self.log_train_audio:
+            return
+        x, y, _, _ = batch
+        # Concat samples together for easier viewing in dashboard
+        input_samples = rearrange(x, "b c t -> c (b t)").unsqueeze(0)
+        target_samples = rearrange(y, "b c t -> c (b t)").unsqueeze(0)
+        log_wandb_audio_batch(
+            logger=trainer.logger,
+            id="input_effected_audio",
+            samples=input_samples.cpu(),
+            sampling_rate=self.sample_rate,
+            caption="Training Data",
+        )
+        log_wandb_audio_batch(
+            logger=trainer.logger,
+            id="target_audio",
+            samples=target_samples.cpu(),
+            sampling_rate=self.sample_rate,
+            caption="Target Data",
+        )
+        self.log_train_audio = False
+    def on_validation_batch_start(
+        self, trainer, pl_module, batch, batch_idx, dataloader_idx
+    ):
+        """Log prediction, input and target audio when ``batch_idx`` is 0."""
+        x, target, _, _ = batch
+        # Only run on first batch
+        if batch_idx != 0:
+            return
+        with torch.no_grad():
+            y = pl_module.model.sample(x)
+        # Concat samples together for easier viewing in dashboard,
+        # with 2 seconds of silence between each sample.
+        # Slice the last (time) axis: slicing axis 1 would index channels and
+        # leave the "silence" as long as the whole input.
+        silence = torch.zeros_like(x)[..., : self.sample_rate * 2]
+        concat_samples = torch.cat([y, silence, x, silence, target], dim=-1)
+        log_wandb_audio_batch(
+            logger=trainer.logger,
+            id="prediction_input_target",
+            samples=concat_samples.cpu(),
+            sampling_rate=self.sample_rate,
+            caption=f"Epoch {trainer.current_epoch}",
+        )
+    def on_test_batch_start(self, *args):
+        """Reuse the validation logging for test batches."""
+        self.on_validation_batch_start(*args)
+class MetricCallback(Callback):
+    """Log the module's metrics computed on the raw input against the target."""
+    def on_validation_batch_start(
+        self, trainer, pl_module, batch, batch_idx, dataloader_idx
+    ):
+        """Log ``Input_<metric>`` for every metric of ``pl_module`` except FAD."""
+        effected, clean, _, _ = batch
+        log_kwargs = dict(
+            on_step=False,
+            on_epoch=True,
+            logger=True,
+            prog_bar=True,
+            sync_dist=True,
+        )
+        for name in pl_module.metrics:
+            # FAD is only logged on the test set
+            if name == "FAD":
+                continue
+            # SISDR returns negative values, so flip its sign
+            sign = -1 if name == "SISDR" else 1
+            value = sign * pl_module.metrics[name](effected, clean)
+            pl_module.log(f"Input_{name}", value, **log_kwargs)
+    def on_test_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
+        """Log the validation-time input metrics, then ``Input_FAD`` as well."""
+        self.on_validation_batch_start(
+            trainer, pl_module, batch, batch_idx, dataloader_idx
+        )
+        effected, clean, _, _ = batch
+        fad_value = pl_module.metrics["FAD"](effected, clean)
+        pl_module.log(
+            "Input_FAD",
+            fad_value,
+            on_step=False,
+            on_epoch=True,
+            logger=True,
+            prog_bar=True,
+            sync_dist=True,
+        )
+def log_wandb_audio_batch(
+    logger: pl.loggers.WandbLogger,
+    id: str,
+    samples: Tensor,
+    sampling_rate: int,
+    caption: str = "",
+    max_items: int = 10,
+):
+    """Log the leading items of an audio batch as separate W&B audio entries.
+
+    Args:
+        logger: Logger whose ``experiment.log`` receives each entry.
+        id: Key prefix; entries are named ``<id>_<index>``.
+        samples: Batch laid out as (batch, channel, time).
+        sampling_rate: Sample rate attached to every clip.
+        caption: Caption attached to every clip.
+        max_items: Upper bound on how many batch items are logged.
+    """
+    item_count = samples.shape[0]
+    # Each clip is handed to wandb.Audio as (time, channel)
+    clips = rearrange(samples, "b c t -> b t c")
+    for idx in range(min(item_count, max_items)):
+        clip = wandb.Audio(
+            clips[idx].cpu().numpy(),
+            caption=caption,
+            sample_rate=sampling_rate,
+        )
+        logger.experiment.log({f"{id}_{idx}": clip})

remfx/datasets.py CHANGED Viewed

@@ -5,7 +5,6 @@ import torch
 import shutil
 import torchaudio
 import pytorch_lightning as pl
-import torch.nn.functional as F
 from tqdm import tqdm
 from pathlib import Path

 import shutil
 import torchaudio
 import pytorch_lightning as pl
 from tqdm import tqdm
 from pathlib import Path

remfx/dcunet.py CHANGED Viewed

@@ -5,11 +5,11 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import numpy as np
-from utils import single, concat_complex
 from torch.nn.init import calculate_gain
 from typing import Tuple
 from scipy.signal import get_window
 from librosa.util import pad_center
 class ComplexConvBlock(nn.Module):
@@ -549,7 +549,7 @@ class ComplexActLayer(nn.Module):
     def forward(self, x):
         real, img = x.chunk(2, 1)
-        return torch.cat([F.leaky_relu_(real), torch.tanh(img) * np.pi], dim=1)
 class STFT(nn.Module):

 import torch.nn as nn
 import torch.nn.functional as F
 import numpy as np
 from torch.nn.init import calculate_gain
 from typing import Tuple
 from scipy.signal import get_window
 from librosa.util import pad_center
+from remfx.utils import single, concat_complex
 class ComplexConvBlock(nn.Module):
     def forward(self, x):
         real, img = x.chunk(2, 1)
+        return torch.cat([F.leaky_relu(real), torch.tanh(img) * np.pi], dim=1)
 class STFT(nn.Module):

remfx/dptnet.py CHANGED Viewed

@@ -57,11 +57,10 @@ class DPTNet_base(nn.Module):
         self.mask_conv1x1 = nn.Conv1d(self.feature_dim, self.enc_dim, 1, bias=False)
         self.decoder = DPTDecoder(n_filters=enc_dim, window_size=win_len)
-    def forward(self, batch):
         """
         mix: shape (batch, T)
         """
-        mix, target = batch
         batch_size = mix.shape[0]
         mix = self.dpt_encoder(mix)  # (B, E, L)

         self.mask_conv1x1 = nn.Conv1d(self.feature_dim, self.enc_dim, 1, bias=False)
         self.decoder = DPTDecoder(n_filters=enc_dim, window_size=win_len)
+    def forward(self, mix):
         """
         mix: shape (batch, T)
         """
         batch_size = mix.shape[0]
         mix = self.dpt_encoder(mix)  # (B, E, L)

remfx/models.py CHANGED Viewed

@@ -2,16 +2,16 @@ import torch
 import torchmetrics
 import pytorch_lightning as pl
 from torch import Tensor, nn
-from einops import rearrange
 from torchaudio.models import HDemucs
 from audio_diffusion_pytorch import DiffusionModel
 from auraloss.time import SISDRLoss
 from auraloss.freq import MultiResolutionSTFTLoss
 from umx.openunmix.model import OpenUnmix, Separator
-from utils import FADLoss, spectrogram, log_wandb_audio_batch
-from dptnet import DPTNet_base
-from dcunet import RefineSpectrogramUnet
 class RemFX(pl.LightningModule):
@@ -55,41 +55,29 @@ class RemFX(pl.LightningModule):
             eps=self.lr_eps,
             weight_decay=self.lr_weight_decay,
         )
-        return optimizer
-    # Add step-based learning rate scheduler
-    def optimizer_step(
-        self,
-        epoch,
-        batch_idx,
-        optimizer,
-        optimizer_idx,
-        optimizer_closure,
-        on_tpu,
-        using_lbfgs,
-    ):
-        # update params
-        optimizer.step(closure=optimizer_closure)
-        # update learning rate. Reduce by factor of 10 at 80% and 95% of training
-        if self.trainer.global_step == 0.8 * self.trainer.max_steps:
-            for pg in optimizer.param_groups:
-                pg["lr"] = 0.1 * pg["lr"]
-        if self.trainer.global_step == 0.95 * self.trainer.max_steps:
-            for pg in optimizer.param_groups:
-                pg["lr"] = 0.1 * pg["lr"]
     def training_step(self, batch, batch_idx):
-        loss = self.common_step(batch, batch_idx, mode="train")
-        return loss
     def validation_step(self, batch, batch_idx):
-        loss = self.common_step(batch, batch_idx, mode="valid")
-        return loss
     def test_step(self, batch, batch_idx):
-        loss = self.common_step(batch, batch_idx, mode="test")
-        return loss
     def common_step(self, batch, batch_idx, mode: str = "train"):
         x, y, _, _ = batch  # x, y = (B, C, T), (B, C, T)
@@ -116,89 +104,8 @@ class RemFX(pl.LightningModule):
                     prog_bar=True,
                     sync_dist=True,
                 )
         return loss
-    def on_train_batch_start(self, batch, batch_idx):
-        # Log initial audio
-        if self.log_train_audio:
-            x, y, _, _ = batch
-            # Concat samples together for easier viewing in dashboard
-            input_samples = rearrange(x, "b c t -> c (b t)").unsqueeze(0)
-            target_samples = rearrange(y, "b c t -> c (b t)").unsqueeze(0)
-            log_wandb_audio_batch(
-                logger=self.logger,
-                id="input_effected_audio",
-                samples=input_samples.cpu(),
-                sampling_rate=self.sample_rate,
-                caption="Training Data",
-            )
-            log_wandb_audio_batch(
-                logger=self.logger,
-                id="target_audio",
-                samples=target_samples.cpu(),
-                sampling_rate=self.sample_rate,
-                caption="Target Data",
-            )
-            self.log_train_audio = False
-    def on_validation_batch_start(self, batch, batch_idx, dataloader_idx):
-        x, target, _, _ = batch
-        # Log Input Metrics
-        for metric in self.metrics:
-            # SISDR returns negative values, so negate them
-            if metric == "SISDR":
-                negate = -1
-            else:
-                negate = 1
-            # Only Log FAD on test set
-            if metric == "FAD":
-                continue
-            self.log(
-                f"Input_{metric}",
-                negate * self.metrics[metric](x, target),
-                on_step=False,
-                on_epoch=True,
-                logger=True,
-                prog_bar=True,
-                sync_dist=True,
-            )
-        # Only run on first batch
-        if batch_idx == 0:
-            self.model.eval()
-            with torch.no_grad():
-                y = self.model.sample(x)
-            # Concat samples together for easier viewing in dashboard
-            # 2 seconds of silence between each sample
-            silence = torch.zeros_like(x)
-            silence = silence[:, : self.sample_rate * 2]
-            concat_samples = torch.cat([y, silence, x, silence, target], dim=-1)
-            log_wandb_audio_batch(
-                logger=self.logger,
-                id="prediction_input_target",
-                samples=concat_samples.cpu(),
-                sampling_rate=self.sample_rate,
-                caption=f"Epoch {self.current_epoch}",
-            )
-            self.model.train()
-    def on_test_batch_start(self, batch, batch_idx, dataloader_idx):
-        self.on_validation_batch_start(batch, batch_idx, dataloader_idx)
-        # Log FAD
-        x, target, _, _ = batch
-        self.log(
-            "Input_FAD",
-            self.metrics["FAD"](x, target),
-            on_step=False,
-            on_epoch=True,
-            logger=True,
-            prog_bar=True,
-            sync_dist=True,
-        )
 class OpenUnmixModel(nn.Module):
     def __init__(
@@ -284,9 +191,10 @@ class DiffusionGenerationModel(nn.Module):
 class DPTNetModel(nn.Module):
-    def __init__(self, sample_rate, **kwargs):
         super().__init__()
         self.model = DPTNet_base(**kwargs)
         self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=sample_rate
         )
@@ -294,31 +202,42 @@ class DPTNetModel(nn.Module):
     def forward(self, batch):
         x, target = batch
-        output = self.model(x).squeeze(1)
         loss = self.mrstftloss(output, target) + self.l1loss(output, target) * 100
         return loss, output
     def sample(self, x: Tensor) -> Tensor:
-        return self.model.sample(x)
 class DCUNetModel(nn.Module):
-    def __init__(self, sample_rate, **kwargs):
         super().__init__()
         self.model = RefineSpectrogramUnet(**kwargs)
         self.mrstftloss = MultiResolutionSTFTLoss(
-            n_bins=self.num_bins, sample_rate=sample_rate
         )
         self.l1loss = nn.L1Loss()
     def forward(self, batch):
         x, target = batch
-        output = self.model(x).squeeze(1)
         loss = self.mrstftloss(output, target) + self.l1loss(output, target) * 100
         return loss, output
     def sample(self, x: Tensor) -> Tensor:
-        return self.model.sample(x)
 class FXClassifier(pl.LightningModule):

 import torchmetrics
 import pytorch_lightning as pl
 from torch import Tensor, nn
+from torch.nn import functional as F
 from torchaudio.models import HDemucs
 from audio_diffusion_pytorch import DiffusionModel
 from auraloss.time import SISDRLoss
 from auraloss.freq import MultiResolutionSTFTLoss
 from umx.openunmix.model import OpenUnmix, Separator
+from remfx.utils import FADLoss, spectrogram
+from remfx.dptnet import DPTNet_base
+from remfx.dcunet import RefineSpectrogramUnet
 class RemFX(pl.LightningModule):
             eps=self.lr_eps,
             weight_decay=self.lr_weight_decay,
         )
+        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
+            optimizer,
+            [0.8 * self.trainer.max_steps, 0.95 * self.trainer.max_steps],
+            gamma=0.1,
+        )
+        return {
+            "optimizer": optimizer,
+            "lr_scheduler": {
+                "scheduler": lr_scheduler,
+                "monitor": "val_loss",
+                "interval": "step",
+                "frequency": 1,
+            },
+        }
     def training_step(self, batch, batch_idx):
+        return self.common_step(batch, batch_idx, mode="train")
     def validation_step(self, batch, batch_idx):
+        return self.common_step(batch, batch_idx, mode="valid")
     def test_step(self, batch, batch_idx):
+        return self.common_step(batch, batch_idx, mode="test")
     def common_step(self, batch, batch_idx, mode: str = "train"):
         x, y, _, _ = batch  # x, y = (B, C, T), (B, C, T)
                     prog_bar=True,
                     sync_dist=True,
                 )
         return loss
 class OpenUnmixModel(nn.Module):
     def __init__(
 class DPTNetModel(nn.Module):
+    def __init__(self, sample_rate, num_bins, **kwargs):
         super().__init__()
         self.model = DPTNet_base(**kwargs)
+        self.num_bins = num_bins
         self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=sample_rate
         )
     def forward(self, batch):
         x, target = batch
+        output = self.model(x.squeeze(1))
         loss = self.mrstftloss(output, target) + self.l1loss(output, target) * 100
         return loss, output
     def sample(self, x: Tensor) -> Tensor:
+        return self.model(x.squeeze(1))
 class DCUNetModel(nn.Module):
+    """Wrap ``RefineSpectrogramUnet`` with a multi-resolution STFT + L1 loss.
+
+    Args:
+        sample_rate: Forwarded to ``MultiResolutionSTFTLoss``.
+        num_bins: Forwarded to ``MultiResolutionSTFTLoss`` as ``n_bins``.
+        **kwargs: Forwarded unchanged to ``RefineSpectrogramUnet``.
+    """
+    def __init__(self, sample_rate, num_bins, **kwargs):
         super().__init__()
         self.model = RefineSpectrogramUnet(**kwargs)
         self.mrstftloss = MultiResolutionSTFTLoss(
+            n_bins=num_bins, sample_rate=sample_rate
         )
         self.l1loss = nn.L1Loss()
     def forward(self, batch):
+        """Return ``(loss, output)`` for a ``(x, target)`` batch."""
         x, target = batch
+        output = self.model(x.squeeze(1)).unsqueeze(1)  # B x 1 x T
+        # Pad or crop the last (time) axis to match target.
+        # Cropping must use ``...``: ``output[:, :n]`` would slice the
+        # singleton channel axis and leave the length unchanged.
+        if output.shape[-1] > target.shape[-1]:
+            output = output[..., : target.shape[-1]]
+        elif output.shape[-1] < target.shape[-1]:
+            output = F.pad(output, (0, target.shape[-1] - output.shape[-1]))
         loss = self.mrstftloss(output, target) + self.l1loss(output, target) * 100
         return loss, output
     def sample(self, x: Tensor) -> Tensor:
+        """Run the network on ``x`` and return output with the same last-axis length."""
+        output = self.model(x.squeeze(1)).unsqueeze(1)  # B x 1 x T
+        # Pad or crop the last (time) axis to match the input length
+        if output.shape[-1] > x.shape[-1]:
+            output = output[..., : x.shape[-1]]
+        elif output.shape[-1] < x.shape[-1]:
+            output = F.pad(output, (0, x.shape[-1] - output.shape[-1]))
+        return output
 class FXClassifier(pl.LightningModule):

remfx/utils.py CHANGED Viewed

@@ -7,10 +7,8 @@ from frechet_audio_distance import FrechetAudioDistance
 import numpy as np
 import torch
 import torchaudio
-from torch import Tensor, nn
-import wandb
-from einops import rearrange
-from torch._six import container_abcs
 def get_logger(name=__name__) -> logging.Logger:
@@ -144,30 +142,6 @@ def create_sequential_chunks(
     return chunks, sr
-def log_wandb_audio_batch(
-    logger: pl.loggers.WandbLogger,
-    id: str,
-    samples: Tensor,
-    sampling_rate: int,
-    caption: str = "",
-    max_items: int = 10,
-):
-    num_items = samples.shape[0]
-    samples = rearrange(samples, "b c t -> b t c")
-    for idx in range(num_items):
-        if idx >= max_items:
-            break
-        logger.experiment.log(
-            {
-                f"{id}_{idx}": wandb.Audio(
-                    samples[idx].cpu().numpy(),
-                    caption=caption,
-                    sample_rate=sampling_rate,
-                )
-            }
-        )
 def spectrogram(
     x: torch.Tensor,
     window: torch.Tensor,
@@ -209,7 +183,7 @@ def init_bn(bn):
 def _ntuple(n: int):
     def parse(x):
-        if isinstance(x, container_abcs.Iterable):
             return x
         return tuple([x] * n)

 import numpy as np
 import torch
 import torchaudio
+from torch import nn
+import collections.abc
 def get_logger(name=__name__) -> logging.Logger:
     return chunks, sr
 def spectrogram(
     x: torch.Tensor,
     window: torch.Tensor,
 def _ntuple(n: int):
     def parse(x):
+        if isinstance(x, collections.abc.Iterable):
             return x
         return tuple([x] * n)