Spaces:

mattricesound
/

RemFx

Runtime error

App Files Files Community

mattricesound committed on Mar 16, 2023

Commit

a5db556

2 Parent(s): e4fc05d 0e3a05d

Merge pull request #34 from mhrice/new-networks

Browse files

Files changed (20) hide show

cfg/config.yaml +11 -2
cfg/effects/all.yaml +10 -1
cfg/exp/default.yaml +6 -1
cfg/model/audio_diffusion.yaml +3 -3
cfg/model/classifier.yaml +1 -1
cfg/model/dcunet.yaml +24 -0
cfg/model/demucs.yaml +1 -2
cfg/model/dptnet.yaml +20 -0
cfg/model/tcn.yaml +27 -0
cfg/model/umx.yaml +2 -3
remfx/callbacks.py +131 -0
remfx/cnn14.py +138 -0
remfx/datasets.py +4 -4
remfx/dcunet.py +649 -0
remfx/dptnet.py +459 -0
remfx/effects.py +1 -1
remfx/models.py +82 -305
remfx/tcn.py +143 -0
remfx/utils.py +78 -0
scripts/test.py +0 -1

cfg/config.yaml CHANGED Viewed

@@ -11,22 +11,26 @@ logs_dir: "./logs"
 render_files: True
 render_root: "./data"
 accelerator: null
 max_kept_effects: -1
 max_removed_effects: -1
 shuffle_kept_effects: True
 shuffle_removed_effects: False
-num_classes: 4
 effects_to_use:
   - compressor
   - distortion
   - reverb
   - chorus
 effects_to_remove:
   - compressor
   - distortion
   - reverb
   - chorus
 callbacks:
   model_checkpoint:
@@ -41,6 +45,12 @@ callbacks:
   learning_rate_monitor:
     _target_: pytorch_lightning.callbacks.LearningRateMonitor
     logging_interval: "step"
 datamodule:
   _target_: remfx.datasets.VocalSetDatamodule
@@ -116,4 +126,3 @@ trainer:
   devices: 1
   gradient_clip_val: 10.0
   max_steps: 50000

 render_files: True
 render_root: "./data"
 accelerator: null
+log_audio: True
+# Effects
 max_kept_effects: -1
 max_removed_effects: -1
 shuffle_kept_effects: True
 shuffle_removed_effects: False
+num_classes: 5
 effects_to_use:
   - compressor
   - distortion
   - reverb
   - chorus
+  - delay
 effects_to_remove:
   - compressor
   - distortion
   - reverb
   - chorus
+  - delay
 callbacks:
   model_checkpoint:
   learning_rate_monitor:
     _target_: pytorch_lightning.callbacks.LearningRateMonitor
     logging_interval: "step"
+  audio_logging:
+    _target_: remfx.callbacks.AudioCallback
+    sample_rate: ${sample_rate}
+    log_audio: ${log_audio}
+  metric_logging:
+    _target_: remfx.callbacks.MetricCallback
 datamodule:
   _target_: remfx.datasets.VocalSetDatamodule
   devices: 1
   gradient_clip_val: 10.0
   max_steps: 50000

cfg/effects/all.yaml CHANGED Viewed

@@ -28,4 +28,13 @@ effects:
       min_wet_dry: 0.2
       max_wet_dry: 0.8
       min_width: 0.2
-      max_width: 1.0

       min_wet_dry: 0.2
       max_wet_dry: 0.8
       min_width: 0.2
+      max_width: 1.0
+  delay:
+      _target_: remfx.effects.RandomPedalboardDelay
+      sample_rate: ${sample_rate}
+      min_delay_seconds: 0.1
+      max_delay_sconds: 1.0
+      min_feedback: 0.05
+      max_feedback: 0.6
+      min_mix: 0.2
+      max_mix: 0.7

cfg/exp/default.yaml CHANGED Viewed

@@ -9,20 +9,25 @@ logs_dir: "./logs"
 render_files: True
 render_root: "./data"
 accelerator: null
 max_kept_effects: -1
 max_removed_effects: -1
 shuffle_kept_effects: True
-shuffle_removed_effects: True
 effects_to_use:
   - compressor
   - distortion
   - reverb
   - chorus
 effects_to_remove:
   - compressor
   - distortion
   - reverb
   - chorus
 datamodule:
   batch_size: 16
   num_workers: 8

 render_files: True
 render_root: "./data"
 accelerator: null
+log_audio: True
+# Effects
 max_kept_effects: -1
 max_removed_effects: -1
 shuffle_kept_effects: True
+shuffle_removed_effects: False
+num_classes: 5
 effects_to_use:
   - compressor
   - distortion
   - reverb
   - chorus
+  - delay
 effects_to_remove:
   - compressor
   - distortion
   - reverb
   - chorus
+  - delay
 datamodule:
   batch_size: 16
   num_workers: 8

cfg/model/audio_diffusion.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
-model:
-  _target_: remfx.models.RemFXModel
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
@@ -13,4 +13,4 @@ model:
 datamodule:
   dataset:
     effect_types: ["Clean"]
-  batch_size: 2

 # @package _global_
+model:
+  _target_: remfx.models.RemFX
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
 datamodule:
   dataset:
     effect_types: ["Clean"]
+  batch_size: 2

cfg/model/classifier.yaml CHANGED Viewed

@@ -5,7 +5,7 @@ model:
   lr_weight_decay: 1e-3
   sample_rate: ${sample_rate}
   network:
-    _target_: remfx.models.Cnn14
     num_classes: ${num_classes}
     n_fft: 4096
     hop_length: 512

   lr_weight_decay: 1e-3
   sample_rate: ${sample_rate}
   network:
+    _target_: remfx.cnn14.Cnn14
     num_classes: ${num_classes}
     n_fft: 4096
     hop_length: 512

cfg/model/dcunet.yaml ADDED Viewed

	@@ -0,0 +1,24 @@

+# @package _global_
+model:
+  _target_: remfx.models.RemFX
+  lr: 1e-4
+  lr_beta1: 0.95
+  lr_beta2: 0.999
+  lr_eps: 1e-6
+  lr_weight_decay: 1e-3
+  sample_rate: ${sample_rate}
+  network:
+    _target_: remfx.models.DCUNetModel
+    spec_dim: 257
+    hidden_dim: 768
+    filter_len: 512
+    hop_len: 64
+    block_layers: 4
+    layers: 4
+    kernel_size: 3
+    refine_layers: 1
+    is_mask: True
+    norm: 'ins'
+    act: 'comp'
+    sample_rate: ${sample_rate}
+    num_bins: 1025

cfg/model/demucs.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
 model:
-  _target_: remfx.models.RemFXModel
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
@@ -13,4 +13,3 @@ model:
     audio_channels: 1
     nfft: 4096
     sample_rate: ${sample_rate}

 # @package _global_
 model:
+  _target_: remfx.models.RemFX
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
     audio_channels: 1
     nfft: 4096
     sample_rate: ${sample_rate}

cfg/model/dptnet.yaml ADDED Viewed

	@@ -0,0 +1,20 @@

+# @package _global_
+model:
+  _target_: remfx.models.RemFX
+  lr: 1e-4
+  lr_beta1: 0.95
+  lr_beta2: 0.999
+  lr_eps: 1e-6
+  lr_weight_decay: 1e-3
+  sample_rate: ${sample_rate}
+  network:
+    _target_: remfx.models.DPTNetModel
+    enc_dim: 256
+    feature_dim: 64
+    hidden_dim: 128
+    layer: 6
+    segment_size: 250
+    nspk: 1
+    win_len: 2
+    sample_rate: ${sample_rate}
+    num_bins: 1025

cfg/model/tcn.yaml ADDED Viewed

	@@ -0,0 +1,27 @@

+# @package _global_
+model:
+  _target_: remfx.models.RemFX
+  lr: 1e-4
+  lr_beta1: 0.95
+  lr_beta2: 0.999
+  lr_eps: 1e-6
+  lr_weight_decay: 1e-3
+  sample_rate: ${sample_rate}
+  network:
+    _target_: remfx.models.TCNModel
+    ninputs: 1
+    noutputs: 1
+    nblocks: 4
+    channel_growth: 0
+    channel_width: 32
+    kernel_size: 13
+    stack_size: 10
+    dilation_growth: 10
+    condition: False
+    latent_dim: 2
+    norm_type: "identity"
+    causal: False
+    estimate_loudness: False
+    sample_rate: ${sample_rate}
+    num_bins: 1025

cfg/model/umx.yaml CHANGED Viewed

@@ -1,6 +1,6 @@
 # @package _global_
-model:
-  _target_: remfx.models.RemFXModel
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
@@ -14,4 +14,3 @@ model:
     n_channels: 1
     alpha: 0.3
     sample_rate: ${sample_rate}

 # @package _global_
+model:
+  _target_: remfx.models.RemFX
   lr: 1e-4
   lr_beta1: 0.95
   lr_beta2: 0.999
     n_channels: 1
     alpha: 0.3
     sample_rate: ${sample_rate}

remfx/callbacks.py ADDED Viewed

	@@ -0,0 +1,131 @@

+from pytorch_lightning.callbacks import Callback
+import pytorch_lightning as pl
+from einops import rearrange
+import torch
+import wandb
+from torch import Tensor
+class AudioCallback(Callback):
+    def __init__(self, sample_rate, log_audio, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.log_audio = log_audio
+        self.log_train_audio = True
+        self.sample_rate = sample_rate
+        if not self.log_audio:
+            self.log_train_audio = False
+    def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
+        # Log initial audio
+        if self.log_train_audio:
+            x, y, _, _ = batch
+            # Concat samples together for easier viewing in dashboard
+            input_samples = rearrange(x, "b c t -> c (b t)").unsqueeze(0)
+            target_samples = rearrange(y, "b c t -> c (b t)").unsqueeze(0)
+            log_wandb_audio_batch(
+                logger=trainer.logger,
+                id="input_effected_audio",
+                samples=input_samples.cpu(),
+                sampling_rate=self.sample_rate,
+                caption="Training Data",
+            )
+            log_wandb_audio_batch(
+                logger=trainer.logger,
+                id="target_audio",
+                samples=target_samples.cpu(),
+                sampling_rate=self.sample_rate,
+                caption="Target Data",
+            )
+            self.log_train_audio = False
+    def on_validation_batch_start(
+        self, trainer, pl_module, batch, batch_idx, dataloader_idx
+    ):
+        x, target, _, _ = batch
+        # Only run on first batch
+        if batch_idx == 0 and self.log_audio:
+            with torch.no_grad():
+                y = pl_module.model.sample(x)
+            # Concat samples together for easier viewing in dashboard
+            # 2 seconds of silence between each sample
+            silence = torch.zeros_like(x)
+            silence = silence[:, : self.sample_rate * 2]
+            concat_samples = torch.cat([y, silence, x, silence, target], dim=-1)
+            log_wandb_audio_batch(
+                logger=trainer.logger,
+                id="prediction_input_target",
+                samples=concat_samples.cpu(),
+                sampling_rate=self.sample_rate,
+                caption=f"Epoch {trainer.current_epoch}",
+            )
+    def on_test_batch_start(self, *args):
+        self.on_validation_batch_start(*args)
+class MetricCallback(Callback):
+    def on_validation_batch_start(
+        self, trainer, pl_module, batch, batch_idx, dataloader_idx
+    ):
+        x, target, _, _ = batch
+        # Log Input Metrics
+        for metric in pl_module.metrics:
+            # SISDR returns negative values, so negate them
+            if metric == "SISDR":
+                negate = -1
+            else:
+                negate = 1
+            # Only Log FAD on test set
+            if metric == "FAD":
+                continue
+            pl_module.log(
+                f"Input_{metric}",
+                negate * pl_module.metrics[metric](x, target),
+                on_step=False,
+                on_epoch=True,
+                logger=True,
+                prog_bar=True,
+                sync_dist=True,
+            )
+    def on_test_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx):
+        self.on_validation_batch_start(
+            trainer, pl_module, batch, batch_idx, dataloader_idx
+        )
+        # Log FAD
+        x, target, _, _ = batch
+        pl_module.log(
+            "Input_FAD",
+            pl_module.metrics["FAD"](x, target),
+            on_step=False,
+            on_epoch=True,
+            logger=True,
+            prog_bar=True,
+            sync_dist=True,
+        )
+def log_wandb_audio_batch(
+    logger: pl.loggers.WandbLogger,
+    id: str,
+    samples: Tensor,
+    sampling_rate: int,
+    caption: str = "",
+    max_items: int = 10,
+):
+    num_items = samples.shape[0]
+    samples = rearrange(samples, "b c t -> b t c")
+    for idx in range(num_items):
+        if idx >= max_items:
+            break
+        logger.experiment.log(
+            {
+                f"{id}_{idx}": wandb.Audio(
+                    samples[idx].cpu().numpy(),
+                    caption=caption,
+                    sample_rate=sampling_rate,
+                )
+            }
+        )

remfx/cnn14.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import torch
+import torchaudio
+import torch.nn as nn
+import torch.nn.functional as F
+from utils import init_bn, init_layer
+# adapted from https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/models.py
+class Cnn14(nn.Module):
+    """Mel-spectrogram CNN classifier built from six ``ConvBlock`` stages.
+
+    The waveform is converted to a mel spectrogram, batch-normalised per mel
+    bin, passed through the conv stack, pooled over both remaining axes and
+    mapped to ``num_classes`` outputs by two linear layers.
+
+    NOTE(review): ``init_bn`` / ``init_layer`` come from a top-level ``utils``
+    import in this module, while sibling modules import from ``remfx.utils`` --
+    confirm that ``utils`` resolves when this module is loaded as
+    ``remfx.cnn14``.
+    """
+    def __init__(
+        self,
+        num_classes: int,
+        sample_rate: float,
+        n_fft: int = 2048,
+        hop_length: int = 512,
+        n_mels: int = 128,
+    ):
+        super().__init__()
+        self.num_classes = num_classes
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        # Registered as a buffer; it is not read anywhere else in this class.
+        window = torch.hann_window(n_fft)
+        self.register_buffer("window", window)
+        self.melspec = torchaudio.transforms.MelSpectrogram(
+            sample_rate,
+            n_fft,
+            hop_length=hop_length,
+            n_mels=n_mels,
+        )
+        # Normalises over the mel axis (see the permutes in forward()).
+        self.bn0 = nn.BatchNorm2d(n_mels)
+        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+        self.fc1 = nn.Linear(2048, 2048, bias=True)
+        self.fc_audioset = nn.Linear(2048, num_classes, bias=True)
+        self.init_weight()
+    def init_weight(self):
+        """Initialise the input batch-norm and both linear layers."""
+        init_bn(self.bn0)
+        init_layer(self.fc1)
+        init_layer(self.fc_audioset)
+    def forward(self, x: torch.Tensor):
+        """
+        Return un-normalised class scores for a batch of waveforms.
+
+        Input: the 4-axis permutes below require the mel spectrogram to be 4-D,
+        i.e. a 3-D waveform batch; ``conv_block1`` expects a single channel, so
+        presumably (batch_size, 1, data_length) -- TODO confirm against callers.
+        Output: (batch_size, num_classes)."""
+        x = self.melspec(x)
+        # Move the mel axis into the channel slot so bn0 normalises per mel bin,
+        # then move it back.
+        x = x.permute(0, 2, 1, 3)
+        x = self.bn0(x)
+        x = x.permute(0, 2, 1, 3)
+        if self.training:
+            pass
+            # x = self.spec_augmenter(x)
+        x = self.conv_block1(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block2(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block3(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block4(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        x = self.conv_block5(x, pool_size=(2, 2), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        # Last stage keeps its resolution (1x1 pooling).
+        x = self.conv_block6(x, pool_size=(1, 1), pool_type="avg")
+        x = F.dropout(x, p=0.2, training=self.training)
+        # Collapse the last axis by mean, then combine max and mean over the
+        # remaining non-channel axis.
+        x = torch.mean(x, dim=3)
+        (x1, _) = torch.max(x, dim=2)
+        x2 = torch.mean(x, dim=2)
+        x = x1 + x2
+        x = F.dropout(x, p=0.5, training=self.training)
+        x = F.relu_(self.fc1(x))
+        clipwise_output = self.fc_audioset(x)
+        return clipwise_output
+class ConvBlock(nn.Module):
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock, self).__init__()
+        self.conv1 = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1),
+            bias=False,
+        )
+        self.conv2 = nn.Conv2d(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1),
+            bias=False,
+        )
+        self.bn1 = nn.BatchNorm2d(out_channels)
+        self.bn2 = nn.BatchNorm2d(out_channels)
+        self.init_weight()
+    def init_weight(self):
+        init_layer(self.conv1)
+        init_layer(self.conv2)
+        init_bn(self.bn1)
+        init_bn(self.bn2)
+    def forward(self, input, pool_size=(2, 2), pool_type="avg"):
+        x = input
+        x = F.relu_(self.bn1(self.conv1(x)))
+        x = F.relu_(self.bn2(self.conv2(x)))
+        if pool_type == "max":
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == "avg":
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == "avg+max":
+            x1 = F.avg_pool2d(x, kernel_size=pool_size)
+            x2 = F.max_pool2d(x, kernel_size=pool_size)
+            x = x1 + x2
+        else:
+            raise Exception("Incorrect argument!")
+        return x

remfx/datasets.py CHANGED Viewed

@@ -5,7 +5,6 @@ import torch
 import shutil
 import torchaudio
 import pytorch_lightning as pl
-import torch.nn.functional as F
 from tqdm import tqdm
 from pathlib import Path
@@ -224,10 +223,10 @@ class VocalSet(Dataset):
             effect_indices = torch.arange(len(self.effects_to_remove))
         # Up to max_removed_effects
         if self.max_removed_effects != -1:
-            num_kept_effects = int(torch.rand(1).item() * (self.max_removed_effects))
         else:
-            num_kept_effects = len(self.effects_to_remove)
-        effect_indices = effect_indices[: self.max_removed_effects]
         # Index in effect settings
         effect_names_to_apply = [self.effects_to_remove[i] for i in effect_indices]
         effects_to_apply = [self.effects[i] for i in effect_names_to_apply]
@@ -250,6 +249,7 @@ class VocalSet(Dataset):
         # Normalize
         normalized_dry = self.normalize(dry)
         normalized_wet = self.normalize(wet)
         return normalized_dry, normalized_wet, dry_labels_tensor, wet_labels_tensor

 import shutil
 import torchaudio
 import pytorch_lightning as pl
 from tqdm import tqdm
 from pathlib import Path
             effect_indices = torch.arange(len(self.effects_to_remove))
         # Up to max_removed_effects
         if self.max_removed_effects != -1:
+            num_removed_effects = int(torch.rand(1).item() * (self.max_removed_effects))
         else:
+            num_removed_effects = len(self.effects_to_remove)
+        effect_indices = effect_indices[:num_removed_effects]
         # Index in effect settings
         effect_names_to_apply = [self.effects_to_remove[i] for i in effect_indices]
         effects_to_apply = [self.effects[i] for i in effect_names_to_apply]
         # Normalize
         normalized_dry = self.normalize(dry)
         normalized_wet = self.normalize(wet)
         return normalized_dry, normalized_wet, dry_labels_tensor, wet_labels_tensor

remfx/dcunet.py ADDED Viewed

	@@ -0,0 +1,649 @@

+# Adapted from https://github.com/AppleHolic/source_separation/tree/master/source_separation
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from torch.nn.init import calculate_gain
+from typing import Tuple
+from scipy.signal import get_window
+from librosa.util import pad_center
+from remfx.utils import single, concat_complex
+class ComplexConvBlock(nn.Module):
+    """
+    Convolution block
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        padding: int = 0,
+        layers: int = 4,
+        bn_func=nn.BatchNorm1d,
+        act_func=nn.LeakyReLU,
+        skip_res: bool = False,
+    ):
+        super().__init__()
+        # modules
+        self.blocks = nn.ModuleList()
+        self.skip_res = skip_res
+        for idx in range(layers):
+            in_ = in_channels if idx == 0 else out_channels
+            self.blocks.append(
+                nn.Sequential(
+                    *[
+                        bn_func(in_),
+                        act_func(),
+                        ComplexConv1d(in_, out_channels, kernel_size, padding=padding),
+                    ]
+                )
+            )
+    def forward(self, x: torch.tensor) -> torch.tensor:
+        temp = x
+        for idx, block in enumerate(self.blocks):
+            x = block(x)
+        if temp.size() != x.size() or self.skip_res:
+            return x
+        else:
+            return x + temp
+class SpectrogramUnet(nn.Module):
+    def __init__(
+        self,
+        spec_dim: int,
+        hidden_dim: int,
+        filter_len: int,
+        hop_len: int,
+        layers: int = 3,
+        block_layers: int = 3,
+        kernel_size: int = 5,
+        is_mask: bool = False,
+        norm: str = "bn",
+        act: str = "tanh",
+    ):
+        super().__init__()
+        self.layers = layers
+        self.is_mask = is_mask
+        # stft modules
+        self.stft = STFT(filter_len, hop_len)
+        if norm == "bn":
+            self.bn_func = nn.BatchNorm1d
+        elif norm == "ins":
+            self.bn_func = lambda x: nn.InstanceNorm1d(x, affine=True)
+        else:
+            raise NotImplementedError("{} is not implemented !".format(norm))
+        if act == "tanh":
+            self.act_func = nn.Tanh
+            self.act_out = nn.Tanh
+        elif act == "comp":
+            self.act_func = ComplexActLayer
+            self.act_out = lambda: ComplexActLayer(is_out=True)
+        else:
+            raise NotImplementedError("{} is not implemented !".format(act))
+        # prev conv
+        self.prev_conv = ComplexConv1d(spec_dim * 2, hidden_dim, 1)
+        # down
+        self.down = nn.ModuleList()
+        self.down_pool = nn.MaxPool1d(3, stride=2, padding=1)
+        for idx in range(self.layers):
+            block = ComplexConvBlock(
+                hidden_dim,
+                hidden_dim,
+                kernel_size=kernel_size,
+                padding=kernel_size // 2,
+                bn_func=self.bn_func,
+                act_func=self.act_func,
+                layers=block_layers,
+            )
+            self.down.append(block)
+        # up
+        self.up = nn.ModuleList()
+        for idx in range(self.layers):
+            in_c = hidden_dim if idx == 0 else hidden_dim * 2
+            self.up.append(
+                nn.Sequential(
+                    ComplexConvBlock(
+                        in_c,
+                        hidden_dim,
+                        kernel_size=kernel_size,
+                        padding=kernel_size // 2,
+                        bn_func=self.bn_func,
+                        act_func=self.act_func,
+                        layers=block_layers,
+                    ),
+                    self.bn_func(hidden_dim),
+                    self.act_func(),
+                    ComplexTransposedConv1d(
+                        hidden_dim, hidden_dim, kernel_size=2, stride=2
+                    ),
+                )
+            )
+        # out_conv
+        self.out_conv = nn.Sequential(
+            ComplexConvBlock(
+                hidden_dim * 2,
+                spec_dim * 2,
+                kernel_size=kernel_size,
+                padding=kernel_size // 2,
+                bn_func=self.bn_func,
+                act_func=self.act_func,
+            ),
+            self.bn_func(spec_dim * 2),
+            self.act_func(),
+        )
+        # refine conv
+        self.refine_conv = nn.Sequential(
+            ComplexConvBlock(
+                spec_dim * 4,
+                spec_dim * 2,
+                kernel_size=kernel_size,
+                padding=kernel_size // 2,
+                bn_func=self.bn_func,
+                act_func=self.act_func,
+            ),
+            self.bn_func(spec_dim * 2),
+            self.act_func(),
+        )
+    def log_stft(self, wav):
+        # stft
+        mag, phase = self.stft.transform(wav)
+        return torch.log(mag + 1), phase
+    def exp_istft(self, log_mag, phase):
+        # exp
+        mag = np.e**log_mag - 1
+        # istft
+        wav = self.stft.inverse(mag, phase)
+        return wav
+    def adjust_diff(self, x, target):
+        size_diff = target.size()[-1] - x.size()[-1]
+        assert size_diff >= 0
+        if size_diff > 0:
+            x = F.pad(
+                x.unsqueeze(1), (size_diff // 2, size_diff // 2), "reflect"
+            ).squeeze(1)
+        return x
+    def masking(self, mag, phase, origin_mag, origin_phase):
+        abs_mag = torch.abs(mag)
+        mag_mask = torch.tanh(abs_mag)
+        phase_mask = mag / abs_mag
+        # masking
+        mag = mag_mask * origin_mag
+        phase = phase_mask * (origin_phase + phase)
+        return mag, phase
+    def forward(self, wav):
+        # stft
+        origin_mag, origin_phase = self.log_stft(wav)
+        origin_x = torch.cat([origin_mag, origin_phase], dim=1)
+        # prev
+        x = self.prev_conv(origin_x)
+        # body
+        # down
+        down_cache = []
+        for idx, block in enumerate(self.down):
+            x = block(x)
+            down_cache.append(x)
+            x = self.down_pool(x)
+        # up
+        for idx, block in enumerate(self.up):
+            x = block(x)
+            res = F.interpolate(
+                down_cache[self.layers - (idx + 1)],
+                size=[x.size()[2]],
+                mode="linear",
+                align_corners=False,
+            )
+            x = concat_complex(x, res, dim=1)
+        # match spec dimension
+        x = self.out_conv(x)
+        if origin_mag.size(2) != x.size(2):
+            x = F.interpolate(
+                x, size=[origin_mag.size(2)], mode="linear", align_corners=False
+            )
+        # refine
+        x = self.refine_conv(concat_complex(x, origin_x))
+        def to_wav(stft):
+            mag, phase = stft.chunk(2, 1)
+            if self.is_mask:
+                mag, phase = self.masking(mag, phase, origin_mag, origin_phase)
+            out = self.exp_istft(mag, phase)
+            out = self.adjust_diff(out, wav)
+            return out
+        refine_wav = to_wav(x)
+        return refine_wav
+class RefineSpectrogramUnet(SpectrogramUnet):
+    """``SpectrogramUnet`` variant that refines the output in ``refine_layers``
+    masked passes instead of a single refinement over the concatenated input.
+
+    With ``add_spec_results`` set, ``forward`` returns ``(wav, mag, phase)``
+    instead of only the waveform.
+    """
+    def __init__(
+        self,
+        spec_dim: int,
+        hidden_dim: int,
+        filter_len: int,
+        hop_len: int,
+        layers: int = 4,
+        block_layers: int = 4,
+        kernel_size: int = 3,
+        is_mask: bool = True,
+        norm: str = "ins",
+        act: str = "comp",
+        refine_layers: int = 1,
+        add_spec_results: bool = False,
+    ):
+        super().__init__(
+            spec_dim,
+            hidden_dim,
+            filter_len,
+            hop_len,
+            layers,
+            block_layers,
+            kernel_size,
+            is_mask,
+            norm,
+            act,
+        )
+        self.add_spec_results = add_spec_results
+        # refine conv: replaces the parent's refine_conv; here each pass maps
+        # spec_dim * 2 -> spec_dim * 2 channels.
+        # NOTE(review): `[module] * refine_layers` repeats the SAME module
+        # object, so for refine_layers > 1 every pass shares one set of
+        # weights -- confirm this is intended.
+        self.refine_conv = nn.ModuleList(
+            [
+                nn.Sequential(
+                    ComplexConvBlock(
+                        spec_dim * 2,
+                        spec_dim * 2,
+                        kernel_size=kernel_size,
+                        padding=kernel_size // 2,
+                        bn_func=self.bn_func,
+                        act_func=self.act_func,
+                    ),
+                    self.bn_func(spec_dim * 2),
+                    self.act_func(),
+                )
+            ]
+            * refine_layers
+        )
+    def forward(self, wav):
+        """Map a waveform to a processed waveform (plus mag and phase when
+        ``add_spec_results`` is set)."""
+        # stft
+        origin_mag, origin_phase = self.log_stft(wav)
+        origin_x = torch.cat([origin_mag, origin_phase], dim=1)
+        # prev
+        x = self.prev_conv(origin_x)
+        # body
+        # down
+        down_cache = []
+        for idx, block in enumerate(self.down):
+            x = block(x)
+            down_cache.append(x)
+            x = self.down_pool(x)
+        # up
+        for idx, block in enumerate(self.up):
+            x = block(x)
+            # Skip connection from the matching down block, resampled to the
+            # current length.
+            res = F.interpolate(
+                down_cache[self.layers - (idx + 1)],
+                size=[x.size()[2]],
+                mode="linear",
+                align_corners=False,
+            )
+            x = concat_complex(x, res, dim=1)
+        # match spec dimension
+        x = self.out_conv(x)
+        if origin_mag.size(2) != x.size(2):
+            x = F.interpolate(
+                x, size=[origin_mag.size(2)], mode="linear", align_corners=False
+            )
+        # refine: masking is always applied here, regardless of self.is_mask
+        for idx, refine_module in enumerate(self.refine_conv):
+            x = refine_module(x)
+            mag, phase = x.chunk(2, 1)
+            mag, phase = self.masking(mag, phase, origin_mag, origin_phase)
+            # Feed the masked result into the next pass (not needed after the last)
+            if idx < len(self.refine_conv) - 1:
+                x = torch.cat([mag, phase], dim=1)
+        # clamp phase
+        phase = phase.clamp(-np.pi, np.pi)
+        out = self.exp_istft(mag, phase)
+        out = self.adjust_diff(out, wav)
+        if self.add_spec_results:
+            out = (out, mag, phase)
+        return out
+class _ComplexConvNd(nn.Module):
+    """
+    Base class for the complex convolutions: stores the configuration and
+    owns the two weight tensors.
+    A: real weight
+    B: imaginary weight
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation,
+        transposed,
+        output_padding,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.output_padding = output_padding
+        # Must be set before make_weight(), which reads it.
+        self.transposed = transposed
+        self.A = self.make_weight(in_channels, out_channels, kernel_size)
+        self.B = self.make_weight(in_channels, out_channels, kernel_size)
+        self.reset_parameters()
+    def make_weight(self, in_ch, out_ch, kernel_size):
+        """Allocate an uninitialised weight parameter.
+
+        Shape is (in_ch, out_ch // 2, *kernel_size) when transposed, otherwise
+        (out_ch, in_ch // 2, *kernel_size); the halved axis matches the
+        ``groups=2`` convolutions used by the subclasses. ``kernel_size`` is
+        star-unpacked, so it must be a sequence.
+        """
+        if self.transposed:
+            tensor = nn.Parameter(torch.Tensor(in_ch, out_ch // 2, *kernel_size))
+        else:
+            tensor = nn.Parameter(torch.Tensor(out_ch, in_ch // 2, *kernel_size))
+        return tensor
+    def reset_parameters(self):
+        """Fill A and B with uniform random values (A first, then B)."""
+        # init real weight
+        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.A)
+        # init A
+        gain = calculate_gain("leaky_relu", 0)
+        std = gain / np.sqrt(fan_in)
+        bound = np.sqrt(3.0) * std
+        with torch.no_grad():
+            # TODO: find more stable initial values
+            # A uses the fan-in based bound, scaled down by 1 / pi**2
+            self.A.uniform_(-bound * (1 / (np.pi**2)), bound * (1 / (np.pi**2)))
+            #
+            # B is initialised from pi:
+            # the range [-pi, pi] is too large, so its reciprocal [-1/pi, 1/pi] is used
+            self.B.uniform_(-1 / np.pi, 1 / np.pi)
+class ComplexConv1d(_ComplexConvNd):
+    """
+    Complex Convolution 1d
+    """
+    def __init__(
+        self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1
+    ):
+        kernel_size = single(kernel_size)
+        stride = single(stride)
+        # edit padding
+        padding = padding
+        dilation = single(dilation)
+        super(ComplexConv1d, self).__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            False,
+            single(0),
+        )
+    def forward(self, x):
+        """
+        Implemented complex convolution using combining 'grouped convolution' and
+        'real / img weight'
+        :param x: data (N, C, T) C is concatenated with C/2 real channels and C/2 idea channels
+        :return: complex conved result
+        """
+        # adopt reflect padding
+        if self.padding:
+            x = F.pad(x, (self.padding, self.padding), "reflect")
+        # forward real
+        real_part = F.conv1d(
+            x,
+            self.A,
+            None,
+            stride=self.stride,
+            padding=0,
+            dilation=self.dilation,
+            groups=2,
+        )
+        # forward idea
+        spl = self.in_channels // 2
+        weight_B = torch.cat([self.B[:spl].data * (-1), self.B[spl:].data])
+        idea_part = F.conv1d(
+            x,
+            weight_B,
+            None,
+            stride=self.stride,
+            padding=0,
+            dilation=self.dilation,
+            groups=2,
+        )
+        return real_part + idea_part
+class ComplexTransposedConv1d(_ComplexConvNd):
+    """
+    Complex Transposed Convolution 1d
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride=1,
+        padding=0,
+        output_padding=0,
+        dilation=1,
+    ):
+        kernel_size = single(kernel_size)
+        stride = single(stride)
+        padding = padding
+        dilation = single(dilation)
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            True,
+            output_padding,
+        )
+    def forward(self, x, output_size=None):
+        """
+        Implemented complex transposed convolution using combining 'grouped convolution'
+        and 'real / img weight'
+        :param x: data (N, C, T) C is concatenated with C/2 real channels and C/2 idea channels
+        :return: complex transposed convolution result
+        """
+        # forward real
+        if self.padding:
+            x = F.pad(x, (self.padding, self.padding), "reflect")
+        real_part = F.conv_transpose1d(
+            x,
+            self.A,
+            None,
+            stride=self.stride,
+            padding=0,
+            dilation=self.dilation,
+            groups=2,
+        )
+        # forward idea
+        spl = self.out_channels // 2
+        weight_B = torch.cat([self.B[:spl] * (-1), self.B[spl:]])
+        idea_part = F.conv_transpose1d(
+            x,
+            weight_B,
+            None,
+            stride=self.stride,
+            padding=0,
+            dilation=self.dilation,
+            groups=2,
+        )
+        if self.output_padding:
+            real_part = F.pad(
+                real_part, (self.output_padding, self.output_padding), "reflect"
+            )
+            idea_part = F.pad(
+                idea_part, (self.output_padding, self.output_padding), "reflect"
+            )
+        return real_part + idea_part
+class ComplexActLayer(nn.Module):
+    """
+    Activation differently 'real' part and 'img' part
+    In implemented DCUnet on this repository, Real part is activated to log space.
+    And Phase(img) part, it is distributed in [-pi, pi]...
+    """
+    def forward(self, x):
+        real, img = x.chunk(2, 1)
+        return torch.cat([F.leaky_relu(real), torch.tanh(img) * np.pi], dim=1)
+class STFT(nn.Module):
+    """
+    STFT re-implemented with 1-D convolutions so that it supports backpropagation.
+    Based on: https://github.com/pseeth/torch-stft/blob/master/torch_stft/stft.py
+    The forward and inverse Fourier bases are precomputed in ``__init__`` and stored as
+    buffers (``forward_basis``, ``inverse_basis``, ``square_window``).
+    """
+    def __init__(
+        self,
+        filter_length: int = 1024,
+        hop_length: int = 512,
+        win_length: int = None,
+        window: str = "hann",
+    ):
+        """
+        :param filter_length: FFT size; also the length of the convolution kernels
+        :param hop_length: stride between successive frames
+        :param win_length: window length; falls back to ``filter_length`` when falsy
+        :param window: window name passed to ``get_window``
+        """
+        super().__init__()
+        self.filter_length = filter_length
+        self.hop_length = hop_length
+        self.win_length = win_length if win_length else filter_length
+        self.window = window
+        # amount of reflect padding added to each side of the waveform in transform()
+        self.pad_amount = self.filter_length // 2
+        # make fft window
+        assert filter_length >= self.win_length
+        # get window and zero center pad it to filter_length
+        # (get_window / pad_center are imported elsewhere in this file)
+        fft_window = get_window(window, self.win_length, fftbins=True)
+        fft_window = pad_center(fft_window, filter_length)
+        fft_window = torch.from_numpy(fft_window).float()
+        # calculate fourier_basis: DFT of the identity, keeping the first
+        # filter_length / 2 + 1 rows, with real rows stacked on top of imaginary rows
+        cut_off = int((self.filter_length / 2 + 1))
+        fourier_basis = np.fft.fft(np.eye(self.filter_length))
+        fourier_basis = np.vstack(
+            [np.real(fourier_basis[:cut_off, :]), np.imag(fourier_basis[:cut_off, :])]
+        )
+        # make forward & inverse basis
+        self.register_buffer("square_window", fft_window**2)
+        # windowed basis, shaped (2 * cut_off, 1, filter_length) for use as a conv1d weight
+        forward_basis = torch.FloatTensor(fourier_basis[:, np.newaxis, :]) * fft_window
+        # windowed pseudo-inverse of the (scaled) basis, used as a conv_transpose1d weight
+        inverse_basis = (
+            torch.FloatTensor(
+                np.linalg.pinv(self.filter_length / self.hop_length * fourier_basis).T[
+                    :, np.newaxis, :
+                ]
+            )
+            * fft_window
+        )
+        # torch.pinverse has a bug, so at this time, it is separated into two parts..
+        self.register_buffer("forward_basis", forward_basis)
+        self.register_buffer("inverse_basis", inverse_basis)
+    def transform(self, wav: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Compute magnitude and phase spectrograms of ``wav``.
+        :param wav: waveform batch -- assumes shape (batch, samples); TODO confirm
+        :return: tuple ``(magnitude, phase)``; the phase is computed from ``.data`` and
+            is therefore not part of the autograd graph
+        """
+        # reflect padding (two temporary axes are added so F.pad can reflect the last axis)
+        wav = wav.unsqueeze(1).unsqueeze(1)
+        wav = F.pad(
+            wav, (self.pad_amount, self.pad_amount, 0, 0), mode="reflect"
+        ).squeeze(1)
+        # conv: each output channel is the projection onto one basis row
+        forward_trans = F.conv1d(
+            wav, self.forward_basis, stride=self.hop_length, padding=0
+        )
+        # first half of the channels is the real part, second half the imaginary part
+        real_part, imag_part = forward_trans.chunk(2, 1)
+        return torch.sqrt(real_part**2 + imag_part**2), torch.atan2(
+            imag_part.data, real_part.data
+        )
+    def inverse(
+        self, magnitude: torch.Tensor, phase: torch.Tensor, eps: float = 1e-9
+    ) -> torch.Tensor:
+        """
+        Reconstruct a waveform from magnitude and phase spectrograms.
+        :param magnitude: magnitude spectrogram as returned by ``transform``
+        :param phase: phase spectrogram as returned by ``transform``
+        :param eps: added to the window-overlap envelope to avoid division by zero
+        :return: waveform with the padding added by ``transform`` removed
+        """
+        # back to rectangular form, real channels followed by imaginary channels
+        comp = torch.cat(
+            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
+        )
+        inverse_transform = F.conv_transpose1d(
+            comp, self.inverse_basis, stride=self.hop_length, padding=0
+        )
+        # remove window effect: overlap-add the squared window over all frames and
+        # divide the reconstruction by the resulting envelope
+        n_frames = comp.size(-1)
+        inverse_size = inverse_transform.size(-1)
+        window_filter = torch.ones(1, 1, n_frames).type_as(inverse_transform)
+        weight = self.square_window[: self.filter_length].unsqueeze(0).unsqueeze(0)
+        window_filter = F.conv_transpose1d(
+            window_filter, weight, stride=self.hop_length, padding=0
+        )
+        window_filter = window_filter.squeeze()[:inverse_size] + eps
+        inverse_transform /= window_filter
+        # scale by hop ratio
+        inverse_transform *= self.filter_length / self.hop_length
+        # drop the reflect padding added in transform() and the channel axis
+        return inverse_transform[..., self.pad_amount : -self.pad_amount].squeeze(1)

remfx/dptnet.py ADDED Viewed

	@@ -0,0 +1,459 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.modules.container import ModuleList
+from torch.nn.modules.activation import MultiheadAttention
+from torch.nn.modules.dropout import Dropout
+from torch.nn.modules.linear import Linear
+from torch.nn.modules.rnn import LSTM
+from torch.nn.modules.normalization import LayerNorm
+from torch.autograd import Variable
+import copy
+import math
+# adapted from https://github.com/ujscjj/DPTNet
+class DPTNet_base(nn.Module):
+    def __init__(
+        self,
+        enc_dim,
+        feature_dim,
+        hidden_dim,
+        layer,
+        segment_size=250,
+        nspk=2,
+        win_len=2,
+    ):
+        super().__init__()
+        # parameters
+        self.window = win_len
+        self.stride = self.window // 2
+        self.enc_dim = enc_dim
+        self.feature_dim = feature_dim
+        self.hidden_dim = hidden_dim
+        self.segment_size = segment_size
+        self.layer = layer
+        self.num_spk = nspk
+        self.eps = 1e-8
+        self.dpt_encoder = DPTEncoder(
+            n_filters=enc_dim,
+            window_size=win_len,
+        )
+        self.enc_LN = nn.GroupNorm(1, self.enc_dim, eps=1e-8)
+        self.dpt_separation = DPTSeparation(
+            self.enc_dim,
+            self.feature_dim,
+            self.hidden_dim,
+            self.num_spk,
+            self.layer,
+            self.segment_size,
+        )
+        self.mask_conv1x1 = nn.Conv1d(self.feature_dim, self.enc_dim, 1, bias=False)
+        self.decoder = DPTDecoder(n_filters=enc_dim, window_size=win_len)
+    def forward(self, mix):
+        """
+        mix: shape (batch, T)
+        """
+        batch_size = mix.shape[0]
+        mix = self.dpt_encoder(mix)  # (B, E, L)
+        score_ = self.enc_LN(mix)  # B, E, L
+        score_ = self.dpt_separation(score_)  # B, nspk, T, N
+        score_ = (
+            score_.view(batch_size * self.num_spk, -1, self.feature_dim)
+            .transpose(1, 2)
+            .contiguous()
+        )  # B*nspk, N, T
+        score = self.mask_conv1x1(score_)  # [B*nspk, N, L] -> [B*nspk, E, L]
+        score = score.view(
+            batch_size, self.num_spk, self.enc_dim, -1
+        )  # [B*nspk, E, L] -> [B, nspk, E, L]
+        est_mask = F.relu(score)
+        est_source = self.decoder(
+            mix, est_mask
+        )  # [B, E, L] + [B, nspk, E, L]--> [B, nspk, T]
+        return est_source
+class DPTEncoder(nn.Module):
+    def __init__(self, n_filters: int = 64, window_size: int = 2):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            1, n_filters, kernel_size=window_size, stride=window_size // 2, bias=False
+        )
+    def forward(self, x):
+        x = x.unsqueeze(1)
+        x = F.relu(self.conv(x))
+        return x
+class TransformerEncoderLayer(torch.nn.Module):
+    def __init__(
+        self, d_model, nhead, hidden_size, dim_feedforward, dropout, activation="relu"
+    ):
+        super(TransformerEncoderLayer, self).__init__()
+        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of improved part
+        self.lstm = LSTM(d_model, hidden_size, 1, bidirectional=True)
+        self.dropout = Dropout(dropout)
+        self.linear = Linear(hidden_size * 2, d_model)
+        self.norm1 = LayerNorm(d_model)
+        self.norm2 = LayerNorm(d_model)
+        self.dropout1 = Dropout(dropout)
+        self.dropout2 = Dropout(dropout)
+        self.activation = _get_activation_fn(activation)
+    def __setstate__(self, state):
+        if "activation" not in state:
+            state["activation"] = F.relu
+        super(TransformerEncoderLayer, self).__setstate__(state)
+    def forward(self, src, src_mask=None, src_key_padding_mask=None):
+        r"""Pass the input through the encoder layer.
+        Args:
+            src: the sequnce to the encoder layer (required).
+            src_mask: the mask for the src sequence (optional).
+            src_key_padding_mask: the mask for the src keys per batch (optional).
+        Shape:
+            see the docs in Transformer class.
+        """
+        src2 = self.self_attn(
+            src, src, src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask
+        )[0]
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        src2 = self.linear(self.dropout(self.activation(self.lstm(src)[0])))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+        return src
+def _get_clones(module, N):
+    return ModuleList([copy.deepcopy(module) for i in range(N)])
+def _get_activation_fn(activation):
+    if activation == "relu":
+        return F.relu
+    elif activation == "gelu":
+        return F.gelu
+    raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
+class SingleTransformer(nn.Module):
+    """
+    Container module for a single Transformer layer.
+    args: input_size: int, dimension of the input feature.
+    The input should have shape (batch, seq_len, input_size).
+    """
+    def __init__(self, input_size, hidden_size, dropout):
+        super(SingleTransformer, self).__init__()
+        self.transformer = TransformerEncoderLayer(
+            d_model=input_size,
+            nhead=4,
+            hidden_size=hidden_size,
+            dim_feedforward=hidden_size * 2,
+            dropout=dropout,
+        )
+    def forward(self, input):
+        # input shape: batch, seq, dim
+        output = input
+        transformer_output = (
+            self.transformer(output.permute(1, 0, 2).contiguous())
+            .permute(1, 0, 2)
+            .contiguous()
+        )
+        return transformer_output
+# dual-path transformer
+class DPT(nn.Module):
+    """
+    Deep dual-path transformer.
+    args:
+        input_size: int, dimension of the input feature. The input should have shape
+                    (batch, seq_len, input_size).
+        hidden_size: int, dimension of the hidden state.
+        output_size: int, dimension of the output size.
+        num_layers: int, number of stacked Transformer layers. Default is 1.
+        dropout: float, dropout ratio. Default is 0.
+    """
+    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout=0):
+        super(DPT, self).__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+        self.hidden_size = hidden_size
+        # dual-path transformer
+        self.row_transformer = nn.ModuleList([])
+        self.col_transformer = nn.ModuleList([])
+        for i in range(num_layers):
+            self.row_transformer.append(
+                SingleTransformer(input_size, hidden_size, dropout)
+            )
+            self.col_transformer.append(
+                SingleTransformer(input_size, hidden_size, dropout)
+            )
+        # output layer
+        self.output = nn.Sequential(nn.PReLU(), nn.Conv2d(input_size, output_size, 1))
+    def forward(self, input):
+        # input shape: batch, N, dim1, dim2
+        # apply transformer on dim1 first and then dim2
+        # output shape: B, output_size, dim1, dim2
+        # input = input.to(device)
+        batch_size, _, dim1, dim2 = input.shape
+        output = input
+        for i in range(len(self.row_transformer)):
+            row_input = (
+                output.permute(0, 3, 2, 1)
+                .contiguous()
+                .view(batch_size * dim2, dim1, -1)
+            )  # B*dim2, dim1, N
+            row_output = self.row_transformer[i](row_input)  # B*dim2, dim1, H
+            row_output = (
+                row_output.view(batch_size, dim2, dim1, -1)
+                .permute(0, 3, 2, 1)
+                .contiguous()
+            )  # B, N, dim1, dim2
+            output = row_output
+            col_input = (
+                output.permute(0, 2, 3, 1)
+                .contiguous()
+                .view(batch_size * dim1, dim2, -1)
+            )  # B*dim1, dim2, N
+            col_output = self.col_transformer[i](col_input)  # B*dim1, dim2, H
+            col_output = (
+                col_output.view(batch_size, dim1, dim2, -1)
+                .permute(0, 3, 1, 2)
+                .contiguous()
+            )  # B, N, dim1, dim2
+            output = col_output
+        output = self.output(output)  # B, output_size, dim1, dim2
+        return output
+# base module for deep DPT
+class DPT_base(nn.Module):
+    def __init__(
+        self, input_dim, feature_dim, hidden_dim, num_spk=2, layer=6, segment_size=250
+    ):
+        super(DPT_base, self).__init__()
+        self.input_dim = input_dim
+        self.feature_dim = feature_dim
+        self.hidden_dim = hidden_dim
+        self.layer = layer
+        self.segment_size = segment_size
+        self.num_spk = num_spk
+        self.eps = 1e-8
+        # bottleneck
+        self.BN = nn.Conv1d(self.input_dim, self.feature_dim, 1, bias=False)
+        # DPT model
+        self.DPT = DPT(
+            self.feature_dim,
+            self.hidden_dim,
+            self.feature_dim * self.num_spk,
+            num_layers=layer,
+        )
+    def pad_segment(self, input, segment_size):
+        # input is the features: (B, N, T)
+        batch_size, dim, seq_len = input.shape
+        segment_stride = segment_size // 2
+        rest = segment_size - (segment_stride + seq_len % segment_size) % segment_size
+        if rest > 0:
+            pad = Variable(torch.zeros(batch_size, dim, rest)).type(input.type())
+            input = torch.cat([input, pad], 2)
+        pad_aux = Variable(torch.zeros(batch_size, dim, segment_stride)).type(
+            input.type()
+        )
+        input = torch.cat([pad_aux, input, pad_aux], 2)
+        return input, rest
+    def split_feature(self, input, segment_size):
+        # split the feature into chunks of segment size
+        # input is the features: (B, N, T)
+        input, rest = self.pad_segment(input, segment_size)
+        batch_size, dim, seq_len = input.shape
+        segment_stride = segment_size // 2
+        segments1 = (
+            input[:, :, :-segment_stride]
+            .contiguous()
+            .view(batch_size, dim, -1, segment_size)
+        )
+        segments2 = (
+            input[:, :, segment_stride:]
+            .contiguous()
+            .view(batch_size, dim, -1, segment_size)
+        )
+        segments = (
+            torch.cat([segments1, segments2], 3)
+            .view(batch_size, dim, -1, segment_size)
+            .transpose(2, 3)
+        )
+        return segments.contiguous(), rest
+    def merge_feature(self, input, rest):
+        # merge the splitted features into full utterance
+        # input is the features: (B, N, L, K)
+        batch_size, dim, segment_size, _ = input.shape
+        segment_stride = segment_size // 2
+        input = (
+            input.transpose(2, 3)
+            .contiguous()
+            .view(batch_size, dim, -1, segment_size * 2)
+        )  # B, N, K, L
+        input1 = (
+            input[:, :, :, :segment_size]
+            .contiguous()
+            .view(batch_size, dim, -1)[:, :, segment_stride:]
+        )
+        input2 = (
+            input[:, :, :, segment_size:]
+            .contiguous()
+            .view(batch_size, dim, -1)[:, :, :-segment_stride]
+        )
+        output = input1 + input2
+        if rest > 0:
+            output = output[:, :, :-rest]
+        return output.contiguous()  # B, N, T
+    def forward(self, input):
+        pass
+class DPTSeparation(DPT_base):
+    def __init__(self, *args, **kwargs):
+        super(DPTSeparation, self).__init__(*args, **kwargs)
+        # gated output layer
+        self.output = nn.Sequential(
+            nn.Conv1d(self.feature_dim, self.feature_dim, 1), nn.Tanh()
+        )
+        self.output_gate = nn.Sequential(
+            nn.Conv1d(self.feature_dim, self.feature_dim, 1), nn.Sigmoid()
+        )
+    def forward(self, input):
+        # input = input.to(device)
+        # input: (B, E, T)
+        batch_size, E, seq_length = input.shape
+        enc_feature = self.BN(input)  # (B, E, L)-->(B, N, L)
+        # split the encoder output into overlapped, longer segments
+        enc_segments, enc_rest = self.split_feature(
+            enc_feature, self.segment_size
+        )  # B, N, L, K: L is the segment_size
+        # print('enc_segments.shape {}'.format(enc_segments.shape))
+        # pass to DPT
+        output = self.DPT(enc_segments).view(
+            batch_size * self.num_spk, self.feature_dim, self.segment_size, -1
+        )  # B*nspk, N, L, K
+        # overlap-and-add of the outputs
+        output = self.merge_feature(output, enc_rest)  # B*nspk, N, T
+        # gated output layer for filter generation
+        bf_filter = self.output(output) * self.output_gate(output)  # B*nspk, K, T
+        bf_filter = (
+            bf_filter.transpose(1, 2)
+            .contiguous()
+            .view(batch_size, self.num_spk, -1, self.feature_dim)
+        )  # B, nspk, T, N
+        return bf_filter
+class DPTDecoder(nn.Module):
+    def __init__(self, n_filters: int = 64, window_size: int = 2):
+        super().__init__()
+        self.W = window_size
+        self.basis_signals = nn.Linear(n_filters, window_size, bias=False)
+    def forward(self, mixture, mask):
+        """
+        mixture: (batch, n_filters, L)
+        mask: (batch, sources, n_filters, L)
+        """
+        source_w = torch.unsqueeze(mixture, 1) * mask  # [B, C, E, L]
+        source_w = torch.transpose(source_w, 2, 3)  # [B, C, L, E]
+        # S = DV
+        est_source = self.basis_signals(source_w)  # [B, C, L, W]
+        est_source = overlap_and_add(est_source, self.W // 2)  # B x C x T
+        return est_source
+def overlap_and_add(signal, frame_step):
+    """Reconstructs a signal from a framed representation.
+    Adds potentially overlapping frames of a signal with shape
+    `[..., frames, frame_length]`, offsetting subsequent frames by `frame_step`.
+    The resulting tensor has shape `[..., output_size]` where
+        output_size = (frames - 1) * frame_step + frame_length
+    Args:
+        signal: A [..., frames, frame_length] Tensor.
+        All dimensions may be unknown, and rank must be at least 2.
+        frame_step: An integer denoting overlap offsets. Must be less than or equal to frame_length.
+    Returns:
+        A Tensor with shape [..., output_size] containing the overlap-added frames of signal's
+        inner-most two dimensions.
+        output_size = (frames - 1) * frame_step + frame_length
+    Based on https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
+    """
+    outer_dimensions = signal.size()[:-2]
+    frames, frame_length = signal.size()[-2:]
+    subframe_length = math.gcd(frame_length, frame_step)  # gcd=Greatest Common Divisor
+    subframe_step = frame_step // subframe_length
+    subframes_per_frame = frame_length // subframe_length
+    output_size = frame_step * (frames - 1) + frame_length
+    output_subframes = output_size // subframe_length
+    subframe_signal = signal.reshape(*outer_dimensions, -1, subframe_length)
+    frame = torch.arange(0, output_subframes).unfold(
+        0, subframes_per_frame, subframe_step
+    )
+    frame = signal.new_tensor(frame).long()  # signal may in GPU or CPU
+    frame = frame.contiguous().view(-1)
+    result = signal.new_zeros(*outer_dimensions, output_subframes, subframe_length)
+    result.index_add_(-2, frame, subframe_signal)
+    result = result.view(*outer_dimensions, -1)
+    return result

remfx/effects.py CHANGED Viewed

@@ -701,7 +701,7 @@ class RandomAudioEffectsChannel(torch.nn.Module):
 Pedalboard_Effects = [
     RandomPedalboardReverb,
     RandomPedalboardChorus,
-    # RandomPedalboardDelay,
     RandomPedalboardDistortion,
     RandomPedalboardCompressor,
     # RandomPedalboardPhaser,

 Pedalboard_Effects = [
     RandomPedalboardReverb,
     RandomPedalboardChorus,
+    RandomPedalboardDelay,
     RandomPedalboardDistortion,
     RandomPedalboardCompressor,
     # RandomPedalboardPhaser,

remfx/models.py CHANGED Viewed

@@ -1,22 +1,22 @@
-import wandb
 import torch
-import torchaudio
 import torchmetrics
 import pytorch_lightning as pl
-import torch.nn.functional as F
 from torch import Tensor, nn
-from einops import rearrange
 from torchaudio.models import HDemucs
 from audio_diffusion_pytorch import DiffusionModel
 from auraloss.time import SISDRLoss
 from auraloss.freq import MultiResolutionSTFTLoss
 from umx.openunmix.model import OpenUnmix, Separator
-from remfx.utils import FADLoss
-class RemFXModel(pl.LightningModule):
     def __init__(
         self,
         lr: float,
@@ -35,7 +35,7 @@ class RemFXModel(pl.LightningModule):
         self.lr_weight_decay = lr_weight_decay
         self.sample_rate = sample_rate
         self.model = network
-        self.metrics = torch.nn.ModuleDict(
             {
                 "SISDR": SISDRLoss(),
                 "STFT": MultiResolutionSTFTLoss(),
@@ -57,44 +57,33 @@ class RemFXModel(pl.LightningModule):
             eps=self.lr_eps,
             weight_decay=self.lr_weight_decay,
         )
-        return optimizer
-    # Add step-based learning rate scheduler
-    def optimizer_step(
-        self,
-        epoch,
-        batch_idx,
-        optimizer,
-        optimizer_idx,
-        optimizer_closure,
-        on_tpu,
-        using_lbfgs,
-    ):
-        # update params
-        optimizer.step(closure=optimizer_closure)
-        # update learning rate. Reduce by factor of 10 at 80% and 95% of training
-        if self.trainer.global_step == 0.8 * self.trainer.max_steps:
-            for pg in optimizer.param_groups:
-                pg["lr"] = 0.1 * pg["lr"]
-        if self.trainer.global_step == 0.95 * self.trainer.max_steps:
-            for pg in optimizer.param_groups:
-                pg["lr"] = 0.1 * pg["lr"]
     def training_step(self, batch, batch_idx):
-        loss = self.common_step(batch, batch_idx, mode="train")
-        return loss
     def validation_step(self, batch, batch_idx):
-        loss = self.common_step(batch, batch_idx, mode="valid")
-        return loss
     def test_step(self, batch, batch_idx):
-        loss = self.common_step(batch, batch_idx, mode="test")
-        return loss
     def common_step(self, batch, batch_idx, mode: str = "train"):
-        x, y, _, _ = batch
         loss, output = self.model((x, y))
         self.log(f"{mode}_loss", loss)
         # Metric logging
@@ -117,91 +106,10 @@ class RemFXModel(pl.LightningModule):
                     prog_bar=True,
                     sync_dist=True,
                 )
         return loss
-    def on_train_batch_start(self, batch, batch_idx):
-        # Log initial audio
-        if self.log_train_audio:
-            x, y, _, _ = batch
-            # Concat samples together for easier viewing in dashboard
-            input_samples = rearrange(x, "b c t -> c (b t)").unsqueeze(0)
-            target_samples = rearrange(y, "b c t -> c (b t)").unsqueeze(0)
-            log_wandb_audio_batch(
-                logger=self.logger,
-                id="input_effected_audio",
-                samples=input_samples.cpu(),
-                sampling_rate=self.sample_rate,
-                caption="Training Data",
-            )
-            log_wandb_audio_batch(
-                logger=self.logger,
-                id="target_audio",
-                samples=target_samples.cpu(),
-                sampling_rate=self.sample_rate,
-                caption="Target Data",
-            )
-            self.log_train_audio = False
-    def on_validation_batch_start(self, batch, batch_idx, dataloader_idx):
-        x, target, _, _ = batch
-        # Log Input Metrics
-        for metric in self.metrics:
-            # SISDR returns negative values, so negate them
-            if metric == "SISDR":
-                negate = -1
-            else:
-                negate = 1
-            # Only Log FAD on test set
-            if metric == "FAD":
-                continue
-            self.log(
-                f"Input_{metric}",
-                negate * self.metrics[metric](x, target),
-                on_step=False,
-                on_epoch=True,
-                logger=True,
-                prog_bar=True,
-                sync_dist=True,
-            )
-        # Only run on first batch
-        if batch_idx == 0:
-            self.model.eval()
-            with torch.no_grad():
-                y = self.model.sample(x)
-            # Concat samples together for easier viewing in dashboard
-            # 2 seconds of silence between each sample
-            silence = torch.zeros_like(x)
-            silence = silence[:, : self.sample_rate * 2]
-            concat_samples = torch.cat([y, silence, x, silence, target], dim=-1)
-            log_wandb_audio_batch(
-                logger=self.logger,
-                id="prediction_input_target",
-                samples=concat_samples.cpu(),
-                sampling_rate=self.sample_rate,
-                caption=f"Epoch {self.current_epoch}",
-            )
-            self.model.train()
-    def on_test_batch_start(self, batch, batch_idx, dataloader_idx):
-        self.on_validation_batch_start(batch, batch_idx, dataloader_idx)
-        # Log FAD
-        x, target, _, _ = batch
-        self.log(
-            "Input_FAD",
-            self.metrics["FAD"](x, target),
-            on_step=False,
-            on_epoch=True,
-            logger=True,
-            prog_bar=True,
-            sync_dist=True,
-        )
-class OpenUnmixModel(torch.nn.Module):
     def __init__(
         self,
         n_fft: int = 2048,
@@ -234,7 +142,7 @@ class OpenUnmixModel(torch.nn.Module):
         self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=self.sample_rate
         )
-        self.l1loss = torch.nn.L1Loss()
     def forward(self, batch):
         x, target = batch
@@ -249,7 +157,7 @@ class OpenUnmixModel(torch.nn.Module):
         return self.separator(x).squeeze(1)
-class DemucsModel(torch.nn.Module):
     def __init__(self, sample_rate, **kwargs) -> None:
         super().__init__()
         self.model = HDemucs(**kwargs)
@@ -257,7 +165,7 @@ class DemucsModel(torch.nn.Module):
         self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=sample_rate
         )
-        self.l1loss = torch.nn.L1Loss()
     def forward(self, batch):
         x, target = batch
@@ -284,201 +192,70 @@ class DiffusionGenerationModel(nn.Module):
         return self.model.sample(noise, num_steps=num_steps)
-def log_wandb_audio_batch(
-    logger: pl.loggers.WandbLogger,
-    id: str,
-    samples: Tensor,
-    sampling_rate: int,
-    caption: str = "",
-    max_items: int = 10,
-):
-    num_items = samples.shape[0]
-    samples = rearrange(samples, "b c t -> b t c")
-    for idx in range(num_items):
-        if idx >= max_items:
-            break
-        logger.experiment.log(
-            {
-                f"{id}_{idx}": wandb.Audio(
-                    samples[idx].cpu().numpy(),
-                    caption=caption,
-                    sample_rate=sampling_rate,
-                )
-            }
         )
-def spectrogram(
-    x: torch.Tensor,
-    window: torch.Tensor,
-    n_fft: int,
-    hop_length: int,
-    alpha: float,
-) -> torch.Tensor:
-    bs, chs, samp = x.size()
-    x = x.view(bs * chs, -1)  # move channels onto batch dim
-    X = torch.stft(
-        x,
-        n_fft=n_fft,
-        hop_length=hop_length,
-        window=window,
-        return_complex=True,
-    )
-    # move channels back
-    X = X.view(bs, chs, X.shape[-2], X.shape[-1])
-    return torch.pow(X.abs() + 1e-8, alpha)
-# adapted from https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/models.py
-def init_layer(layer):
-    """Initialize a Linear or Convolutional layer."""
-    nn.init.xavier_uniform_(layer.weight)
-    if hasattr(layer, "bias"):
-        if layer.bias is not None:
-            layer.bias.data.fill_(0.0)
-def init_bn(bn):
-    """Initialize a Batchnorm layer."""
-    bn.bias.data.fill_(0.0)
-    bn.weight.data.fill_(1.0)
-class ConvBlock(nn.Module):
-    def __init__(self, in_channels, out_channels):
-        super(ConvBlock, self).__init__()
-        self.conv1 = nn.Conv2d(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=(3, 3),
-            stride=(1, 1),
-            padding=(1, 1),
-            bias=False,
-        )
-        self.conv2 = nn.Conv2d(
-            in_channels=out_channels,
-            out_channels=out_channels,
-            kernel_size=(3, 3),
-            stride=(1, 1),
-            padding=(1, 1),
-            bias=False,
         )
-        self.bn1 = nn.BatchNorm2d(out_channels)
-        self.bn2 = nn.BatchNorm2d(out_channels)
-        self.init_weight()
-    def init_weight(self):
-        init_layer(self.conv1)
-        init_layer(self.conv2)
-        init_bn(self.bn1)
-        init_bn(self.bn2)
-    def forward(self, input, pool_size=(2, 2), pool_type="avg"):
-        x = input
-        x = F.relu_(self.bn1(self.conv1(x)))
-        x = F.relu_(self.bn2(self.conv2(x)))
-        if pool_type == "max":
-            x = F.max_pool2d(x, kernel_size=pool_size)
-        elif pool_type == "avg":
-            x = F.avg_pool2d(x, kernel_size=pool_size)
-        elif pool_type == "avg+max":
-            x1 = F.avg_pool2d(x, kernel_size=pool_size)
-            x2 = F.max_pool2d(x, kernel_size=pool_size)
-            x = x1 + x2
-        else:
-            raise Exception("Incorrect argument!")
-        return x
-class Cnn14(nn.Module):
-    def __init__(
-        self,
-        num_classes: int,
-        sample_rate: float,
-        n_fft: int = 2048,
-        hop_length: int = 512,
-        n_mels: int = 128,
-    ):
         super().__init__()
-        self.num_classes = num_classes
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        window = torch.hann_window(n_fft)
-        self.register_buffer("window", window)
-        self.melspec = torchaudio.transforms.MelSpectrogram(
-            sample_rate,
-            n_fft,
-            hop_length=hop_length,
-            n_mels=n_mels,
         )
-        self.bn0 = nn.BatchNorm2d(n_mels)
-        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
-        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
-        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
-        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
-        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
-        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
-        self.fc1 = nn.Linear(2048, 2048, bias=True)
-        self.fc_audioset = nn.Linear(2048, num_classes, bias=True)
-        self.init_weight()
-    def init_weight(self):
-        init_bn(self.bn0)
-        init_layer(self.fc1)
-        init_layer(self.fc_audioset)
-    def forward(self, x: torch.Tensor):
-        """
-        Input: (batch_size, data_length)"""
-        x = self.melspec(x)
-        x = x.permute(0, 2, 1, 3)
-        x = self.bn0(x)
-        x = x.permute(0, 2, 1, 3)
-        if self.training:
-            pass
-            # x = self.spec_augmenter(x)
-        x = self.conv_block1(x, pool_size=(2, 2), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv_block2(x, pool_size=(2, 2), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv_block3(x, pool_size=(2, 2), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv_block4(x, pool_size=(2, 2), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv_block5(x, pool_size=(2, 2), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = self.conv_block6(x, pool_size=(1, 1), pool_type="avg")
-        x = F.dropout(x, p=0.2, training=self.training)
-        x = torch.mean(x, dim=3)
-        (x1, _) = torch.max(x, dim=2)
-        x2 = torch.mean(x, dim=2)
-        x = x1 + x2
-        x = F.dropout(x, p=0.5, training=self.training)
-        x = F.relu_(self.fc1(x))
-        clipwise_output = self.fc_audioset(x)
-        return clipwise_output
 class FXClassifier(pl.LightningModule):
@@ -501,7 +278,7 @@ class FXClassifier(pl.LightningModule):
     def common_step(self, batch, batch_idx, mode: str = "train"):
         x, y, dry_label, wet_label = batch
         pred_label = self.network(x)
-        loss = torch.nn.functional.cross_entropy(pred_label, dry_label)
         self.log(
             f"{mode}_loss",
             loss,

 import torch
 import torchmetrics
 import pytorch_lightning as pl
 from torch import Tensor, nn
+from torch.nn import functional as F
 from torchaudio.models import HDemucs
 from audio_diffusion_pytorch import DiffusionModel
 from auraloss.time import SISDRLoss
 from auraloss.freq import MultiResolutionSTFTLoss
 from umx.openunmix.model import OpenUnmix, Separator
+from remfx.utils import FADLoss, spectrogram
+from remfx.dptnet import DPTNet_base
+from remfx.dcunet import RefineSpectrogramUnet
+from remfx.tcn import TCN
+from remfx.utils import causal_crop
+class RemFX(pl.LightningModule):
     def __init__(
         self,
         lr: float,
         self.lr_weight_decay = lr_weight_decay
         self.sample_rate = sample_rate
         self.model = network
+        self.metrics = nn.ModuleDict(
             {
                 "SISDR": SISDRLoss(),
                 "STFT": MultiResolutionSTFTLoss(),
             eps=self.lr_eps,
             weight_decay=self.lr_weight_decay,
         )
+        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
+            optimizer,
+            [0.8 * self.trainer.max_steps, 0.95 * self.trainer.max_steps],
+            gamma=0.1,
+        )
+        return {
+            "optimizer": optimizer,
+            "lr_scheduler": {
+                "scheduler": lr_scheduler,
+                "monitor": "val_loss",
+                "interval": "step",
+                "frequency": 1,
+            },
+        }
     def training_step(self, batch, batch_idx):
+        return self.common_step(batch, batch_idx, mode="train")
     def validation_step(self, batch, batch_idx):
+        return self.common_step(batch, batch_idx, mode="valid")
     def test_step(self, batch, batch_idx):
+        return self.common_step(batch, batch_idx, mode="test")
     def common_step(self, batch, batch_idx, mode: str = "train"):
+        x, y, _, _ = batch  # x, y = (B, C, T), (B, C, T)
         loss, output = self.model((x, y))
         self.log(f"{mode}_loss", loss)
         # Metric logging
                     prog_bar=True,
                     sync_dist=True,
                 )
         return loss
+class OpenUnmixModel(nn.Module):
     def __init__(
         self,
         n_fft: int = 2048,
         self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=self.sample_rate
         )
+        self.l1loss = nn.L1Loss()
     def forward(self, batch):
         x, target = batch
         return self.separator(x).squeeze(1)
+class DemucsModel(nn.Module):
     def __init__(self, sample_rate, **kwargs) -> None:
         super().__init__()
         self.model = HDemucs(**kwargs)
         self.mrstftloss = MultiResolutionSTFTLoss(
             n_bins=self.num_bins, sample_rate=sample_rate
         )
+        self.l1loss = nn.L1Loss()
     def forward(self, batch):
         x, target = batch
         return self.model.sample(noise, num_steps=num_steps)
+class DPTNetModel(nn.Module):
+    """Wrap ``DPTNet_base`` together with the loss used to train it.
+
+    The objective is a multi-resolution STFT loss plus an L1 loss scaled by 100.
+    """
+
+    def __init__(self, sample_rate, num_bins, **kwargs):
+        """Build the network and its loss terms.
+
+        Args:
+            sample_rate: Forwarded to ``MultiResolutionSTFTLoss``.
+            num_bins: Forwarded to ``MultiResolutionSTFTLoss`` as ``n_bins``.
+            **kwargs: Forwarded unchanged to ``DPTNet_base``.
+        """
+        super().__init__()
+        self.model = DPTNet_base(**kwargs)
+        self.num_bins = num_bins
+        self.mrstftloss = MultiResolutionSTFTLoss(
+            n_bins=num_bins,
+            sample_rate=sample_rate,
         )
+        self.l1loss = nn.L1Loss()
+
+    def forward(self, batch):
+        """Return ``(loss, output)`` for an ``(input, target)`` pair."""
+        inputs, reference = batch
+        # Dimension 1 of the input is squeezed before the network sees it.
+        # NOTE(review): the output is compared to the target as-is; confirm
+        # that DPTNet_base returns a tensor shaped like the target.
+        estimate = self.model(inputs.squeeze(1))
+        spectral_term = self.mrstftloss(estimate, reference)
+        waveform_term = self.l1loss(estimate, reference) * 100
+        return spectral_term + waveform_term, estimate
+
+    def sample(self, x: Tensor) -> Tensor:
+        """Run the network on ``x`` without computing a loss."""
+        return self.model(x.squeeze(1))
+class DCUNetModel(nn.Module):
+    """Wrap ``RefineSpectrogramUnet`` together with the loss used to train it.
+
+    The objective is a multi-resolution STFT loss plus an L1 loss scaled by 100.
+    """
+
+    def __init__(self, sample_rate, num_bins, **kwargs):
+        """Build the network and its loss terms.
+
+        Args:
+            sample_rate: Forwarded to ``MultiResolutionSTFTLoss``.
+            num_bins: Forwarded to ``MultiResolutionSTFTLoss`` as ``n_bins``.
+            **kwargs: Forwarded unchanged to ``RefineSpectrogramUnet``.
+        """
+        super().__init__()
+        self.model = RefineSpectrogramUnet(**kwargs)
+        self.mrstftloss = MultiResolutionSTFTLoss(
+            n_bins=num_bins,
+            sample_rate=sample_rate,
         )
+        self.l1loss = nn.L1Loss()
+
+    def forward(self, batch):
+        """Return ``(loss, output)`` for an ``(input, target)`` pair."""
+        inputs, reference = batch
+        estimate = self.model(inputs.squeeze(1)).unsqueeze(1)  # B x 1 x T
+        # The network may return fewer samples than it was given; shorten the
+        # target to the same length before comparing.
+        n_out = estimate.shape[-1]
+        if n_out < reference.shape[-1]:
+            reference = causal_crop(reference, n_out)
+        spectral_term = self.mrstftloss(estimate, reference)
+        waveform_term = self.l1loss(estimate, reference) * 100
+        return spectral_term + waveform_term, estimate
+
+    def sample(self, x: Tensor) -> Tensor:
+        """Run the network on ``x`` without computing a loss."""
+        squeezed = x.squeeze(1)
+        return self.model(squeezed).unsqueeze(1)  # B x 1 x T
+class TCNModel(nn.Module):
+    """Wrap ``TCN`` together with the loss used to train it.
+
+    The objective is a multi-resolution STFT loss plus an L1 loss scaled by 100.
+    """
+
+    def __init__(self, sample_rate, num_bins, **kwargs):
+        """Build the network and its loss terms.
+
+        Args:
+            sample_rate: Forwarded to ``MultiResolutionSTFTLoss``.
+            num_bins: Forwarded to ``MultiResolutionSTFTLoss`` as ``n_bins``.
+            **kwargs: Forwarded unchanged to ``TCN``.
+        """
         super().__init__()
+        self.model = TCN(**kwargs)
+        self.mrstftloss = MultiResolutionSTFTLoss(
+            n_bins=num_bins,
+            sample_rate=sample_rate,
         )
+        self.l1loss = nn.L1Loss()
+
+    def forward(self, batch):
+        """Return ``(loss, output)`` for an ``(input, target)`` pair."""
+        inputs, reference = batch
+        estimate = self.model(inputs)  # B x 1 x T
+        # The network may return fewer samples than it was given; shorten the
+        # target to the same length before comparing.
+        n_out = estimate.shape[-1]
+        if n_out < reference.shape[-1]:
+            reference = causal_crop(reference, n_out)
+        spectral_term = self.mrstftloss(estimate, reference)
+        waveform_term = self.l1loss(estimate, reference) * 100
+        return spectral_term + waveform_term, estimate
+
+    def sample(self, x: Tensor) -> Tensor:
+        """Run the network on ``x`` without computing a loss."""
+        return self.model(x)  # B x 1 x T
 class FXClassifier(pl.LightningModule):
     def common_step(self, batch, batch_idx, mode: str = "train"):
         x, y, dry_label, wet_label = batch
         pred_label = self.network(x)
+        loss = nn.functional.cross_entropy(pred_label, dry_label)
         self.log(
             f"{mode}_loss",
             loss,

remfx/tcn.py ADDED Viewed

	@@ -0,0 +1,143 @@

+# This code is based on the following repository written by Christian J. Steinmetz
+# https://github.com/csteinmetz1/micro-tcn
+from typing import Callable
+import torch
+import torch.nn as nn
+from torch import Tensor
+from remfx.utils import causal_crop, center_crop
+class TCNBlock(nn.Module):
+    """One unpadded dilated 1-D convolution with PReLU and a 1x1 residual path.
+
+    Because the main convolution uses no padding its output is shorter than
+    its input; the residual branch is cropped with ``crop_fn`` to match.
+    """
+
+    def __init__(
+        self,
+        in_ch: int,
+        out_ch: int,
+        kernel_size: int = 3,
+        dilation: int = 1,
+        stride: int = 1,
+        crop_fn: Callable = causal_crop,
+    ) -> None:
+        """Create the convolution, residual projection and activation.
+
+        Args:
+            in_ch: Number of input channels.
+            out_ch: Number of output channels.
+            kernel_size: Kernel size of the main convolution.
+            dilation: Dilation of the main convolution.
+            stride: Stride shared by the main and residual convolutions.
+            crop_fn: Called as ``crop_fn(tensor, length)`` to shorten the
+                residual branch to the length of the main branch.
+        """
+        super().__init__()
+        self.in_ch, self.out_ch = in_ch, out_ch
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.crop_fn = crop_fn
+
+        # Main path: no padding, so every layer trims the time axis.
+        self.conv1 = nn.Conv1d(
+            in_channels=in_ch,
+            out_channels=out_ch,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=0,
+            dilation=dilation,
+            bias=True,
+        )
+        # Residual path: bias-free 1x1 projection to the output channel count.
+        self.res = nn.Conv1d(
+            in_channels=in_ch,
+            out_channels=out_ch,
+            kernel_size=1,
+            stride=stride,
+            groups=1,
+            bias=False,
+        )
+        # One learnable PReLU slope per output channel.
+        self.relu = nn.PReLU(out_ch)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Return ``PReLU(conv1(x))`` plus the cropped residual projection of ``x``."""
+        activated = self.relu(self.conv1(x))
+        shortcut = self.res(x)
+        # Trim the shortcut so both branches have the same number of samples.
+        return activated + self.crop_fn(shortcut, activated.shape[-1])
+class TCN(nn.Module):
+    """Stack of dilated ``TCNBlock`` layers followed by a 1x1 output convolution.
+
+    The 1x1 convolution output is multiplied with the (cropped) input signal
+    and passed through ``tanh``.
+    """
+
+    def __init__(
+        self,
+        ninputs: int = 1,
+        noutputs: int = 1,
+        nblocks: int = 4,
+        channel_growth: int = 0,
+        channel_width: int = 32,
+        kernel_size: int = 13,
+        stack_size: int = 10,
+        dilation_growth: int = 10,
+        condition: bool = False,
+        latent_dim: int = 2,
+        norm_type: str = "identity",
+        causal: bool = False,
+        estimate_loudness: bool = False,
+    ) -> None:
+        """Build the block stack and the output projection.
+
+        Args:
+            ninputs: Input channels of the first block.
+            noutputs: Output channels of the final 1x1 convolution.
+            nblocks: Number of ``TCNBlock`` layers.
+            channel_growth: When greater than 1, each block multiplies the
+                channel count by this factor; otherwise ``channel_width`` is used.
+            channel_width: Channel count of every block when ``channel_growth``
+                is not greater than 1.
+            kernel_size: Kernel size of every block.
+            stack_size: Block ``n`` uses exponent ``n % stack_size`` for its
+                dilation.
+            dilation_growth: Base of the per-block dilation.
+            condition: Stored on the instance; not otherwise used in this class.
+            latent_dim: Input width of the optional loudness layer.
+            norm_type: Stored on the instance; not otherwise used in this class.
+            causal: Selects ``causal_crop`` (True) or ``center_crop`` (False)
+                for the residual paths of the blocks.
+            estimate_loudness: When True a ``Linear(latent_dim, 1)`` layer is
+                created; ``forward`` does not call it.
+        """
+        super().__init__()
+        self.ninputs = ninputs
+        self.noutputs = noutputs
+        self.nblocks = nblocks
+        self.channel_growth = channel_growth
+        self.channel_width = channel_width
+        self.kernel_size = kernel_size
+        self.stack_size = stack_size
+        self.dilation_growth = dilation_growth
+        self.condition = condition
+        self.latent_dim = latent_dim
+        self.norm_type = norm_type
+        self.causal = causal
+        self.estimate_loudness = estimate_loudness
+
+        print(f"Causal: {self.causal}")
+        self.crop_fn = causal_crop if self.causal else center_crop
+
+        if estimate_loudness:
+            self.loudness = torch.nn.Linear(latent_dim, 1)
+
+        # audio model
+        self.process_blocks = torch.nn.ModuleList()
+        out_ch = -1
+        for n in range(nblocks):
+            in_ch = ninputs if n == 0 else out_ch
+            if channel_growth > 1:
+                out_ch = in_ch * channel_growth
+            else:
+                out_ch = channel_width
+            block = TCNBlock(
+                in_ch,
+                out_ch,
+                kernel_size,
+                dilation_growth ** (n % stack_size),
+                stride=1,
+                crop_fn=self.crop_fn,
+            )
+            self.process_blocks.append(block)
+        self.output = nn.Conv1d(out_ch, noutputs, kernel_size=1)
+
+        # model configuration
+        self.receptive_field = self.compute_receptive_field()
+        self.block_size = 2048
+        # Plain tensor attribute (not a registered buffer).
+        self.buffer = torch.zeros(2, self.receptive_field + self.block_size - 1)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Return ``tanh(output(blocks(x)) * cropped_input)``."""
+        dry = x
+        for block in self.process_blocks:
+            x = block(x)
+        # NOTE(review): the input is always cropped with causal_crop, even when
+        # the blocks were built with center_crop (causal=False); confirm intent.
+        dry = causal_crop(dry, x.shape[-1])
+        gain_ln = self.output(x)
+        return torch.tanh(gain_ln * dry)
+
+    def compute_receptive_field(self):
+        """Compute the receptive field in samples."""
+        span = self.kernel_size - 1
+        # The first block contributes kernel_size samples; each later block
+        # adds (kernel_size - 1) samples scaled by its dilation.
+        return self.kernel_size + sum(
+            span * self.dilation_growth ** (n % self.stack_size)
+            for n in range(1, self.nblocks)
+        )

remfx/utils.py CHANGED Viewed

@@ -7,6 +7,8 @@ from frechet_audio_distance import FrechetAudioDistance
 import numpy as np
 import torch
 import torchaudio
 def get_logger(name=__name__) -> logging.Logger:
@@ -138,3 +140,79 @@ def create_sequential_chunks(
             break
         chunks.append(audio[:, start : start + chunk_size])
     return chunks, sr

 import numpy as np
 import torch
 import torchaudio
+from torch import nn
+import collections.abc
 def get_logger(name=__name__) -> logging.Logger:
             break
         chunks.append(audio[:, start : start + chunk_size])
     return chunks, sr
+def spectrogram(
+    x: torch.Tensor,
+    window: torch.Tensor,
+    n_fft: int,
+    hop_length: int,
+    alpha: float,
+) -> torch.Tensor:
+    """Return the STFT magnitude of ``x`` raised to the power ``alpha``.
+
+    Args:
+        x: Tensor with exactly three dimensions; the first two are folded
+            together for the STFT and restored afterwards.
+        window: Window tensor passed to ``torch.stft``.
+        n_fft: FFT size passed to ``torch.stft``.
+        hop_length: Hop length passed to ``torch.stft``.
+        alpha: Exponent applied to the magnitude.
+
+    Returns:
+        Tensor of shape ``(x.shape[0], x.shape[1], freq, frames)``.
+    """
+    bs, chs, _ = x.size()
+    # reshape (rather than view) so permuted or otherwise non-contiguous
+    # inputs are accepted instead of raising.
+    flat = x.reshape(bs * chs, -1)  # move channels onto batch dim
+    X = torch.stft(
+        flat,
+        n_fft=n_fft,
+        hop_length=hop_length,
+        window=window,
+        return_complex=True,
+    )
+    # move channels back
+    X = X.reshape(bs, chs, X.shape[-2], X.shape[-1])
+    # The small offset keeps the base strictly positive before the power.
+    return torch.pow(X.abs() + 1e-8, alpha)
+def init_layer(layer):
+    """Xavier-uniform initialise ``layer.weight`` and zero its bias, if it has one."""
+    nn.init.xavier_uniform_(layer.weight)
+    bias = getattr(layer, "bias", None)
+    if bias is not None:
+        bias.data.fill_(0.0)
+def init_bn(bn):
+    """Reset a batch-norm layer's affine parameters: bias to 0, weight to 1."""
+    for attr, value in (("bias", 0.0), ("weight", 1.0)):
+        getattr(bn, attr).data.fill_(value)
+def _ntuple(n: int):
+    """Return a parser that repeats a scalar ``n`` times as a tuple.
+
+    Values that are already iterable are handed back unchanged.
+    """
+
+    def parse(x):
+        return x if isinstance(x, collections.abc.Iterable) else (x,) * n
+
+    return parse
+
+
+single = _ntuple(1)
+def concat_complex(a: torch.Tensor, b: torch.Tensor, dim: int = 1) -> torch.Tensor:
+    """
+    Concatenate two complex tensors in same dimension concept
+
+    Each input is split into two halves along ``dim`` (named real and
+    imaginary here); the result is ordered
+    ``[a_real, b_real, a_img, b_img]`` along ``dim``.
+
+    :param a: complex tensor
+    :param b: another complex tensor
+    :param dim: target dimension
+    :return: concatenated tensor
+    """
+    a_real, a_img = a.chunk(2, dim)
+    b_real, b_img = b.chunk(2, dim)
+    # Keep the real halves together in front of the imaginary halves.
+    return torch.cat([a_real, b_real, a_img, b_img], dim=dim)
+def center_crop(x, length: int):
+    """Return ``length`` samples taken from the middle of the last axis of ``x``.
+
+    When the surplus is odd, the extra sample is dropped from the end.
+    """
+    offset = (x.shape[-1] - length) // 2
+    return x[..., offset : offset + length]
+def causal_crop(x, length: int):
+    """Return the last ``length`` samples along the final axis of ``x``.
+
+    Args:
+        x: Array-like object exposing ``shape`` and supporting ``x[..., a:b]``.
+        length: Number of trailing samples to keep.
+
+    Returns:
+        A slice of ``x`` whose final axis has size ``length``.
+
+    Raises:
+        ValueError: If ``length`` is negative or exceeds ``x.shape[-1]``.
+    """
+    total = x.shape[-1]
+    if length < 0 or length > total:
+        raise ValueError(
+            f"cannot crop to {length} samples from an axis of size {total}"
+        )
+    # Slice from an explicit start index up to the end of the axis. The
+    # previous form stopped at total - 1, which dropped the newest sample and
+    # returned an empty result when length == total.
+    return x[..., total - length :]

scripts/test.py CHANGED Viewed

@@ -3,7 +3,6 @@ import hydra
 from omegaconf import DictConfig
 import remfx.utils as utils
 from pytorch_lightning.utilities.model_summary import ModelSummary
-from remfx.models import RemFXModel
 import torch
 log = utils.get_logger(__name__)

 from omegaconf import DictConfig
 import remfx.utils as utils
 from pytorch_lightning.utilities.model_summary import ModelSummary
 import torch
 log = utils.get_logger(__name__)