audio-styling

Build error

File size: 6,807 Bytes

51da11a

#   Copyright 2022 Christian J. Steinmetz

#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at

#       http://www.apache.org/licenses/LICENSE-2.0

#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

# TCN implementation adapted from:
# https://github.com/csteinmetz1/micro-tcn/blob/main/microtcn/tcn.py

import torch
from argparse import ArgumentParser

from deepafx_st.utils import center_crop, causal_crop


class FiLM(torch.nn.Module):
    """Feature-wise linear modulation applied after a non-affine BatchNorm1d.

    A single linear layer maps the conditioning input to a per-channel gain
    and bias. The input is batch-normalised (without learned affine terms)
    and then scaled and shifted by those conditional values.

    Args:
        num_features: Number of channels of the signal being modulated.
        cond_dim: Size of the last dimension of the conditioning input.
    """

    def __init__(self, num_features, cond_dim):
        super().__init__()
        self.num_features = num_features
        self.bn = torch.nn.BatchNorm1d(num_features, affine=False)
        # one projection yields both gain and bias, hence 2 x num_features
        self.adaptor = torch.nn.Linear(cond_dim, num_features * 2)

    def forward(self, x, cond):
        """Normalise ``x`` and apply the gain/bias derived from ``cond``.

        Args:
            x: Signal to modulate; presumably (batch, num_features, time).
            cond: Conditioning input whose last dimension is ``cond_dim``.
                A 2-D input gets a singleton middle dimension inserted so it
                broadcasts along the last axis of ``x``.

        Returns:
            The modulated tensor ``bn(x) * gain + bias``.
        """
        projected = self.adaptor(cond)

        # halve the projection into gain and bias, adding a virtual
        # middle dimension when the conditioning was only 2-D
        gain, bias = (
            part.unsqueeze(1) if part.ndim == 2 else part
            for part in torch.chunk(projected, 2, dim=-1)
        )

        # move the feature axis to position 1 so it lines up with channels
        gain = gain.permute(0, 2, 1)
        bias = bias.permute(0, 2, 1)

        normed = self.bn(x)  # BatchNorm without affine
        return (normed * gain) + bias  # conditional affine


class ConditionalTCNBlock(torch.nn.Module):
    """One TCN block: dilated conv -> FiLM -> PReLU, plus a 1x1 residual path.

    Args:
        in_ch: Number of input channels.
        out_ch: Number of output channels.
        cond_dim: Size of the conditioning input handed to the FiLM layer.
        kernel_size: Width of the dilated convolution kernel. Default: 3
        dilation: Dilation factor of the convolution. Default: 1
        causal: Selects ``causal_crop`` instead of ``center_crop`` when the
            residual path is trimmed to the convolution output. Default: False
        **kwargs: Accepted for call-site compatibility and not used.
    """

    def __init__(
        self, in_ch, out_ch, cond_dim, kernel_size=3, dilation=1, causal=False, **kwargs
    ):
        super().__init__()

        self.in_ch, self.out_ch = in_ch, out_ch
        self.kernel_size, self.dilation = kernel_size, dilation
        self.causal = causal

        # unpadded dilated convolution: the output is shorter than the input
        main_conv_opts = dict(
            kernel_size=kernel_size, padding=0, dilation=dilation, bias=True
        )
        self.conv1 = torch.nn.Conv1d(in_ch, out_ch, **main_conv_opts)
        self.film = FiLM(out_ch, cond_dim)
        self.relu = torch.nn.PReLU(out_ch)
        # grouped 1x1 convolution that brings the input to out_ch channels
        self.res = torch.nn.Conv1d(
            in_ch, out_ch, kernel_size=1, groups=in_ch, bias=False
        )

    def forward(self, x, p):
        """Run the block on ``x`` with conditioning ``p``.

        Args:
            x: Input signal for the convolution.
            p: Conditioning passed to the FiLM layer.

        Returns:
            Activated convolution output summed with the residual path, the
            latter cropped to the same last-dimension length.
        """
        crop = causal_crop if self.causal else center_crop

        hidden = self.conv1(x)
        hidden = self.film(hidden, p)  # FiLM conditioning
        hidden = self.relu(hidden)

        shortcut = self.res(x)

        # trim the residual so both paths agree on the last dimension
        return hidden + crop(shortcut, hidden.shape[-1])


class ConditionalTCN(torch.nn.Module):
    """Temporal convolutional network with conditioning module.

    Args:
        sample_rate (float): Audio sample rate.
        num_control_params (int, optional): Dimensionality of the conditioning signal. Default: 24
        ninputs (int, optional): Number of input channels (mono = 1, stereo 2). Default: 1
        noutputs (int, optional): Number of output channels (mono = 1, stereo 2). Default: 1
        nblocks (int, optional): Number of total TCN blocks; must be at least 1. Default: 10
        kernel_size (int, optional): Width of the convolutional kernels. Default: 15
        dilation_growth (int, optional): Compute the dilation factor at each block as dilation_growth ** (n % stack_size). Default: 2
        channel_growth (int, optional): When greater than 1, compute the output channels at each block as in_ch * channel_growth. Default: 1
        channel_width (int, optional): When channel_growth is not greater than 1, all blocks use convolutions with this many channels. Default: 64
        stack_size (int, optional): Number of blocks that constitute a single stack of blocks. Default: 10
        causal (bool, optional): Causal TCN configuration does not consider future input values. Default: False
        skip_connections (bool, optional): When True, the outputs of all blocks are cropped to a common length and summed into the input of the final 1x1 convolution. Default: False
        **kwargs: Accepted for call-site compatibility and not used.

    Raises:
        ValueError: If nblocks is less than 1.
    """

    def __init__(
        self,
        sample_rate,
        num_control_params=24,
        ninputs=1,
        noutputs=1,
        nblocks=10,
        kernel_size=15,
        dilation_growth=2,
        channel_growth=1,
        channel_width=64,
        stack_size=10,
        causal=False,
        skip_connections=False,
        **kwargs,
    ):
        super().__init__()

        # without at least one block there is no channel count from which
        # to build the output layer
        if nblocks < 1:
            raise ValueError(f"nblocks must be at least 1, got {nblocks}")

        self.num_control_params = num_control_params
        self.ninputs = ninputs
        self.noutputs = noutputs
        self.nblocks = nblocks
        self.kernel_size = kernel_size
        self.dilation_growth = dilation_growth
        self.channel_growth = channel_growth
        self.channel_width = channel_width
        self.stack_size = stack_size
        self.causal = causal
        self.skip_connections = skip_connections
        self.sample_rate = sample_rate

        self.blocks = torch.nn.ModuleList()
        in_ch = ninputs
        for n in range(nblocks):
            if self.channel_growth > 1:
                out_ch = in_ch * self.channel_growth
            else:
                out_ch = self.channel_width

            # dilation restarts from 1 at the beginning of every stack
            dilation = self.dilation_growth ** (n % self.stack_size)

            self.blocks.append(
                ConditionalTCNBlock(
                    in_ch,
                    out_ch,
                    self.num_control_params,
                    kernel_size=self.kernel_size,
                    dilation=dilation,
                    causal=self.causal,
                )
            )
            in_ch = out_ch  # the next block consumes this block's channels

        self.output = torch.nn.Conv1d(out_ch, noutputs, kernel_size=1)
        self.receptive_field = self.compute_receptive_field()

    def forward(self, x, p, **kwargs):
        """Process ``x`` through all blocks, conditioning each one on ``p``.

        Args:
            x: Input signal; presumably (batch, ninputs, time).
            p: Conditioning passed unchanged to every block.
            **kwargs: Accepted and not used.

        Returns:
            Output of the final 1x1 convolution.
        """
        # causally pad input signal: receptive_field - 1 zeros on the left
        # of the last dimension
        x = torch.nn.functional.pad(x, (self.receptive_field - 1, 0))

        # align earlier block outputs the same way the blocks align their
        # residual paths
        crop = causal_crop if self.causal else center_crop

        # stays 0 (a no-op in the final sum) unless skip connections are on
        skips = 0

        # iterate over blocks passing conditioning
        for idx, block in enumerate(self.blocks):
            x = block(x, p)
            if self.skip_connections:
                if idx == 0:
                    skips = x
                else:
                    # crop to the integer length of the current output
                    skips = crop(skips, x.shape[-1]) + x

        # final 1x1 convolution to collapse channels
        # NOTE: with skip connections the last block's output is present in
        # both x and skips.
        return self.output(x + skips)

    def compute_receptive_field(self):
        """Compute the receptive field in samples.

        The first block contributes kernel_size samples; every later block
        adds (kernel_size - 1) * dilation for its own dilation.
        """
        rf = self.kernel_size
        for n in range(1, self.nblocks):
            dilation = self.dilation_growth ** (n % self.stack_size)
            rf += (self.kernel_size - 1) * dilation
        return rf