Spaces:

alibabasglab
/

ClearVoice

Running on Zero

File size: 36,565 Bytes

8e8cd3e

import math
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from packaging.version import parse as V
from torch.nn import init
from torch.nn.parameter import Parameter

from models.mossformer_gan_se.fsmn import UniDeepFsmn
from models.mossformer_gan_se.conv_module import ConvModule
from models.mossformer_gan_se.mossformer import MossFormer
from models.mossformer_gan_se.se_layer import SELayer
from models.mossformer_gan_se.get_layer_from_string import get_layer
from models.mossformer_gan_se.discriminator import Discriminator

# Module-level flag: True when the installed PyTorch version, parsed with
# packaging.version.parse, is 1.9.0 or newer.
# NOTE(review): no use of this flag is visible in this part of the file.
is_torch_1_9_plus = V(torch.__version__) >= V("1.9.0")


class MossFormerGAN_SE_16K(nn.Module):
    """
    MossFormerGAN_SE_16K: GAN-based speech enhancement model for 16kHz audio.

    The generator is a SyncANet built with 64 channels and
    ``args.fft_len // 2 + 1`` features. A Discriminator is attached only when
    ``args.mode == 'train'``; in every other mode the ``discriminator``
    attribute is set to None.

    Args:
        args (Namespace): Configuration object. The attributes read here are
                          'fft_len' and 'mode'.
    """

    def __init__(self, args):
        """Builds the generator and, for training only, the discriminator."""
        super().__init__()

        # Generator network; its feature count is derived from the FFT length
        feature_count = args.fft_len // 2 + 1
        self.model = SyncANet(num_channel=64, num_features=feature_count)

        # The discriminator is only instantiated for adversarial training
        in_training_mode = args.mode == 'train'
        self.discriminator = Discriminator(ndf=16) if in_training_mode else None

    def forward(self, x):
        """
        Runs the generator on the input.

        Args:
            x (torch.Tensor): 4-D input whose channel 0 and channel 1 are read
                              by SyncANet as the real and imaginary parts.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: The two generator outputs
            (real part, imaginary part), repacked as a tuple.
        """
        # The generator yields exactly two tensors; unpacking enforces that
        real_part, imag_part = self.model(x)
        return real_part, imag_part


class FSMN_Wrap(nn.Module):
    """
    FSMN_Wrap: Adapter that lets a UniDeepFsmn module run on 4-D tensors.

    The 4-D input is rearranged into a batch of 3-D sequences, passed through
    the FSMN, and rearranged back to the original layout.

    Args:
        nIn (int): Number of input features (the size of dimension 1 of the input).
        nHidden (int): Passed to UniDeepFsmn as both its second and its fourth
                       argument (default is 128).
        lorder (int): Order of the FSMN (default is 20).
        nOut (int): Accepted for interface compatibility but not used by this
                    wrapper (default is 128).
    """

    def __init__(self, nIn, nHidden=128, lorder=20, nOut=128):
        """Creates the wrapped UniDeepFsmn module."""
        super().__init__()

        # NOTE(review): nOut is ignored; nHidden is used in its place.
        self.fsmn = UniDeepFsmn(nIn, nHidden, lorder, nHidden)

    def forward(self, x):
        """
        Applies the FSMN along dimension 3 of a 4-D tensor.

        Args:
            x (torch.Tensor): Input tensor of shape [b, c, d2, d3].

        Returns:
            torch.Tensor: Tensor of shape [b, c, d2, d3]. The FSMN output must
            contain the same number of elements as its input for the reshape
            back to succeed.
        """
        batch, chans, dim2, dim3 = x.size()

        # [b, c, d2, d3] -> [b, d2, d3, c] -> [b*d2, d3, c]: one sequence per (b, d2)
        sequences = x.permute(0, 2, 3, 1).reshape(batch * dim2, dim3, chans)

        # Run the FSMN over each sequence
        processed = self.fsmn(sequences)

        # [b*d2, d3, c] -> [b, d2, d3, c] -> [b, c, d2, d3]
        return processed.reshape(batch, dim2, dim3, chans).permute(0, 3, 1, 2)

class DilatedDenseNet(nn.Module):
    """
    DilatedDenseNet: A densely connected stack of dilated 2-D convolutions.

    Layer i (1-based) receives the concatenation of the block input and the
    outputs of all previous layers, and applies: zero padding, a dilated
    convolution, instance normalization, PReLU and an FSMN_Wrap module.
    The sub-modules are registered as ``pad{i}``, ``conv{i}``, ``norm{i}``,
    ``prelu{i}`` and ``fsmn{i}``.

    Args:
        depth (int): The number of layers in the dense block (default is 4).
        in_channels (int): Channel count of the block input and of every
                           layer output (default is 64).
    """

    def __init__(self, depth=4, in_channels=64):
        """Creates the per-layer sub-modules for the requested depth."""
        super().__init__()
        self.depth = depth
        self.in_channels = in_channels
        # Registered for compatibility; forward() only uses the per-layer pads
        self.pad = nn.ConstantPad2d((1, 1, 1, 0), value=0.)
        self.twidth = 2  # Kernel extent along dimension 2
        self.kernel_size = (self.twidth, 3)

        for layer in range(1, self.depth + 1):
            dil = 2 ** (layer - 1)  # Dilation doubles with every layer
            # Padding applied only at the start of dimension 2; it equals the
            # span of the dilated kernel minus one, so that size is preserved
            pad_length = self.twidth + (dil - 1) * (self.twidth - 1) - 1
            self.add_module(
                f'pad{layer}',
                nn.ConstantPad2d((1, 1, pad_length, 0), value=0.),
            )
            # Input grows by in_channels per layer because of dense concatenation
            self.add_module(
                f'conv{layer}',
                nn.Conv2d(
                    self.in_channels * layer,
                    self.in_channels,
                    kernel_size=self.kernel_size,
                    dilation=(dil, 1),
                ),
            )
            self.add_module(f'norm{layer}', nn.InstanceNorm2d(in_channels, affine=True))
            self.add_module(f'prelu{layer}', nn.PReLU(self.in_channels))
            self.add_module(
                f'fsmn{layer}',
                FSMN_Wrap(
                    nIn=self.in_channels,
                    nHidden=self.in_channels,
                    lorder=5,
                    nOut=self.in_channels,
                ),
            )

    def forward(self, x):
        """
        Runs the dense block.

        Args:
            x (torch.Tensor): 4-D input tensor with ``in_channels`` channels.

        Returns:
            torch.Tensor: Output of the last layer only (not the concatenation).
        """
        stage_names = ('pad', 'conv', 'norm', 'prelu', 'fsmn')
        dense_input = x  # Grows along the channel axis after every layer
        for layer in range(1, self.depth + 1):
            out = dense_input
            for stage in stage_names:
                out = getattr(self, f'{stage}{layer}')(out)
            # Newest features are placed in front of the accumulated ones
            dense_input = torch.cat([out, dense_input], dim=1)
        return out


class DenseEncoder(nn.Module):
    """
    DenseEncoder: Encoder made of a 1x1 convolution stage, a DilatedDenseNet
    and a strided convolution stage.

    Args:
        in_channel (int): Number of input channels for the encoder.
        channels (int): Number of channels produced by every stage (default is 64).
    """

    def __init__(self, in_channel, channels=64):
        """Creates the three encoder stages."""
        super().__init__()

        # Stage 1: 1x1 convolution lifting the input to `channels` channels
        entry_layers = [
            nn.Conv2d(in_channel, channels, (1, 1), (1, 1)),
            nn.InstanceNorm2d(channels, affine=True),
            nn.PReLU(channels),
        ]
        self.conv_1 = nn.Sequential(*entry_layers)

        # Stage 2: dilated dense block with four layers
        self.dilated_dense = DilatedDenseNet(depth=4, in_channels=channels)

        # Stage 3: (1, 3) convolution with stride 2 on the last dimension,
        # which reduces that dimension to roughly half its size
        exit_layers = [
            nn.Conv2d(channels, channels, (1, 3), (1, 2), padding=(0, 1)),
            nn.InstanceNorm2d(channels, affine=True),
            nn.PReLU(channels),
        ]
        self.conv_2 = nn.Sequential(*exit_layers)

    def forward(self, x):
        """
        Encodes the input.

        Args:
            x (torch.Tensor): 4-D input tensor with `in_channel` channels.

        Returns:
            torch.Tensor: Encoded features after conv_1, dilated_dense and conv_2.
        """
        lifted = self.conv_1(x)
        dense_features = self.dilated_dense(lifted)
        return self.conv_2(dense_features)


class SPConvTranspose2d(nn.Module):
    """
    SPConvTranspose2d: Upsampling along the last dimension via a regular
    convolution followed by a channel-to-width rearrangement.

    The convolution produces ``out_channels * r`` channels. These are split
    into ``r`` groups which are interleaved along the last dimension, so the
    last dimension of the output is ``r`` times that of the convolution output.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (tuple): Size of the convolution kernel.
        r (int): Upsampling rate (default is 1).
    """

    def __init__(self, in_channels, out_channels, kernel_size, r=1):
        """Creates the padding and convolution layers."""
        super().__init__()
        # One zero column on each side of the last dimension
        self.pad1 = nn.ConstantPad2d((1, 1, 0, 0), value=0.)
        self.out_channels = out_channels
        # r times more channels than requested; they are folded into width later
        self.conv = nn.Conv2d(in_channels, out_channels * r, kernel_size=kernel_size, stride=(1, 1))
        self.r = r

    def forward(self, x):
        """
        Upsamples the input along its last dimension.

        Args:
            x (torch.Tensor): Input tensor of shape [batch_size, in_channels, H, W].

        Returns:
            torch.Tensor: Tensor of shape [batch_size, out_channels, H', W' * r],
            where H' and W' are the spatial sizes of the convolution output.
        """
        conv_out = self.conv(self.pad1(x))
        batch_size, total_channels, height, width = conv_out.shape
        group_channels = total_channels // self.r

        # Channel index k is split as k = group * group_channels + channel
        grouped = conv_out.view((batch_size, self.r, group_channels, height, width))

        # Move the group axis behind the width axis so it interleaves with it
        interleaved = grouped.permute(0, 2, 3, 4, 1).contiguous()
        return interleaved.view((batch_size, group_channels, height, -1))

class MaskDecoder(nn.Module):
    """
    MaskDecoder: Decoder that turns encoded features into a single-channel mask.

    Features pass through a dilated dense block, a sub-pixel upsampling layer
    (rate 2 on the last dimension), a (1, 2) convolution, instance
    normalization, PReLU, a 1x1 convolution and a final PReLU that has one
    learnable slope per feature along the last dimension.

    Args:
        num_features (int): Size of the last dimension of the output mask;
                            it is the parameter count of the final PReLU.
        num_channel (int): The number of channels in intermediate layers (default is 64).
        out_channel (int): The number of output channels (default is 1). The
                           forward pass squeezes this axis, so it is expected to be 1.
    """

    def __init__(self, num_features, num_channel=64, out_channel=1):
        """Creates the decoder layers."""
        super().__init__()
        self.dense_block = DilatedDenseNet(depth=4, in_channels=num_channel)
        self.sub_pixel = SPConvTranspose2d(num_channel, num_channel, (1, 3), 2)
        self.conv_1 = nn.Conv2d(num_channel, out_channel, (1, 2))
        self.norm = nn.InstanceNorm2d(out_channel, affine=True)
        self.prelu = nn.PReLU(out_channel)
        self.final_conv = nn.Conv2d(out_channel, out_channel, (1, 1))
        # Per-feature PReLU whose negative slope starts at -0.25
        self.prelu_out = nn.PReLU(num_features, init=-0.25)

    def forward(self, x):
        """
        Estimates the mask.

        Args:
            x (torch.Tensor): 4-D encoded features with `num_channel` channels.

        Returns:
            torch.Tensor: Mask of shape [batch, 1, d2, num_features], where d2
            is the size of dimension 2 of the input.
        """
        feats = self.dense_block(x)
        feats = self.sub_pixel(feats)
        feats = self.conv_1(feats)
        feats = self.norm(feats)
        feats = self.prelu(feats)
        mask = self.final_conv(feats)

        # [b, 1, d2, f] -> [b, f, d2, 1] -> [b, f, d2]: features become the
        # channel axis so the per-feature PReLU acts on them
        mask = mask.permute(0, 3, 2, 1).squeeze(-1)
        mask = self.prelu_out(mask)

        # [b, f, d2] -> [b, d2, f] -> [b, 1, d2, f]
        return mask.permute(0, 2, 1).unsqueeze(1)


class ComplexDecoder(nn.Module):
    """
    ComplexDecoder: Decoder that produces a two-channel output from encoded
    features.

    Features pass through a dilated dense block, a sub-pixel upsampling layer
    (rate 2 on the last dimension), instance normalization, PReLU and a
    (1, 2) convolution with two output channels. SyncANet uses channel 0 as
    the real part and channel 1 as the imaginary part.

    Args:
        num_channel (int): The number of channels in intermediate layers (default is 64).
    """

    def __init__(self, num_channel=64):
        """Creates the decoder layers."""
        super().__init__()
        self.dense_block = DilatedDenseNet(depth=4, in_channels=num_channel)
        self.sub_pixel = SPConvTranspose2d(num_channel, num_channel, (1, 3), 2)
        self.prelu = nn.PReLU(num_channel)
        self.norm = nn.InstanceNorm2d(num_channel, affine=True)
        # Two output channels: one per component of the complex output
        self.conv = nn.Conv2d(num_channel, 2, (1, 2))

    def forward(self, x):
        """
        Decodes the features.

        Args:
            x (torch.Tensor): 4-D encoded features with `num_channel` channels.

        Returns:
            torch.Tensor: 4-D tensor with two channels.
        """
        upsampled = self.sub_pixel(self.dense_block(x))
        # Normalization is applied before the activation
        activated = self.prelu(self.norm(upsampled))
        return self.conv(activated)


class SyncANet(nn.Module):
    """
    SyncANet: Generator network combining a dense encoder, a stack of
    SyncANetBlock layers and two decoders.

    The mask decoder scales the input magnitude, which is recombined with the
    input phase; the complex decoder output is then added to the real and
    imaginary parts of that result.

    Args:
        num_channel (int): The number of channels in the network (default is 64).
        num_features (int): The number of features for the mask decoder (default is 201).
    """

    def __init__(self, num_channel=64, num_features=201):
        """Creates the encoder, the attention blocks and both decoders."""
        super().__init__()
        # The encoder receives three channels: magnitude, real and imaginary
        self.dense_encoder = DenseEncoder(in_channel=3, channels=num_channel)
        self.n_layers = 6  # Number of SyncANetBlock layers

        # All blocks share the same configuration
        block_config = dict(
            emb_dim=num_channel,
            emb_ks=2,
            emb_hs=1,
            n_freqs=int(num_features//2)+1,
            hidden_channels=num_channel*2,
            n_head=4,
            approx_qk_dim=512,
            activation='prelu',
            eps=1.0e-5,
        )
        self.blocks = nn.ModuleList(
            [SyncANetBlock(**block_config) for _ in range(self.n_layers)]
        )

        self.mask_decoder = MaskDecoder(num_features, num_channel=num_channel, out_channel=1)
        self.complex_decoder = ComplexDecoder(num_channel=num_channel)

    def forward(self, x):
        """
        Enhances the input.

        Args:
            x (torch.Tensor): 4-D input whose channel 0 holds the real part
                              and channel 1 the imaginary part.

        Returns:
            list: Two tensors, [real part, imaginary part], each with a
            singleton channel axis.
        """
        real_in = x[:, 0, :, :]
        imag_in = x[:, 1, :, :]

        # Polar decomposition of the input, each with a channel axis of size 1
        mag = torch.sqrt(real_in**2 + imag_in**2).unsqueeze(1)
        noisy_phase = torch.angle(torch.complex(real_in, imag_in)).unsqueeze(1)

        # Encode magnitude together with the original channels
        feats = self.dense_encoder(torch.cat([mag, x], dim=1))
        for idx in range(self.n_layers):
            feats = self.blocks[idx](feats)

        # Masked magnitude, recombined with the unmodified input phase
        out_mag = self.mask_decoder(feats) * mag
        complex_out = self.complex_decoder(feats)
        mag_real = out_mag * torch.cos(noisy_phase)
        mag_imag = out_mag * torch.sin(noisy_phase)

        # Add the complex decoder output to each component
        final_real = mag_real + complex_out[:, 0, :, :].unsqueeze(1)
        final_imag = mag_imag + complex_out[:, 1, :, :].unsqueeze(1)
        return [final_real, final_imag]

class FFConvM(nn.Module):
    """
    FFConvM: Feed-forward module chaining normalization, a linear layer, SiLU
    activation, a ConvModule and dropout, in that order.

    Args:
        dim_in (int): Size of the input feature dimension.
        dim_out (int): Size of the feature dimension produced by the linear layer.
        norm_klass (nn.Module): Normalization class, instantiated with `dim_in`
                                (default is nn.LayerNorm).
        dropout (float): The dropout probability (default is 0.1).
    """

    def __init__(
        self,
        dim_in,
        dim_out,
        norm_klass=nn.LayerNorm,
        dropout=0.1
    ):
        """Creates the sequential pipeline."""
        super().__init__()

        # Order matters: the indices inside the Sequential name the parameters
        pipeline = [
            norm_klass(dim_in),
            nn.Linear(dim_in, dim_out),
            nn.SiLU(),
            ConvModule(dim_out),
            nn.Dropout(dropout),
        ]
        self.mdl = nn.Sequential(*pipeline)

    def forward(self, x):
        """
        Applies the pipeline.

        Args:
            x (torch.Tensor): Input tensor whose feature dimension has size
                              `dim_in` (the last dimension for the default
                              nn.LayerNorm / nn.Linear layers).

        Returns:
            torch.Tensor: Output of the sequential pipeline.
        """
        return self.mdl(x)

class SyncANetBlock(nn.Module):
    """
    SyncANetBlock implements a modified version of the MossFormer (GatedFormer) module,
    inspired by the TF-GridNet architecture (https://arxiv.org/abs/2211.12433).

    One forward pass applies three residual stages to a [B, C, T, Q] tensor:

      1. Intra stage: sequences along the last (Q) axis, one per (batch, T) pair,
         are processed by gated FFConvM/FSMN branches, a transposed convolution,
         a MossFormer and an SE layer.
      2. Inter stage: the same pipeline over sequences along the T axis, one per
         (batch, Q) pair.
      3. Attention stage: multi-head scaled dot-product attention across the T
         axis, with per-head 1x1 convolutions producing queries, keys and values.

    Attributes:
        emb_dim (int): Dimensionality of the embedding (channel count C).
        emb_ks (int): Kernel size for embeddings.
        emb_hs (int): Stride size for embeddings.
        n_head (int): Number of attention heads.

    The remaining constructor arguments (n_freqs, hidden_channels, approx_qk_dim,
    activation, eps) are consumed while building sub-modules and are not stored.
    """

    def __getitem__(self, key):
        """
        Allows accessing module attributes using indexing.

        Used by forward() to fetch the per-head attention sub-modules by their
        registered names.

        Args:
            key: Attribute name to retrieve.

        Returns:
            The requested attribute.
        """
        return getattr(self, key)

    def __init__(
        self,
        emb_dim,
        emb_ks,
        emb_hs,
        n_freqs,
        hidden_channels,
        n_head=4,
        approx_qk_dim=512,
        activation="prelu",
        eps=1e-5,
    ):
        """
        Initializes the SyncANetBlock with the specified parameters.

        Args:
            emb_dim (int): Dimensionality of the embedding.
            emb_ks (int): Kernel size for embeddings.
            emb_hs (int): Stride size for embeddings.
            n_freqs (int): Number of frequency bands; used as the MossFormer
                group size of the intra stage and in the attention normalization
                shapes.
            hidden_channels (int): Number of hidden channels.
            n_head (int): Number of attention heads. Default is 4.
            approx_qk_dim (int): Approximate dimension for query-key matrices. Default is 512.
            activation (str): Name of the activation, resolved through get_layer. Default is "prelu".
            eps (float): Small value passed to the normalization layers. Default is 1e-5.

        Raises:
            AssertionError: If emb_dim is not divisible by n_head.
        """
        super().__init__()

        in_channels = emb_dim * emb_ks  # Feature size after gathering emb_ks neighbours per channel

        ## Intra modules: operate on sequences along the last (Q) axis
        # Grouped convolution (one group per channel) with a (1, emb_ks) kernel
        self.Fconv = nn.Conv2d(emb_dim, in_channels, kernel_size=(1, emb_ks), stride=(1, 1), groups=emb_dim)
        self.intra_norm = LayerNormalization4D(emb_dim, eps=eps)  # Layer normalization
        # Two parallel branches (u and v) that are later multiplied element-wise
        self.intra_to_u = FFConvM(
            dim_in=in_channels,
            dim_out=hidden_channels,
            norm_klass=nn.LayerNorm,
            dropout=0.1,
        )
        self.intra_to_v = FFConvM(
            dim_in=in_channels,
            dim_out=hidden_channels,
            norm_klass=nn.LayerNorm,
            dropout=0.1,
        )
        # NOTE(review): the FSMN is built with in_channels as its first argument
        # but is applied to the hidden_channels-wide output of intra_to_u; this
        # looks like it relies on emb_dim * emb_ks == hidden_channels - confirm.
        self.intra_rnn = self._build_repeats(in_channels, hidden_channels, 20, hidden_channels, repeats=1)  # FSMN layers
        self.intra_mossformer = MossFormer(dim=emb_dim, group_size=n_freqs)  # MossFormer module

        # Transposed 1-D convolution mapping hidden_channels back to emb_dim
        self.intra_linear = nn.ConvTranspose1d(
            hidden_channels, emb_dim, emb_ks, stride=emb_hs
        )
        self.intra_se = SELayer(channel=emb_dim, reduction=1)  # Squeeze-and-excitation layer

        ## Inter modules: operate on sequences along the T axis
        self.inter_norm = LayerNormalization4D(emb_dim, eps=eps)  # Layer normalization
        self.inter_to_u = FFConvM(
            dim_in=in_channels,
            dim_out=hidden_channels,
            norm_klass=nn.LayerNorm,
            dropout=0.1,
        )
        self.inter_to_v = FFConvM(
            dim_in=in_channels,
            dim_out=hidden_channels,
            norm_klass=nn.LayerNorm,
            dropout=0.1,
        )
        # NOTE(review): same in_channels / hidden_channels remark as for intra_rnn
        self.inter_rnn = self._build_repeats(in_channels, hidden_channels, 20, hidden_channels, repeats=1)  # FSMN layers
        self.inter_mossformer = MossFormer(dim=emb_dim, group_size=256)  # MossFormer module with a fixed group size

        # Transposed 1-D convolution mapping hidden_channels back to emb_dim
        self.inter_linear = nn.ConvTranspose1d(
            hidden_channels, emb_dim, emb_ks, stride=emb_hs
        )
        self.inter_se = SELayer(channel=emb_dim, reduction=1)  # Squeeze-and-excitation layer

        # Per-head query/key channel count, chosen so that E * n_freqs >= approx_qk_dim
        E = math.ceil(approx_qk_dim * 1.0 / n_freqs)
        assert emb_dim % n_head == 0  # Ensure emb_dim is divisible by n_head

        # Register the query, key and value projections of every head under
        # the names attn_conv_{Q,K,V}_{head index}
        for ii in range(n_head):
            self.add_module(
                f"attn_conv_Q_{ii}",
                nn.Sequential(
                    nn.Conv2d(emb_dim, E, 1),
                    get_layer(activation)(),
                    LayerNormalization4DCF((E, n_freqs), eps=eps),
                ),
            )
            self.add_module(
                f"attn_conv_K_{ii}",
                nn.Sequential(
                    nn.Conv2d(emb_dim, E, 1),
                    get_layer(activation)(),
                    LayerNormalization4DCF((E, n_freqs), eps=eps),
                ),
            )
            self.add_module(
                f"attn_conv_V_{ii}",
                nn.Sequential(
                    nn.Conv2d(emb_dim, emb_dim // n_head, 1),
                    get_layer(activation)(),
                    LayerNormalization4DCF((emb_dim // n_head, n_freqs), eps=eps),
                ),
            )

        # Projection applied after the heads are concatenated along the channel axis
        self.add_module(
            "attn_concat_proj",
            nn.Sequential(
                nn.Conv2d(emb_dim, emb_dim, 1),
                get_layer(activation)(),
                LayerNormalization4DCF((emb_dim, n_freqs), eps=eps),
            ),
        )

        # Values read again in forward()
        self.emb_dim = emb_dim
        self.emb_ks = emb_ks
        self.emb_hs = emb_hs
        self.n_head = n_head

    def _build_repeats(self, in_channels, out_channels, lorder, hidden_size, repeats=1):
        """
        Constructs a sequence of UniDeepFSMN modules.

        Every module in the sequence is created with the same four arguments.

        Args:
            in_channels (int): First argument passed to UniDeepFsmn.
            out_channels (int): Second argument passed to UniDeepFsmn.
            lorder (int): Order of the filter.
            hidden_size (int): Hidden size for the FSMN.
            repeats (int): Number of times to repeat the module. Default is 1.

        Returns:
            nn.Sequential: A sequence of UniDeepFSMN modules.
        """
        # The parameter name is reused for the list of constructed modules
        repeats = [
            UniDeepFsmn(in_channels, out_channels, lorder, hidden_size)
            for _ in range(repeats)
        ]
        return nn.Sequential(*repeats)

    def forward(self, x):
        """Performs a forward pass through the SyncANetBlock.

        Shape comments below are derived from the visible tensor operations and
        assume that the project-local sub-modules (FFConvM, UniDeepFsmn,
        MossFormer, SELayer, LayerNormalization4DCF) preserve the shapes they
        are given, except where a layer explicitly changes the feature size.

        Args:
            x (torch.Tensor): Input tensor of shape [B, C, T, Q] where
                              B is batch size, C is number of channels,
                              T is temporal dimension, and Q is frequency dimension.

        Returns:
            torch.Tensor: Output tensor of shape [B, C, T, Q] (the padding added
            internally is trimmed before the attention stage).
        """
        B, C, old_T, old_Q = x.shape

        # Round each size up so that (size - emb_ks) is a multiple of emb_hs
        T = math.ceil((old_T - self.emb_ks) / self.emb_hs) * self.emb_hs + self.emb_ks
        Q = math.ceil((old_Q - self.emb_ks) / self.emb_hs) * self.emb_hs + self.emb_ks

        # Zero-pad at the end of the last two axes: x becomes [B, C, T, Q]
        x = F.pad(x, (0, Q - old_Q, 0, T - old_T))

        # Intra-process
        input_ = x
        intra_rnn = self.intra_norm(input_)  # Normalize input for intra-process
        intra_rnn = self.Fconv(intra_rnn)    # Grouped conv: [B, C*emb_ks, T, Q-emb_ks+1]
        intra_rnn = (
            intra_rnn.transpose(1, 2).contiguous().view(B * T, C * self.emb_ks, -1)
        )  # [B*T, C*emb_ks, Q-emb_ks+1]: one sequence per (batch, T) pair

        intra_rnn = intra_rnn.transpose(1, 2)  # [B*T, Q-emb_ks+1, C*emb_ks]: features last
        intra_rnn_u = self.intra_to_u(intra_rnn)  # u branch
        intra_rnn_v = self.intra_to_v(intra_rnn)  # v branch
        intra_rnn_u = self.intra_rnn(intra_rnn_u)  # Apply FSMN to the u branch only
        intra_rnn = intra_rnn_v * intra_rnn_u  # Gating: element-wise product of both branches
        intra_rnn = intra_rnn.transpose(1, 2)  # Channels first for the 1-D transposed conv
        intra_rnn = self.intra_linear(intra_rnn)  # Back to C channels
        intra_rnn = intra_rnn.transpose(1, 2)  # Features last again
        # NOTE(review): Fconv has stride 1, so the sequence length only returns
        # to Q here when emb_hs == 1 - confirm before using another stride.
        intra_rnn = intra_rnn.view([B, T, Q, C])  # Reshape for mossformer
        intra_rnn = self.intra_mossformer(intra_rnn)  # Apply MossFormer
        # NOTE(review): the view after this transpose is only valid for certain
        # MossFormer output layouts - confirm against MossFormer.forward.
        intra_rnn = intra_rnn.transpose(1, 2)  # Reshape back
        intra_rnn = intra_rnn.view([B, T, C, Q])  # Reshape back
        intra_rnn = intra_rnn.transpose(1, 2).contiguous()  # [B, C, T, Q]
        intra_rnn = self.intra_se(intra_rnn)  # Squeeze-and-excitation layer
        intra_rnn = intra_rnn + input_  # Residual connection

        # Inter-process
        input_ = intra_rnn
        inter_rnn = self.inter_norm(input_)  # Normalize input for inter-process
        inter_rnn = (
            inter_rnn.permute(0, 3, 1, 2).contiguous().view(B * Q, C, T)
        )  # [B*Q, C, T]: one sequence per (batch, Q) pair
        inter_rnn = F.unfold(
            inter_rnn[..., None], (self.emb_ks, 1), stride=(self.emb_hs, 1)
        )  # Sliding windows of emb_ks steps along T: [B*Q, C*emb_ks, n_windows]
        inter_rnn = inter_rnn.transpose(1, 2)  # [B*Q, n_windows, C*emb_ks]: features last
        inter_rnn_u = self.inter_to_u(inter_rnn)  # u branch
        inter_rnn_v = self.inter_to_v(inter_rnn)  # v branch
        inter_rnn_u = self.inter_rnn(inter_rnn_u)  # Apply FSMN to the u branch only
        inter_rnn = inter_rnn_v * inter_rnn_u  # Gating: element-wise product of both branches
        inter_rnn = inter_rnn.transpose(1, 2)  # Channels first for the 1-D transposed conv
        inter_rnn = self.inter_linear(inter_rnn)  # Back to C channels and length T
        inter_rnn = inter_rnn.transpose(1, 2)  # Features last again
        inter_rnn = inter_rnn.view([B, Q, T, C])  # Reshape for mossformer
        inter_rnn = self.inter_mossformer(inter_rnn)  # Apply MossFormer
        # NOTE(review): same layout remark as for the intra MossFormer output
        inter_rnn = inter_rnn.transpose(1, 2)  # Reshape back
        inter_rnn = inter_rnn.view([B, Q, C, T])  # Final reshape
        inter_rnn = inter_rnn.permute(0, 2, 3, 1).contiguous()  # [B, C, T, Q] for SE layer
        inter_rnn = self.inter_se(inter_rnn)  # Squeeze-and-excitation layer
        inter_rnn = inter_rnn + input_  # Residual connection

        # Attention mechanism
        inter_rnn = inter_rnn[..., :old_T, :old_Q]  # Drop the padding: [B, C, old_T, old_Q]

        batch = inter_rnn
        all_Q, all_K, all_V = [], [], []

        # Compute query, key, and value for each attention head
        for ii in range(self.n_head):
            all_Q.append(self["attn_conv_Q_%d" % ii](batch))  # Query
            all_K.append(self["attn_conv_K_%d" % ii](batch))  # Key
            all_V.append(self["attn_conv_V_%d" % ii](batch))  # Value

        # Heads are stacked along the batch axis (head-major); note that the
        # name Q is reused here for the queries, no longer the padded size
        Q = torch.cat(all_Q, dim=0)  # Concatenate all queries
        K = torch.cat(all_K, dim=0)  # Concatenate all keys
        V = torch.cat(all_V, dim=0)  # Concatenate all values

        # Move the T axis to position 1 and merge channel and frequency axes
        # into one feature vector per time step
        Q = Q.transpose(1, 2)
        Q = Q.flatten(start_dim=2)  # [n_head*B, old_T, features]
        K = K.transpose(1, 2)
        K = K.flatten(start_dim=2)  # [n_head*B, old_T, features]
        V = V.transpose(1, 2)  # [n_head*B, old_T, channels, old_Q]
        old_shape = V.shape  # Remembered to undo the flatten after attention
        V = V.flatten(start_dim=2)  # [n_head*B, old_T, features]
        emb_dim = Q.shape[-1]  # Query feature size, used as the attention scale

        # Compute scaled dot-product attention
        attn_mat = torch.matmul(Q, K.transpose(1, 2)) / (emb_dim**0.5)  # [n_head*B, old_T, old_T]
        attn_mat = F.softmax(attn_mat, dim=2)  # Softmax over the key positions
        V = torch.matmul(attn_mat, V)  # Weighted sum of values

        V = V.reshape(old_shape)  # Undo the flatten
        V = V.transpose(1, 2)  # [n_head*B, channels per head, old_T, old_Q]
        emb_dim = V.shape[1]  # Now the per-head channel count

        batch = V.view([self.n_head, B, emb_dim, old_T, -1])  # Split heads off the batch axis
        batch = batch.transpose(0, 1)  # [B, n_head, channels per head, old_T, old_Q]
        batch = batch.contiguous().view(
            [B, self.n_head * emb_dim, old_T, -1]
        )  # Concatenate the heads along the channel axis
        batch = self["attn_concat_proj"](batch)  # Final 1x1 conv projection

        # Residual connection around the attention stage
        out = batch + inter_rnn
        return out  # Return the output tensor

class LayerNormalization4D(nn.Module):
    """
    Channel-wise layer normalization for 4D tensors.

    For an input laid out as [B, C, T, F] (B = batch, C = channels,
    T = time, F = frequency), every (batch, time, frequency) position is
    normalized with the mean and biased variance taken across dimension 1,
    then scaled and shifted by per-channel learnable parameters.

    Attributes:
        gamma (torch.Parameter): Learnable scale of shape [1, C, 1, 1], initialized to 1.
        beta (torch.Parameter): Learnable shift of shape [1, C, 1, 1], initialized to 0.
        eps (float): Constant added to the variance before taking the square root.
    """

    def __init__(self, input_dimension, eps=1e-5):
        """
        Builds the per-channel scale and shift parameters.

        Args:
            input_dimension (int): Size of dimension 1 (channels) of the expected input.
            eps (float, optional): Stabilizer added to the variance. Defaults to 1e-5.
        """
        super().__init__()
        shape = (1, input_dimension, 1, 1)  # Broadcasts over batch, time and frequency

        # Allocate float32 storage, then fill it: identity transform at start
        scale = torch.Tensor(*shape).to(torch.float32)
        shift = torch.Tensor(*shape).to(torch.float32)
        self.gamma = Parameter(init.ones_(scale))   # Scale starts at 1
        self.beta = Parameter(init.zeros_(shift))   # Shift starts at 0
        self.eps = eps

    def forward(self, x):
        """
        Normalizes `x` across dimension 1 and applies the learned affine transform.

        Args:
            x (torch.Tensor): Input tensor with exactly 4 dimensions, e.g. [B, C, T, F].

        Returns:
            torch.Tensor: Tensor of the same shape as `x`.

        Raises:
            ValueError: If `x` does not have exactly 4 dimensions.
        """
        # Reject anything that is not 4D before touching the data
        if x.ndim != 4:
            raise ValueError("Expect x to have 4 dimensions, but got {}".format(x.ndim))

        reduce_axes = (1,)  # Statistics are taken over dimension 1 only

        mean = x.mean(dim=reduce_axes, keepdim=True)  # [B, 1, T, F]
        variance = x.var(dim=reduce_axes, unbiased=False, keepdim=True)  # [B, 1, T, F]
        denom = torch.sqrt(variance + self.eps)  # [B, 1, T, F]

        # Standardize, then scale and shift with the learnable parameters
        normalized = (x - mean) / denom  # [B, C, T, F]
        return normalized * self.gamma + self.beta

class LayerNormalization4DCF(nn.Module):
    """
    Layer normalization for 4D tensors with statistics taken jointly over
    dimensions 1 and 3 (channel and frequency for a [B, C, T, F] layout).

    Each (batch, time) slice is normalized with a single mean and biased
    variance, then scaled and shifted by learnable parameters that hold one
    value per (channel, frequency) pair.

    Attributes:
        gamma (torch.Parameter): Learnable scale of shape [1, C, 1, F], initialized to 1.
        beta (torch.Parameter): Learnable shift of shape [1, C, 1, F], initialized to 0.
        eps (float): Constant added to the variance before taking the square root.
    """

    def __init__(self, input_dimension, eps=1e-5):
        """
        Builds the per-(channel, frequency) scale and shift parameters.

        Args:
            input_dimension (tuple): Two sizes, (dimension 1, dimension 3) of the
                                     expected input, i.e. (channels, frequency bins).
            eps (float, optional): Stabilizer added to the variance. Defaults to 1e-5.
        """
        super().__init__()
        assert len(input_dimension) == 2, "Input dimension must be a tuple of length 2."
        shape = (1, input_dimension[0], 1, input_dimension[1])  # Broadcasts over batch and time

        # Allocate float32 storage, then fill it: identity transform at start
        scale = torch.Tensor(*shape).to(torch.float32)
        shift = torch.Tensor(*shape).to(torch.float32)
        self.gamma = Parameter(init.ones_(scale))   # Scale starts at 1
        self.beta = Parameter(init.zeros_(shift))   # Shift starts at 0
        self.eps = eps

    def forward(self, x):
        """
        Normalizes `x` across dimensions 1 and 3 and applies the learned affine transform.

        Args:
            x (torch.Tensor): Input tensor with exactly 4 dimensions, e.g. [B, C, T, F].

        Returns:
            torch.Tensor: Tensor of the same shape as `x`.

        Raises:
            ValueError: If `x` does not have exactly 4 dimensions.
        """
        # Reject anything that is not 4D before touching the data
        if x.ndim != 4:
            raise ValueError("Expect x to have 4 dimensions, but got {}".format(x.ndim))

        reduce_axes = (1, 3)  # Statistics are pooled over dimensions 1 and 3

        mean = x.mean(dim=reduce_axes, keepdim=True)  # [B, 1, T, 1]
        variance = x.var(dim=reduce_axes, unbiased=False, keepdim=True)  # [B, 1, T, 1]
        denom = torch.sqrt(variance + self.eps)  # [B, 1, T, 1]

        # Standardize, then scale and shift with the learnable parameters
        normalized = (x - mean) / denom  # [B, C, T, F]
        return normalized * self.gamma + self.beta