from __future__ import annotations

from typing import Any, Dict, Tuple, Union, Optional

import torch
import yaml
from torch import nn

from .heads import ISTFTHead
from .models import VocosBackbone


class Vocos(nn.Module):
    """
    Fourier-based neural vocoder for audio synthesis.

    As defined here, the model is a two-stage pipeline: a ``VocosBackbone``
    that processes precomputed input features, followed by an ``ISTFTHead``
    that turns the backbone output into an audio waveform.
    """

    def __init__(
        self,
        args,
    ):
        """
        Build the backbone and the head from a configuration object.

        Args:
            args: Configuration object exposing ``vocos.backbone``
                (``input_channels``, ``dim``, ``intermediate_dim``,
                ``num_layers``) and ``vocos.head`` (``dim``, ``n_fft``,
                ``hop_length``, ``padding``) as attributes.
        """
        super().__init__()

        backbone_cfg = args.vocos.backbone
        head_cfg = args.vocos.head

        # Construction order (backbone, then head) is kept as-is because it
        # determines the order in which the submodules are registered.
        self.backbone = VocosBackbone(
            input_channels=backbone_cfg.input_channels,
            dim=backbone_cfg.dim,
            intermediate_dim=backbone_cfg.intermediate_dim,
            num_layers=backbone_cfg.num_layers,
        )
        self.head = ISTFTHead(
            dim=head_cfg.dim,
            n_fft=head_cfg.n_fft,
            hop_length=head_cfg.hop_length,
            padding=head_cfg.padding,
        )

    def forward(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Decode an audio waveform from precomputed features.

        The features are run through the backbone, and the backbone output is
        then passed to the head to reconstruct the audio.

        Args:
            features_input (Tensor): Input features of shape (B, C, L), where
                B is the batch size, C is the feature dimension, and L is the
                sequence length.
            **kwargs: Extra keyword arguments, forwarded unchanged to the
                backbone.

        Returns:
            Tensor: Reconstructed audio waveform of shape (B, T).
        """
        hidden = self.backbone(features_input, **kwargs)
        return self.head(hidden)