import math
from typing import List, Optional, Tuple

import torch
from torch import Tensor
from torch.nn import Module

from . import components


class Wav2Vec2Model(Module):
    """Acoustic model used in *wav2vec 2.0* :cite:`baevski2020wav2vec`.

    Note:
        To build the model, please use one of the factory functions.

    See Also:
        * :class:`torchaudio.pipelines.Wav2Vec2Bundle`: Pretrained models (without fine-tuning)
        * :class:`torchaudio.pipelines.Wav2Vec2ASRBundle`: ASR pipelines with pretrained models.

    Args:
        feature_extractor (torch.nn.Module):
            Feature extractor that extracts feature vectors from a raw audio Tensor.
        encoder (torch.nn.Module):
            Encoder that converts the audio features into a sequence of probability
            distributions (in negative log-likelihood) over labels.
        aux (torch.nn.Module or None, optional):
            Auxiliary module. If provided, the output from the encoder is passed to this module.
    """  # noqa: E501

    def __init__(
        self,
        feature_extractor: Module,
        encoder: Module,
        aux: Optional[Module] = None,
    ):
        super().__init__()
        self.feature_extractor = feature_extractor
        self.encoder = encoder
        self.aux = aux

    def extract_features(
        self,
        waveforms: Tensor,
        lengths: Optional[Tensor] = None,
        num_layers: Optional[int] = None,
    ) -> Tuple[List[Tensor], Optional[Tensor]]:
        """Extract feature vectors from raw waveforms.

        This returns the list of outputs from the intermediate layers of
        the transformer block in the encoder.

        Args:
            waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
            lengths (Tensor or None, optional):
                Indicates the valid length of each audio in the batch.
                Shape: `(batch,)`.
                When ``waveforms`` contains audio with different durations,
                providing ``lengths`` lets the model compute
                the corresponding valid output lengths and apply the proper mask in
                the transformer attention layers.
                If ``None``, it is assumed that the entire audio waveform
                length is valid.
            num_layers (int or None, optional):
                If given, limits the number of intermediate layers to go through.
                Providing `1` stops the computation after going through one
                intermediate layer. If not given, the outputs from all the
                intermediate layers are returned.

        Returns:
            (List[Tensor], Optional[Tensor]):
            List of Tensors
                Features from the requested layers.
                Each Tensor is of shape: `(batch, time frame, feature dimension)`
            Tensor or None
                If the ``lengths`` argument was provided, a Tensor of shape `(batch,)`
                is returned.
                It indicates the valid length along the time axis of each feature Tensor.
        """
        x, lengths = self.feature_extractor(waveforms, lengths)
        x = self.encoder.extract_features(x, lengths, num_layers)
        return x, lengths

    def forward(
        self,
        waveforms: Tensor,
        lengths: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Compute the sequence of probability distributions over labels.

        Args:
            waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
            lengths (Tensor or None, optional):
                Indicates the valid length of each audio in the batch.
                Shape: `(batch,)`.
                When ``waveforms`` contains audio with different durations,
                providing ``lengths`` lets the model compute
                the corresponding valid output lengths and apply the proper mask in
                the transformer attention layers.
                If ``None``, it is assumed that all the audio in ``waveforms``
                is valid over its entire length. Default: ``None``.

        Returns:
            (Tensor, Optional[Tensor]):
            Tensor
                The sequence of probability distributions (in logits) over labels.
                Shape: `(batch, frames, num labels)`.
            Tensor or None
                If the ``lengths`` argument was provided, a Tensor of shape `(batch,)`
                is returned.
                It indicates the valid length along the time axis of the output Tensor.
        """
        x, lengths = self.feature_extractor(waveforms, lengths)
        x = self.encoder(x, lengths)
        if self.aux is not None:
            x = self.aux(x)
        return x, lengths
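

# A minimal usage sketch (illustrative, not part of the library API): build the
# "base" architecture with the ``wav2vec2_base`` factory defined below and run
# a batch of variable-length waveforms through it. ``torch.randn`` stands in
# for real 16 kHz audio; only the shapes are meaningful here.
def _demo_wav2vec2_model_usage():
    model = wav2vec2_base(aux_num_out=32)  # attach a 32-class linear head
    waveforms = torch.randn(2, 16000)  # (batch, frames)
    lengths = torch.tensor([16000, 8000])  # valid samples per batch element
    # Per-layer features: a list with one (batch, time, 768) Tensor per layer.
    features, feat_lengths = model.extract_features(waveforms, lengths)
    # Label logits: (batch, time, 32), plus the valid output lengths.
    logits, out_lengths = model(waveforms, lengths)
    return features, feat_lengths, logits, out_lengths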


class HuBERTPretrainModel(Module):
    """HuBERTPretrainModel()

    HuBERT model used for pretraining in *HuBERT* :cite:`hsu2021hubert`.

    Note:
        To build the model, please use one of the factory functions.

    See Also:
        `HuBERT Pre-training and Fine-tuning Recipes
        <https://github.com/pytorch/audio/tree/main/examples/hubert>`__

    Args:
        wav2vec2 (Wav2Vec2Model):
            Wav2Vec2 encoder that generates the transformer outputs.
        mask_generator (torch.nn.Module):
            Mask generator that generates the mask for masked prediction during training.
        logit_generator (torch.nn.Module):
            Logit generator that predicts the logits of the masked and unmasked inputs.
        feature_grad_mult (float or None):
            The factor by which to scale the gradients of the convolutional feature extraction layers.
            If ``None``, the gradients of the feature extraction layers are not affected.
            The scale factor does not affect the forward pass.
    """

    def __init__(
        self,
        wav2vec2: Wav2Vec2Model,
        mask_generator: Module,
        logit_generator: Module,
        feature_grad_mult: Optional[float],
    ):
        super().__init__()
        self.wav2vec2 = wav2vec2
        self.mask_generator = mask_generator
        self.logit_generator = logit_generator
        if feature_grad_mult is not None and not 0.0 < feature_grad_mult < 1.0:
            raise ValueError(
                f"The value of `feature_grad_mult` must be ``None`` or within (0, 1). Found {feature_grad_mult}"
            )
        self.feature_grad_mult = feature_grad_mult

    def forward(
        self,
        waveforms: Tensor,
        labels: Tensor,
        audio_lengths: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """Compute the sequences of probability distributions over labels.

        Args:
            waveforms (Tensor): Audio tensor of shape `(batch, frames)`.
            labels (Tensor): Labels for pre-training. A Tensor of shape `(batch, frames)`.
            audio_lengths (Tensor or None, optional):
                Indicates the valid length of each audio in the batch.
                Shape: `(batch,)`.
                When ``waveforms`` contains audio with different durations,
                providing ``audio_lengths`` lets the model compute
                the corresponding valid output lengths and apply the proper mask in
                the transformer attention layers.
                If ``None``, it is assumed that all the audio in ``waveforms``
                is valid over its entire length. Default: ``None``.

        Returns:
            (Tensor, Tensor, Tensor):
            Tensor
                The masked sequence of probability distributions (in logits).
                Shape: `(masked_frames, num labels)`.
            Tensor
                The unmasked sequence of probability distributions (in logits).
                Shape: `(unmasked_frames, num labels)`.
            Tensor
                The feature mean value for the additional penalty loss.
                Shape: `(1,)`.
        """
        x, lengths = self.wav2vec2.feature_extractor(waveforms, audio_lengths)
        if self.feature_grad_mult is not None and self.feature_grad_mult < 1.0:
            x = components.GradMultiply.apply(x, self.feature_grad_mult)
        features_pen = x.float().pow(2).mean()
        if lengths is not None:
            padding_mask = components._get_padding_mask(x, lengths)
        else:
            padding_mask = None
        x, attention_mask = self.wav2vec2.encoder._preprocess(x, lengths)
        x, mask = self.mask_generator(x, padding_mask)
        x = self.wav2vec2.encoder.transformer(x, attention_mask=attention_mask)
        if x.shape[1] != labels.shape[1]:
            raise ValueError("The length of label must match that of HuBERT model output")
        if padding_mask is not None:
            mask_m = torch.logical_and(~padding_mask, mask)
            mask_u = torch.logical_and(~padding_mask, ~mask_m)
        else:
            mask_m = mask
            mask_u = ~mask_m
        logit_m, logit_u = self.logit_generator(x, labels, mask_m, mask_u)
        return logit_m, logit_u, features_pen
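

# A hedged sketch of one pre-training forward pass, assuming 16 kHz input and
# the default conv extractor (stride product 320, total kernel span 400). The
# random integer labels stand in for the k-means cluster IDs used as pseudo
# labels in the paper. A real training step would combine cross-entropy terms
# over ``logit_m``/``logit_u`` with ``features_pen``, as in the HuBERT recipe
# linked from the class docstring.
def _demo_hubert_pretrain_forward():
    model = hubert_pretrain_base(num_classes=100)
    waveforms = torch.randn(2, 16000)  # (batch, frames)
    # 16000 samples -> (16000 - 400) // 320 + 1 = 49 frames; the labels must
    # match the encoder frame count exactly, or forward raises.
    labels = torch.randint(0, 100, (2, 49))
    logit_m, logit_u, features_pen = model(waveforms, labels)
    # logit_m: logits over classes for masked frames; logit_u: likewise for
    # unmasked frames; features_pen: scalar penalty on conv feature magnitude.
    return logit_m, logit_u, features_pen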


def wav2vec2_model(
    extractor_mode: str,
    extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
    extractor_conv_bias: bool,
    encoder_embed_dim: int,
    encoder_projection_dropout: float,
    encoder_pos_conv_kernel: int,
    encoder_pos_conv_groups: int,
    encoder_num_layers: int,
    encoder_num_heads: int,
    encoder_attention_dropout: float,
    encoder_ff_interm_features: int,
    encoder_ff_interm_dropout: float,
    encoder_dropout: float,
    encoder_layer_norm_first: bool,
    encoder_layer_drop: float,
    aux_num_out: Optional[int],
) -> Wav2Vec2Model:
"""Builds custom :class:`~torchaudio.models.Wav2Vec2Model`. | |
Note: | |
The "feature extractor" below corresponds to | |
`ConvFeatureExtractionModel <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L736>`__ | |
in the original ``fairseq`` implementation. | |
This is referred as "(convolutional) feature encoder" in the *wav2vec 2.0* | |
:cite:`baevski2020wav2vec` paper. | |
The "encoder" below corresponds to `TransformerEncoder <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L817>`__, | |
and this is referred as "Transformer" in the paper. | |
Args: | |
extractor_mode (str): Operation mode of feature extractor. | |
Valid values are ``"group_norm"`` or ``"layer_norm"``. | |
If ``"group_norm"``, then a single normalization is applied | |
in the first convolution block. Otherwise, all the convolution | |
blocks will have layer normalization. | |
This option corresponds to ``extractor_mode`` from ``fairseq``. | |
extractor_conv_layer_config (list of integer tuples or None): | |
Configuration of convolution layers in feature extractor. | |
List of convolution configuration, | |
i.e. ``[(output_channel, kernel_size, stride), ...]`` | |
If ``None`` is provided, then the following default value is used. | |
.. code-block:: python | |
[ | |
(512, 10, 5), | |
(512, 3, 2), | |
(512, 3, 2), | |
(512, 3, 2), | |
(512, 3, 2), | |
(512, 2, 2), | |
(512, 2, 2), | |
] | |
This option corresponds to ``conv_feature_layers`` from ``fairseq``. | |
extractor_conv_bias (bool): | |
Whether to include bias term to each convolution operation. | |
This option corresponds to ``conv_bias`` from ``fairseq``. | |
encoder_embed_dim (int): | |
The dimension of embedding in encoder. | |
This option corresponds to ``encoder_embed_dim`` from ``fairseq``. | |
encoder_projection_dropout (float): | |
The dropout probability applied after the input feature is projected | |
to ``encoder_embed_dim``. | |
This option corresponds to ``dropout_input`` from ``fairseq``. | |
encoder_pos_conv_kernel (int): | |
The kernel size of convolutional positional embeddings. | |
This option corresponds to ``conv_pos`` from ``fairseq``. | |
encoder_pos_conv_groups (int): | |
The number of groups of convolutional positional embeddings. | |
This option corresponds to ``conv_pos_groups`` from ``fairseq``. | |
encoder_num_layers (int): | |
The number of self attention layers in transformer block. | |
This option corresponds to ``encoder_layers`` from ``fairseq``. | |
encoder_num_heads (int): | |
The number of heads in self attention layers. | |
This option corresponds to ``encoder_attention_heads`` from ``fairseq``. | |
encoder_attention_dropout (float): | |
The dropout probability applied after softmax in self-attention layer. | |
This option corresponds to ``attention_dropout`` from ``fairseq``. | |
encoder_ff_interm_features (int): | |
The dimension of hidden features in feed forward layer. | |
This option corresponds to ``encoder_ffn_embed_dim`` from ``fairseq``. | |
encoder_ff_interm_dropout (float): | |
The dropout probability applied in feedforward layer. | |
This option correspinds to ``activation_dropout`` from ``fairseq``. | |
encoder_dropout (float): | |
The dropout probability applied at the end of feed forward layer. | |
This option corresponds to ``dropout`` from ``fairseq``. | |
encoder_layer_norm_first (bool): | |
Control the order of layer norm in transformer layer and each encoder layer. | |
If True, in transformer layer, layer norm is applied before features are fed | |
to encoder layers. In encoder layer, two layer norms are applied before and after | |
self attention. | |
If False, in transformer layer, layer norm is applied after features are fed | |
to encoder layers. In encoder layer, two layer norms are applied after self | |
attention, before and after feed forward. | |
This option corresponds to ``layer_norm_first`` from ``fairseq``. | |
encoder_layer_drop (float): | |
Probability to drop each encoder layer during training. | |
This option corresponds to ``layerdrop`` from ``fairseq``. | |
aux_num_out (int or None): | |
When provided, attach an extra linear layer on top of encoder, which can be | |
used for fine-tuning. | |
Returns: | |
Wav2Vec2Model: | |
The resulting model. | |
""" # noqa: E501 | |
    if extractor_conv_layer_config is None:
        extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2

    feature_extractor = components._get_feature_extractor(
        extractor_mode, extractor_conv_layer_config, extractor_conv_bias
    )
    encoder = components._get_encoder(
        in_features=extractor_conv_layer_config[-1][0],
        embed_dim=encoder_embed_dim,
        dropout_input=encoder_projection_dropout,
        pos_conv_kernel=encoder_pos_conv_kernel,
        pos_conv_groups=encoder_pos_conv_groups,
        num_layers=encoder_num_layers,
        num_heads=encoder_num_heads,
        attention_dropout=encoder_attention_dropout,
        ff_interm_features=encoder_ff_interm_features,
        ff_interm_dropout=encoder_ff_interm_dropout,
        dropout=encoder_dropout,
        layer_norm_first=encoder_layer_norm_first,
        layer_drop=encoder_layer_drop,
    )
    aux = None
    if aux_num_out is not None:
        aux = torch.nn.Linear(in_features=encoder_embed_dim, out_features=aux_num_out)
    return Wav2Vec2Model(feature_extractor, encoder, aux)
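

# A hedged sketch of driving the builder with a custom (made-up) configuration,
# plus the arithmetic that links the conv layer config to the output frame
# count: each layer maps L -> (L - kernel_size) // stride + 1.
def _demo_custom_wav2vec2_model():
    conv_config = [(256, 10, 5), (256, 3, 2), (256, 2, 2)]  # illustrative only
    model = wav2vec2_model(
        extractor_mode="group_norm",
        extractor_conv_layer_config=conv_config,
        extractor_conv_bias=False,
        encoder_embed_dim=256,
        encoder_projection_dropout=0.1,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=4,
        encoder_num_heads=4,
        encoder_attention_dropout=0.1,
        encoder_ff_interm_features=1024,
        encoder_ff_interm_dropout=0.1,
        encoder_dropout=0.1,
        encoder_layer_norm_first=False,
        encoder_layer_drop=0.1,
        aux_num_out=None,
    )
    # Number of output frames for one second of 16 kHz audio under this config.
    num_frames = 16000
    for _, kernel_size, stride in conv_config:
        num_frames = (num_frames - kernel_size) // stride + 1
    return model, num_frames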


def wav2vec2_base(
    encoder_projection_dropout: float = 0.1,
    encoder_attention_dropout: float = 0.1,
    encoder_ff_interm_dropout: float = 0.1,
    encoder_dropout: float = 0.1,
    encoder_layer_drop: float = 0.1,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds "base" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`

    Args:
        encoder_projection_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """  # noqa: E501
    return wav2vec2_model(
        extractor_mode="group_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
        encoder_embed_dim=768,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=12,
        encoder_num_heads=12,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=3072,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=False,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )
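

# A fine-tuning sketch under stated assumptions: a 29-symbol character
# vocabulary with the CTC blank at index 0, and random tensors standing in
# for real audio and transcripts. With ``aux_num_out`` set, the model emits
# per-frame logits that can be trained with CTC.
def _demo_wav2vec2_base_ctc_finetune():
    model = wav2vec2_base(aux_num_out=29)
    waveforms = torch.randn(2, 16000)
    lengths = torch.tensor([16000, 12800])
    logits, out_lengths = model(waveforms, lengths)  # (batch, time, 29)
    log_probs = torch.log_softmax(logits, dim=-1).transpose(0, 1)  # (time, batch, 29)
    targets = torch.randint(1, 29, (2, 10))  # fake transcripts, no blank symbol
    target_lengths = torch.tensor([10, 8])
    return torch.nn.functional.ctc_loss(log_probs, targets, out_lengths, target_lengths)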


def wav2vec2_large(
    encoder_projection_dropout: float = 0.1,
    encoder_attention_dropout: float = 0.1,
    encoder_ff_interm_dropout: float = 0.1,
    encoder_dropout: float = 0.1,
    encoder_layer_drop: float = 0.1,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds "large" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`

    Args:
        encoder_projection_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """  # noqa: E501
    return wav2vec2_model(
        extractor_mode="group_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
        encoder_embed_dim=1024,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=24,
        encoder_num_heads=16,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=4096,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=False,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )


def wav2vec2_large_lv60k(
    encoder_projection_dropout: float = 0.1,
    encoder_attention_dropout: float = 0.0,
    encoder_ff_interm_dropout: float = 0.1,
    encoder_dropout: float = 0.0,
    encoder_layer_drop: float = 0.1,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds "large lv-60k" :class:`~torchaudio.models.Wav2Vec2Model` from *wav2vec 2.0* :cite:`baevski2020wav2vec`

    Args:
        encoder_projection_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """  # noqa: E501
    return wav2vec2_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=True,
        encoder_embed_dim=1024,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=24,
        encoder_num_heads=16,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=4096,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )


def hubert_base(
    encoder_projection_dropout: float = 0.1,
    encoder_attention_dropout: float = 0.1,
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.1,
    encoder_layer_drop: float = 0.05,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds "base" :class:`HuBERT <torchaudio.models.Wav2Vec2Model>` from *HuBERT* :cite:`hsu2021hubert`

    Args:
        encoder_projection_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """  # noqa: E501
    return wav2vec2_model(
        extractor_mode="group_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
        encoder_embed_dim=768,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=12,
        encoder_num_heads=12,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=3072,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=False,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )


def hubert_large(
    encoder_projection_dropout: float = 0.0,
    encoder_attention_dropout: float = 0.0,
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.0,
    encoder_layer_drop: float = 0.0,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds "large" :class:`HuBERT <torchaudio.models.Wav2Vec2Model>` from *HuBERT* :cite:`hsu2021hubert`

    Args:
        encoder_projection_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """  # noqa: E501
    return wav2vec2_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
        encoder_embed_dim=1024,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=24,
        encoder_num_heads=16,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=4096,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )


def hubert_xlarge(
    encoder_projection_dropout: float = 0.0,
    encoder_attention_dropout: float = 0.0,
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.0,
    encoder_layer_drop: float = 0.0,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds "extra large" :class:`HuBERT <torchaudio.models.Wav2Vec2Model>` from *HuBERT* :cite:`hsu2021hubert`

    Args:
        encoder_projection_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """  # noqa: E501
    return wav2vec2_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
        encoder_embed_dim=1280,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=48,
        encoder_num_heads=16,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=5120,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )
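

# A quick, computed (not hardcoded) comparison of the three HuBERT encoder
# sizes; illustrative only.
def _demo_hubert_parameter_counts():
    for factory in (hubert_base, hubert_large, hubert_xlarge):
        model = factory()
        num_params = sum(p.numel() for p in model.parameters())
        print(f"{factory.__name__}: {num_params / 1e6:.0f}M parameters")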


def _init_hubert_pretrain_model(module):
    if isinstance(module, components.ConvLayerBlock):
        torch.nn.init.kaiming_normal_(module.conv.weight)
    elif isinstance(module, components.ConvolutionalPositionalEmbedding):
        # Initialize the positional convolution weight from a normal distribution.
        std = math.sqrt(4.0 / (module.embed_dim * module.kernel_size))
        torch.nn.init.normal_(module.conv.weight, mean=0.0, std=std)
        torch.nn.init.constant_(module.conv.bias, 0.0)
    elif isinstance(module, components.SelfAttention):
        # Initialize the query, key, value, and out_proj parameters of the self-attention module.
        torch.nn.init.xavier_uniform_(module.k_proj.weight, gain=1 / math.sqrt(2))
        torch.nn.init.xavier_uniform_(module.v_proj.weight, gain=1 / math.sqrt(2))
        torch.nn.init.xavier_uniform_(module.q_proj.weight, gain=1 / math.sqrt(2))
        torch.nn.init.xavier_uniform_(module.out_proj.weight)
        torch.nn.init.constant_(module.out_proj.bias, 0.0)
    elif isinstance(module, components.Transformer):
        module.apply(components._init_transformer_params)
    else:
        pass


def hubert_pretrain_model(
    extractor_mode: str,
    extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
    extractor_conv_bias: bool,
    encoder_embed_dim: int,
    encoder_projection_dropout: float,
    encoder_pos_conv_kernel: int,
    encoder_pos_conv_groups: int,
    encoder_num_layers: int,
    encoder_num_heads: int,
    encoder_attention_dropout: float,
    encoder_ff_interm_features: int,
    encoder_ff_interm_dropout: float,
    encoder_dropout: float,
    encoder_layer_norm_first: bool,
    encoder_layer_drop: float,
    mask_prob: float,
    mask_selection: str,
    mask_other: float,
    mask_length: int,
    no_mask_overlap: bool,
    mask_min_space: int,
    mask_channel_prob: float,
    mask_channel_selection: str,
    mask_channel_other: float,
    mask_channel_length: int,
    no_mask_channel_overlap: bool,
    mask_channel_min_space: int,
    skip_masked: bool,
    skip_nomask: bool,
    num_classes: int,
    final_dim: int,
    feature_grad_mult: Optional[float],
) -> HuBERTPretrainModel:
"""Builds custom :class:`HuBERTPretrainModel` for training from scratch | |
Note: | |
The "feature extractor" below corresponds to | |
`ConvFeatureExtractionModel <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L736>`__ | |
in the original ``fairseq`` implementation. | |
This is referred as "(convolutional) feature encoder" in the *wav2vec 2.0* | |
:cite:`baevski2020wav2vec` paper. | |
The "encoder" below corresponds to `TransformerEncoder <https://github.com/pytorch/fairseq/blob/dd3bd3c0497ae9a7ae7364404a6b0a4c501780b3/fairseq/models/wav2vec/wav2vec2.py#L817>`__, | |
and this is referred as "Transformer" in the paper. | |
Args: | |
extractor_mode (str): Operation mode of feature extractor. | |
Valid values are ``"group_norm"`` or ``"layer_norm"``. | |
If ``"group_norm"``, then a single normalization is applied | |
in the first convolution block. Otherwise, all the convolution | |
blocks will have layer normalization. | |
This option corresponds to ``extractor_mode`` from ``fairseq``. | |
extractor_conv_layer_config (list of integer tuples or None): | |
Configuration of convolution layers in feature extractor. | |
List of convolution configuration, | |
i.e. ``[(output_channel, kernel_size, stride), ...]`` | |
If ``None`` is provided, then the following default value is used. | |
.. code-block:: python | |
[ | |
(512, 10, 5), | |
(512, 3, 2), | |
(512, 3, 2), | |
(512, 3, 2), | |
(512, 3, 2), | |
(512, 2, 2), | |
(512, 2, 2), | |
] | |
This option corresponds to ``conv_feature_layers`` from ``fairseq``. | |
extractor_conv_bias (bool): | |
Whether to include bias term to each convolution operation. | |
This option corresponds to ``conv_bias`` from ``fairseq``. | |
encoder_embed_dim (int): | |
The dimension of embedding in encoder. | |
This option corresponds to ``encoder_embed_dim`` from ``fairseq``. | |
encoder_projection_dropout (float): | |
The dropout probability applied after the input feature is projected | |
to ``encoder_embed_dim``. | |
This option corresponds to ``dropout_input`` from ``fairseq``. | |
encoder_pos_conv_kernel (int): | |
The kernel size of convolutional positional embeddings. | |
This option corresponds to ``conv_pos`` from ``fairseq``. | |
encoder_pos_conv_groups (int): | |
The number of groups of convolutional positional embeddings. | |
This option corresponds to ``conv_pos_groups`` from ``fairseq``. | |
encoder_num_layers (int): | |
The number of self attention layers in transformer block. | |
This option corresponds to ``encoder_layers`` from ``fairseq``. | |
encoder_num_heads (int): | |
The number of heads in self attention layers. | |
This option corresponds to ``encoder_attention_heads`` from ``fairseq``. | |
encoder_attention_dropout (float): | |
The dropout probability applied after softmax in self-attention layer. | |
This option corresponds to ``attention_dropout`` from ``fairseq``. | |
encoder_ff_interm_features (int): | |
The dimension of hidden features in feed forward layer. | |
This option corresponds to ``encoder_ffn_embed_dim`` from ``fairseq``. | |
encoder_ff_interm_dropout (float): | |
The dropout probability applied in feedforward layer. | |
This option correspinds to ``activation_dropout`` from ``fairseq``. | |
encoder_dropout (float): | |
The dropout probability applied at the end of feed forward layer. | |
This option corresponds to ``dropout`` from ``fairseq``. | |
encoder_layer_norm_first (bool): | |
Control the order of layer norm in transformer layer and each encoder layer. | |
If True, in transformer layer, layer norm is applied before features are fed | |
to encoder layers. In encoder layer, two layer norms are applied before and after | |
self attention. | |
If False, in transformer layer, layer norm is applied after features are fed | |
to encoder layers. In encoder layer, two layer norms are applied after self | |
attention, before and after feed forward. | |
This option corresponds to ``layer_norm_first`` from ``fairseq``. | |
encoder_layer_drop (float): | |
Probability to drop each encoder layer during training. | |
This option corresponds to ``layerdrop`` from ``fairseq``. | |
mask_prob (float): | |
Probability for each token to be chosen as start of the span to be masked. this will be multiplied by | |
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. | |
However due to overlaps, the actual number will be smaller (unless no_overlap is True). | |
This option corresponds to ``mask_prob`` from ``fairseq``. | |
mask_selection (str): | |
How to choose the mask length. Options: [``static``, ``uniform``, ``normal``, ``poisson``]. | |
This option corresponds to ``mask_selection`` from ``fairseq``. | |
mask_other (float): | |
Secondary mask argument (used for more complex distributions). | |
This option corresponds to ``mask_other`` from ``fairseq``. | |
mask_length (int): | |
The lengths of the mask. | |
This option corresponds to ``mask_length`` from ``fairseq``. | |
no_mask_overlap (bool): | |
Whether to allow masks to overlap. | |
This option corresponds to ``no_mask_overlap`` from ``fairseq``. | |
mask_min_space (int): | |
Minimum space between spans (if no overlap is enabled). | |
This option corresponds to ``mask_min_space`` from ``fairseq``. | |
mask_channel_prob: (float): | |
The probability of replacing a feature with 0. | |
This option corresponds to ``mask_channel_prob`` from ``fairseq``. | |
mask_channel_selection (str): | |
How to choose the mask length for channel masking. Options: [``static``, ``uniform``, ``normal``, ``poisson``]. | |
This option corresponds to ``mask_channel_selection`` from ``fairseq``. | |
mask_channel_other (float): | |
Secondary mask argument for channel masking(used for more complex distributions). | |
This option corresponds to ``mask_channel_other`` from ``fairseq``. | |
mask_channel_length (int): | |
Minimum space between spans (if no overlap is enabled) for channel masking. | |
This option corresponds to ``mask_channel_length`` from ``fairseq``. | |
no_mask_channel_overlap (bool): | |
Whether to allow channel masks to overlap. | |
This option corresponds to ``no_mask_channel_overlap`` from ``fairseq``. | |
mask_channel_min_space (int): | |
Minimum space between spans for channel masking(if no overlap is enabled). | |
This option corresponds to ``mask_channel_min_space`` from ``fairseq``. | |
skip_masked (bool): | |
If True, skip computing losses over masked frames. | |
This option corresponds to ``skip_masked`` from ``fairseq``. | |
skip_nomask (bool): | |
If True, skip computing losses over unmasked frames. | |
This option corresponds to ``skip_nomask`` from ``fairseq``. | |
num_classes (int): | |
The number of classes in the labels. | |
final_dim (int): | |
Project final representations and targets to `final_dim`. | |
This option corresponds to ``final_dim`` from ``fairseq``. | |
feature_grad_mult (float or None): | |
The factor to scale the convolutional feature extraction layer gradients by. | |
The scale factor will not affect the forward pass. | |
This option corresponds to ``feature_grad_mult`` from ``fairseq``. | |
Returns: | |
HuBERTPretrainModel: | |
The resulting model. | |
""" # noqa: E501 | |
    if extractor_conv_layer_config is None:
        extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2

    feature_extractor = components._get_feature_extractor(
        extractor_mode, extractor_conv_layer_config, extractor_conv_bias
    )
    encoder = components._get_encoder(
        in_features=extractor_conv_layer_config[-1][0],
        embed_dim=encoder_embed_dim,
        dropout_input=encoder_projection_dropout,
        pos_conv_kernel=encoder_pos_conv_kernel,
        pos_conv_groups=encoder_pos_conv_groups,
        num_layers=encoder_num_layers,
        num_heads=encoder_num_heads,
        attention_dropout=encoder_attention_dropout,
        ff_interm_features=encoder_ff_interm_features,
        ff_interm_dropout=encoder_ff_interm_dropout,
        dropout=encoder_dropout,
        layer_norm_first=encoder_layer_norm_first,
        layer_drop=encoder_layer_drop,
    )
    wav2vec2 = Wav2Vec2Model(feature_extractor, encoder)
    mask_generator = components.MaskGenerator(
        encoder_embed_dim,
        mask_prob,
        mask_selection,
        mask_other,
        mask_length,
        no_mask_overlap,
        mask_min_space,
        mask_channel_prob,
        mask_channel_selection,
        mask_channel_other,
        mask_channel_length,
        no_mask_channel_overlap,
        mask_channel_min_space,
    )
    logit_generator = components.LogitGenerator(
        encoder_embed_dim,
        num_classes,
        final_dim,
        skip_masked,
        skip_nomask,
    )
    model = HuBERTPretrainModel(
        wav2vec2=wav2vec2,
        mask_generator=mask_generator,
        logit_generator=logit_generator,
        feature_grad_mult=feature_grad_mult,
    )
    # Initialize the model for pre-training.
    model.apply(_init_hubert_pretrain_model)
    return model
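

# Pre-training requires frame-level labels whose length exactly matches the
# encoder output (``HuBERTPretrainModel.forward`` raises otherwise). A hedged
# helper for validating/trimming labels against the default conv config;
# ``labels`` here would be cluster IDs precomputed at the model's 50 Hz frame
# rate for 16 kHz audio.
def _align_labels_to_frames(labels: Tensor, num_samples: int) -> Tensor:
    num_frames = num_samples
    for _, kernel_size, stride in [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2:
        num_frames = (num_frames - kernel_size) // stride + 1
    if labels.shape[1] < num_frames:
        raise ValueError("Not enough label frames for this waveform length")
    return labels[:, :num_frames]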


def hubert_pretrain_base(
    encoder_projection_dropout: float = 0.1,
    encoder_attention_dropout: float = 0.1,
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.1,
    encoder_layer_drop: float = 0.05,
    mask_prob: float = 0.8,
    mask_channel_prob: float = 0.0,
    mask_channel_length: int = 10,
    feature_grad_mult: Optional[float] = 0.1,
    num_classes: int = 100,
) -> HuBERTPretrainModel:
    """Builds "base" :class:`HuBERTPretrainModel` from *HuBERT* :cite:`hsu2021hubert` for pretraining.

    Args:
        encoder_projection_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_attention_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_layer_drop (float):
            See :py:func:`hubert_pretrain_model`.
        mask_prob (float):
            See :py:func:`hubert_pretrain_model`.
        mask_channel_prob (float):
            See :py:func:`hubert_pretrain_model`.
        mask_channel_length (int):
            See :py:func:`hubert_pretrain_model`.
        feature_grad_mult (float or None):
            See :py:func:`hubert_pretrain_model`.
        num_classes (int, optional):
            See :py:func:`hubert_pretrain_model`.

    Returns:
        HuBERTPretrainModel:
            The resulting model.
    """  # noqa: E501
    return hubert_pretrain_model(
        extractor_mode="group_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
        encoder_embed_dim=768,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=12,
        encoder_num_heads=12,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=3072,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=False,
        encoder_layer_drop=encoder_layer_drop,
        mask_prob=mask_prob,
        mask_selection="static",
        mask_other=0.0,
        mask_length=10,
        no_mask_overlap=False,
        mask_min_space=1,
        mask_channel_prob=mask_channel_prob,
        mask_channel_selection="static",
        mask_channel_other=0.0,
        mask_channel_length=mask_channel_length,
        no_mask_channel_overlap=False,
        mask_channel_min_space=1,
        skip_masked=False,
        skip_nomask=False,
        num_classes=num_classes,
        final_dim=256,
        feature_grad_mult=feature_grad_mult,
    )


def hubert_pretrain_large(
    encoder_projection_dropout: float = 0.0,
    encoder_attention_dropout: float = 0.0,
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.0,
    encoder_layer_drop: float = 0.0,
    mask_prob: float = 0.8,
    mask_channel_prob: float = 0.0,
    mask_channel_length: int = 10,
    feature_grad_mult: Optional[float] = None,
) -> HuBERTPretrainModel:
    """Builds "large" :class:`HuBERTPretrainModel` from *HuBERT* :cite:`hsu2021hubert` for pretraining.

    Args:
        encoder_projection_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_attention_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_layer_drop (float):
            See :py:func:`hubert_pretrain_model`.
        mask_prob (float):
            See :py:func:`hubert_pretrain_model`.
        mask_channel_prob (float):
            See :py:func:`hubert_pretrain_model`.
        mask_channel_length (int):
            See :py:func:`hubert_pretrain_model`.
        feature_grad_mult (float or None):
            See :py:func:`hubert_pretrain_model`.

    Returns:
        HuBERTPretrainModel:
            The resulting model.
    """  # noqa: E501
    return hubert_pretrain_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
        encoder_embed_dim=1024,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=24,
        encoder_num_heads=16,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=4096,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
        mask_prob=mask_prob,
        mask_selection="static",
        mask_other=0.0,
        mask_length=10,
        no_mask_overlap=False,
        mask_min_space=1,
        mask_channel_prob=mask_channel_prob,
        mask_channel_selection="static",
        mask_channel_other=0.0,
        mask_channel_length=mask_channel_length,
        no_mask_channel_overlap=False,
        mask_channel_min_space=1,
        skip_masked=False,
        skip_nomask=False,
        num_classes=500,
        final_dim=768,
        feature_grad_mult=feature_grad_mult,
    )


def hubert_pretrain_xlarge(
    encoder_projection_dropout: float = 0.0,
    encoder_attention_dropout: float = 0.0,
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.0,
    encoder_layer_drop: float = 0.0,
    mask_prob: float = 0.8,
    mask_channel_prob: float = 0.0,
    mask_channel_length: int = 10,
    feature_grad_mult: Optional[float] = None,
) -> HuBERTPretrainModel:
    """Builds "extra large" :class:`HuBERTPretrainModel` from *HuBERT* :cite:`hsu2021hubert` for pretraining.

    Args:
        encoder_projection_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_attention_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_dropout (float):
            See :py:func:`hubert_pretrain_model`.
        encoder_layer_drop (float):
            See :py:func:`hubert_pretrain_model`.
        mask_prob (float):
            See :py:func:`hubert_pretrain_model`.
        mask_channel_prob (float):
            See :py:func:`hubert_pretrain_model`.
        mask_channel_length (int):
            See :py:func:`hubert_pretrain_model`.
        feature_grad_mult (float or None):
            See :py:func:`hubert_pretrain_model`.

    Returns:
        HuBERTPretrainModel:
            The resulting model.
    """  # noqa: E501
    return hubert_pretrain_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
        encoder_embed_dim=1280,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=48,
        encoder_num_heads=16,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=5120,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
        mask_prob=mask_prob,
        mask_selection="static",
        mask_other=0.0,
        mask_length=10,
        no_mask_overlap=False,
        mask_min_space=1,
        mask_channel_prob=mask_channel_prob,
        mask_channel_selection="static",
        mask_channel_other=0.0,
        mask_channel_length=mask_channel_length,
        no_mask_channel_overlap=False,
        mask_channel_min_space=1,
        skip_masked=False,
        skip_nomask=False,
        num_classes=500,
        final_dim=1024,
        feature_grad_mult=feature_grad_mult,
    )


def wavlm_model(
    extractor_mode: str,
    extractor_conv_layer_config: Optional[List[Tuple[int, int, int]]],
    extractor_conv_bias: bool,
    encoder_embed_dim: int,
    encoder_projection_dropout: float,
    encoder_pos_conv_kernel: int,
    encoder_pos_conv_groups: int,
    encoder_num_layers: int,
    encoder_num_heads: int,
    encoder_num_buckets: int,
    encoder_max_distance: int,
    encoder_attention_dropout: float,
    encoder_ff_interm_features: int,
    encoder_ff_interm_dropout: float,
    encoder_dropout: float,
    encoder_layer_norm_first: bool,
    encoder_layer_drop: float,
    aux_num_out: Optional[int],
) -> Wav2Vec2Model:
"""Builds custom WaveLM model :cite:`chen2022wavlm`. The architecture is compatible | |
with Wav2Vec2 model :cite:`baevski2020wav2vec`, and so the output object is | |
:class:`~torchaudio.models.Wav2Vec2Model`. Most of the arguments have the same meaning | |
as in :py:func:`~torchaudio.models.wav2vec2_model` so please refer there for documentation. | |
Args: | |
extractor_mode (str): Operation mode of feature extractor. | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
extractor_conv_layer_config (list of integer tuples or None): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
extractor_conv_bias (bool): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_embed_dim (int): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_projection_dropout (float): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_pos_conv_kernel (int): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_pos_conv_groups (int): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_num_layers (int): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_num_heads (int): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_num_buckets (int): | |
Number of buckets for relative position embedding. | |
encoder_max_distance (int): | |
Maximum distance for relative position embedding. | |
encoder_attention_dropout (float): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_ff_interm_features (int): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_ff_interm_dropout (float): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_dropout (float): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_layer_norm_first (bool): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
encoder_layer_drop (float): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
aux_num_out (int or None): | |
See :py:func:`~torchaudio.models.wav2vec2_model`. | |
Returns: | |
Wav2Vec2Model: | |
The resulting model. | |
""" | |
    if extractor_conv_layer_config is None:
        extractor_conv_layer_config = [(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] * 2

    feature_extractor = components._get_feature_extractor(
        extractor_mode, extractor_conv_layer_config, extractor_conv_bias
    )
    encoder = components._get_wavlm_encoder(
        in_features=extractor_conv_layer_config[-1][0],
        embed_dim=encoder_embed_dim,
        dropout_input=encoder_projection_dropout,
        pos_conv_kernel=encoder_pos_conv_kernel,
        pos_conv_groups=encoder_pos_conv_groups,
        num_layers=encoder_num_layers,
        num_heads=encoder_num_heads,
        num_buckets=encoder_num_buckets,
        max_distance=encoder_max_distance,
        attention_dropout=encoder_attention_dropout,
        ff_interm_features=encoder_ff_interm_features,
        ff_interm_dropout=encoder_ff_interm_dropout,
        dropout=encoder_dropout,
        layer_norm_first=encoder_layer_norm_first,
        layer_drop=encoder_layer_drop,
    )
    aux = None
    if aux_num_out is not None:
        aux = torch.nn.Linear(in_features=encoder_embed_dim, out_features=aux_num_out)
    return Wav2Vec2Model(feature_extractor, encoder, aux)


def wavlm_base(
    encoder_projection_dropout: float = 0.1,
    encoder_attention_dropout: float = 0.1,
    encoder_ff_interm_dropout: float = 0.1,
    encoder_dropout: float = 0.1,
    encoder_layer_drop: float = 0.1,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds the "base" WavLM model :cite:`chen2022wavlm`. The architecture is compatible
    with the Wav2Vec2 model :cite:`baevski2020wav2vec`, so the output class is
    :class:`~torchaudio.models.Wav2Vec2Model`.

    Args:
        encoder_projection_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`~torchaudio.models.wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """
    return wavlm_model(
        extractor_mode="group_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
        encoder_embed_dim=768,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=12,
        encoder_num_heads=12,
        encoder_num_buckets=320,
        encoder_max_distance=800,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=3072,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=False,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )
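

# Because WavLM reuses the :class:`Wav2Vec2Model` container, the returned
# model is a drop-in replacement on the same interface; a minimal sketch with
# fake audio:
def _demo_wavlm_base_usage():
    model = wavlm_base()
    waveforms = torch.randn(1, 32000)  # 2 seconds of fake 16 kHz audio
    features, _ = model.extract_features(waveforms)
    # One (batch, time, 768) Tensor per transformer layer, as with wav2vec2.
    return features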


def wavlm_large(
    encoder_projection_dropout: float = 0.1,
    encoder_attention_dropout: float = 0.1,
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.1,
    encoder_layer_drop: float = 0.1,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds the "large" WavLM model :cite:`chen2022wavlm`. The architecture is compatible
    with the Wav2Vec2 model :cite:`baevski2020wav2vec`, so the output class is
    :class:`~torchaudio.models.Wav2Vec2Model`.

    Args:
        encoder_projection_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`~torchaudio.models.wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """
    return wavlm_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=False,
        encoder_embed_dim=1024,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=24,
        encoder_num_heads=16,
        encoder_num_buckets=320,
        encoder_max_distance=800,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=4096,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )


def wav2vec2_xlsr_300m(
    encoder_projection_dropout: float = 0.0,
    encoder_attention_dropout: float = 0.0,
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.0,
    encoder_layer_drop: float = 0.0,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds the XLS-R model :cite:`babu2021xls` with 300 million parameters. The architecture is
    compatible with the Wav2Vec2 model :cite:`baevski2020wav2vec`, so the output class is
    :class:`~torchaudio.models.Wav2Vec2Model`.

    Args:
        encoder_projection_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`~torchaudio.models.wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """
    return wav2vec2_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=True,
        encoder_embed_dim=1024,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=24,
        encoder_num_heads=16,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=4096,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )
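

# XLS-R is typically used with pretrained multilingual weights rather than
# trained from scratch. A hedged loading sketch; it assumes the installed
# torchaudio version provides the ``torchaudio.pipelines.WAV2VEC2_XLSR_300M``
# bundle.
def _demo_xlsr_300m_pretrained():
    import torchaudio

    bundle = torchaudio.pipelines.WAV2VEC2_XLSR_300M
    model = bundle.get_model()  # same architecture as wav2vec2_xlsr_300m()
    waveforms = torch.randn(1, int(bundle.sample_rate))  # 1 second of fake audio
    features, _ = model.extract_features(waveforms)
    return features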


def wav2vec2_xlsr_1b(
    encoder_projection_dropout: float = 0.1,
    encoder_attention_dropout: float = 0.0,
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.0,
    encoder_layer_drop: float = 0.0,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds the XLS-R model :cite:`babu2021xls` with 1 billion parameters. The architecture is
    compatible with the Wav2Vec2 model :cite:`baevski2020wav2vec`, so the output class is
    :class:`~torchaudio.models.Wav2Vec2Model`.

    Args:
        encoder_projection_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`~torchaudio.models.wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """
    return wav2vec2_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=True,
        encoder_embed_dim=1280,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=48,
        encoder_num_heads=16,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=5120,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )


def wav2vec2_xlsr_2b(
    encoder_projection_dropout: float = 0.1,
    encoder_attention_dropout: float = 0.0,
    encoder_ff_interm_dropout: float = 0.0,
    encoder_dropout: float = 0.0,
    encoder_layer_drop: float = 0.0,
    aux_num_out: Optional[int] = None,
) -> Wav2Vec2Model:
    """Builds the XLS-R model :cite:`babu2021xls` with 2 billion parameters. The architecture is
    compatible with the Wav2Vec2 model :cite:`baevski2020wav2vec`, so the output class is
    :class:`~torchaudio.models.Wav2Vec2Model`.

    Args:
        encoder_projection_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_attention_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_ff_interm_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_dropout (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        encoder_layer_drop (float):
            See :py:func:`~torchaudio.models.wav2vec2_model`.
        aux_num_out (int or None, optional):
            See :py:func:`~torchaudio.models.wav2vec2_model`.

    Returns:
        Wav2Vec2Model:
            The resulting model.
    """
    return wav2vec2_model(
        extractor_mode="layer_norm",
        extractor_conv_layer_config=None,
        extractor_conv_bias=True,
        encoder_embed_dim=1920,
        encoder_projection_dropout=encoder_projection_dropout,
        encoder_pos_conv_kernel=128,
        encoder_pos_conv_groups=16,
        encoder_num_layers=48,
        encoder_num_heads=16,
        encoder_attention_dropout=encoder_attention_dropout,
        encoder_ff_interm_features=7680,
        encoder_ff_interm_dropout=encoder_ff_interm_dropout,
        encoder_dropout=encoder_dropout,
        encoder_layer_norm_first=True,
        encoder_layer_drop=encoder_layer_drop,
        aux_num_out=aux_num_out,
    )