from torch import nn, Tensor

__all__ = [
    "Wav2Letter",
]


class Wav2Letter(nn.Module):
    r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
    Recognition System* :cite:`collobert2016wav2letter`.

    See Also:
        * `Training example <https://github.com/pytorch/audio/tree/release/0.12/examples/pipeline_wav2letter>`__

    Args:
        num_classes (int, optional): Number of classes to be classified. (Default: ``40``)
        input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum``
            or ``mfcc`` (Default: ``waveform``).
        num_features (int, optional): Number of input features that the network will receive (Default: ``1``).
    """

    def __init__(self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1) -> None:
        super().__init__()

        acoustic_num_features = 250 if input_type == "waveform" else num_features
        # Fully convolutional acoustic model: a stack of 1D convolutions that ends in
        # per-frame scores, one output channel per class.
        acoustic_model = nn.Sequential(
            nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
            nn.ReLU(inplace=True),
        )
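
        # Raw-waveform input first passes through a strided front-end convolution
        # (kernel 250, stride 160 samples, i.e. a 10 ms hop at a 16 kHz sample rate)
        # that maps the audio to 250 feature channels before the acoustic stack above.
        # Spectral inputs ("power_spectrum", "mfcc") feed the acoustic stack directly.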
        if input_type == "waveform":
            waveform_model = nn.Sequential(
                nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45),
                nn.ReLU(inplace=True),
            )
            self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)

        if input_type in ["power_spectrum", "mfcc"]:
            self.acoustic_model = acoustic_model

    def forward(self, x: Tensor) -> Tensor:
        r"""
        Args:
            x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length).

        Returns:
            Tensor: Predictor tensor of dimension (batch_size, num_classes, output_length),
            containing log-probabilities over the classes for each output frame.
        """
        x = self.acoustic_model(x)
        x = nn.functional.log_softmax(x, dim=1)
        return x
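

# Minimal usage sketch: instantiate the model for raw-waveform input and run a dummy
# batch to inspect the output shape. The batch size, clip length, and implied 16 kHz
# sample rate below are arbitrary illustration values, not part of the model definition.
if __name__ == "__main__":
    import torch

    model = Wav2Letter(num_classes=40, input_type="waveform", num_features=1)
    # Dummy batch of 4 one-second clips: (batch_size, num_features, input_length).
    waveform = torch.randn(4, 1, 16000)
    log_probs = model(waveform)
    # Output is (batch_size, num_classes, output_length); the frame count depends on
    # the input length because of the strided front-end and first acoustic convolution.
    print(log_probs.shape)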