from abc import ABC, abstractmethod
from typing import List, Optional, Tuple

import torch
from torchaudio.models import Emformer

__all__ = ["RNNT", "emformer_rnnt_base", "emformer_rnnt_model"]


class _TimeReduction(torch.nn.Module):
r"""Coalesces frames along time dimension into a | |
fewer number of frames with higher feature dimensionality. | |
Args: | |
stride (int): number of frames to merge for each output frame. | |
""" | |

    def __init__(self, stride: int) -> None:
        super().__init__()
        self.stride = stride

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""Forward pass.

        B: batch size;
        T: maximum input sequence length in batch;
        D: feature dimension of each input sequence frame.

        Args:
            input (torch.Tensor): input sequences, with shape `(B, T, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output sequences, with shape
                    `(B, T // stride, D * stride)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid frames for i-th batch element in output sequences.
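
        Example (illustrative only; the batch size, lengths, and feature dimension below
        are arbitrary values chosen for demonstration, not requirements of the module):
            >>> time_reduction = _TimeReduction(stride=4)
            >>> input = torch.rand(2, 10, 8)            # (B, T, D)
            >>> lengths = torch.tensor([10, 7])
            >>> output, output_lengths = time_reduction(input, lengths)
            >>> output.shape                            # frames coalesced four at a time
            torch.Size([2, 2, 32])
            >>> output_lengths
            tensor([2, 1])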
""" | |
B, T, D = input.shape | |
num_frames = T - (T % self.stride) | |
input = input[:, :num_frames, :] | |
lengths = lengths.div(self.stride, rounding_mode="trunc") | |
T_max = num_frames // self.stride | |
output = input.reshape(B, T_max, D * self.stride) | |
output = output.contiguous() | |
return output, lengths | |
class _CustomLSTM(torch.nn.Module): | |
r"""Custom long-short-term memory (LSTM) block that applies layer normalization | |
to internal nodes. | |
Args: | |
input_dim (int): input dimension. | |
hidden_dim (int): hidden dimension. | |
layer_norm (bool, optional): if ``True``, enables layer normalization. (Default: ``False``) | |
layer_norm_epsilon (float, optional): value of epsilon to use in | |
layer normalization layers (Default: 1e-5) | |
""" | |

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        layer_norm: bool = False,
        layer_norm_epsilon: float = 1e-5,
    ) -> None:
        super().__init__()
        self.x2g = torch.nn.Linear(input_dim, 4 * hidden_dim, bias=(not layer_norm))
        self.p2g = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=False)
        if layer_norm:
            self.c_norm = torch.nn.LayerNorm(hidden_dim, eps=layer_norm_epsilon)
            self.g_norm = torch.nn.LayerNorm(4 * hidden_dim, eps=layer_norm_epsilon)
        else:
            self.c_norm = torch.nn.Identity()
            self.g_norm = torch.nn.Identity()

        self.hidden_dim = hidden_dim

    def forward(
        self, input: torch.Tensor, state: Optional[List[torch.Tensor]]
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
r"""Forward pass. | |
B: batch size; | |
T: maximum sequence length in batch; | |
D: feature dimension of each input sequence element. | |
Args: | |
input (torch.Tensor): with shape `(T, B, D)`. | |
state (List[torch.Tensor] or None): list of tensors | |
representing internal state generated in preceding invocation | |
of ``forward``. | |
Returns: | |
(torch.Tensor, List[torch.Tensor]): | |
torch.Tensor | |
output, with shape `(T, B, hidden_dim)`. | |
List[torch.Tensor] | |
list of tensors representing internal state generated | |
in current invocation of ``forward``. | |
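
        Example (illustrative only; the sizes below are arbitrary values chosen for demonstration):
            >>> lstm = _CustomLSTM(input_dim=16, hidden_dim=32, layer_norm=True)
            >>> input = torch.rand(5, 2, 16)            # (T, B, D)
            >>> output, state = lstm(input, None)       # no preceding state
            >>> output.shape
            torch.Size([5, 2, 32])
            >>> output, state = lstm(input, state)      # carry [h, c] into the next call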
""" | |
if state is None: | |
B = input.size(1) | |
h = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype) | |
c = torch.zeros(B, self.hidden_dim, device=input.device, dtype=input.dtype) | |
else: | |
h, c = state | |
gated_input = self.x2g(input) | |
outputs = [] | |
for gates in gated_input.unbind(0): | |
gates = gates + self.p2g(h) | |
gates = self.g_norm(gates) | |
input_gate, forget_gate, cell_gate, output_gate = gates.chunk(4, 1) | |
input_gate = input_gate.sigmoid() | |
forget_gate = forget_gate.sigmoid() | |
cell_gate = cell_gate.tanh() | |
output_gate = output_gate.sigmoid() | |
c = forget_gate * c + input_gate * cell_gate | |
c = self.c_norm(c) | |
h = output_gate * c.tanh() | |
outputs.append(h) | |
output = torch.stack(outputs, dim=0) | |
state = [h, c] | |
return output, state | |


class _Transcriber(ABC):
    @abstractmethod
    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        pass

    @abstractmethod
    def infer(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        states: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        pass


class _EmformerEncoder(torch.nn.Module, _Transcriber):
    r"""Emformer-based recurrent neural network transducer (RNN-T) encoder (transcription network).

    Args:
        input_dim (int): feature dimension of each input sequence element.
        output_dim (int): feature dimension of each output sequence element.
        segment_length (int): length of input segment expressed as number of frames.
        right_context_length (int): length of right context expressed as number of frames.
        time_reduction_input_dim (int): dimension to scale each element in input sequences to
            prior to applying time reduction block.
        time_reduction_stride (int): factor by which to reduce length of input sequence.
        transformer_num_heads (int): number of attention heads in each Emformer layer.
        transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        transformer_num_layers (int): number of Emformer layers to instantiate.
        transformer_left_context_length (int): length of left context.
        transformer_dropout (float, optional): transformer dropout probability. (Default: 0.0)
        transformer_activation (str, optional): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
        transformer_max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
        transformer_weight_init_scale_strategy (str, optional): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
        transformer_tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
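
    Example (illustrative only; the configuration below is a small, arbitrary one chosen for
    demonstration, not the configuration used by :func:`emformer_rnnt_base`):
        >>> encoder = _EmformerEncoder(
        ...     input_dim=80,
        ...     output_dim=256,
        ...     segment_length=16,
        ...     right_context_length=4,
        ...     time_reduction_input_dim=64,
        ...     time_reduction_stride=4,
        ...     transformer_num_heads=4,
        ...     transformer_ffn_dim=512,
        ...     transformer_num_layers=2,
        ...     transformer_left_context_length=30,
        ... )
        >>> # streaming inference over successive 16 + 4 frame chunks, carrying states forward
        >>> chunk = torch.rand(1, 20, 80)
        >>> chunk_lengths = torch.tensor([20])
        >>> enc, enc_lengths, states = encoder.infer(chunk, chunk_lengths, None)
        >>> enc, enc_lengths, states = encoder.infer(chunk, chunk_lengths, states)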
""" | |
def __init__( | |
self, | |
*, | |
input_dim: int, | |
output_dim: int, | |
segment_length: int, | |
right_context_length: int, | |
time_reduction_input_dim: int, | |
time_reduction_stride: int, | |
transformer_num_heads: int, | |
transformer_ffn_dim: int, | |
transformer_num_layers: int, | |
transformer_left_context_length: int, | |
transformer_dropout: float = 0.0, | |
transformer_activation: str = "relu", | |
transformer_max_memory_size: int = 0, | |
transformer_weight_init_scale_strategy: str = "depthwise", | |
transformer_tanh_on_mem: bool = False, | |
) -> None: | |
super().__init__() | |
self.input_linear = torch.nn.Linear( | |
input_dim, | |
time_reduction_input_dim, | |
bias=False, | |
) | |
self.time_reduction = _TimeReduction(time_reduction_stride) | |
transformer_input_dim = time_reduction_input_dim * time_reduction_stride | |
self.transformer = Emformer( | |
transformer_input_dim, | |
transformer_num_heads, | |
transformer_ffn_dim, | |
transformer_num_layers, | |
segment_length // time_reduction_stride, | |
dropout=transformer_dropout, | |
activation=transformer_activation, | |
left_context_length=transformer_left_context_length, | |
right_context_length=right_context_length // time_reduction_stride, | |
max_memory_size=transformer_max_memory_size, | |
weight_init_scale_strategy=transformer_weight_init_scale_strategy, | |
tanh_on_mem=transformer_tanh_on_mem, | |
) | |
self.output_linear = torch.nn.Linear(transformer_input_dim, output_dim) | |
self.layer_norm = torch.nn.LayerNorm(output_dim) | |

    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""Forward pass for training.

        B: batch size;
        T: maximum input sequence length in batch;
        D: feature dimension of each input sequence frame (input_dim).

        Args:
            input (torch.Tensor): input frame sequences right-padded with right context, with
                shape `(B, T + right context length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output frame sequences.
        """
        input_linear_out = self.input_linear(input)
        time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths)
        transformer_out, transformer_lengths = self.transformer(time_reduction_out, time_reduction_lengths)
        output_linear_out = self.output_linear(transformer_out)
        layer_norm_out = self.layer_norm(output_linear_out)
        return layer_norm_out, transformer_lengths

    def infer(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        states: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Forward pass for inference.

        B: batch size;
        T: maximum input sequence segment length in batch;
        D: feature dimension of each input sequence frame (input_dim).

        Args:
            input (torch.Tensor): input frame sequence segments right-padded with right context, with
                shape `(B, T + right context length, D)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``input``.
            states (List[List[torch.Tensor]] or None): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``infer``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation
                    of ``infer``.
        """
        input_linear_out = self.input_linear(input)
        time_reduction_out, time_reduction_lengths = self.time_reduction(input_linear_out, lengths)
        (
            transformer_out,
            transformer_lengths,
            transformer_states,
        ) = self.transformer.infer(time_reduction_out, time_reduction_lengths, states)
        output_linear_out = self.output_linear(transformer_out)
        layer_norm_out = self.layer_norm(output_linear_out)
        return layer_norm_out, transformer_lengths, transformer_states


class _Predictor(torch.nn.Module):
    r"""Recurrent neural network transducer (RNN-T) prediction network.

    Args:
        num_symbols (int): size of target token lexicon.
        output_dim (int): feature dimension of each output sequence element.
        symbol_embedding_dim (int): dimension of each target token embedding.
        num_lstm_layers (int): number of LSTM layers to instantiate.
        lstm_hidden_dim (int): output dimension of each LSTM layer.
        lstm_layer_norm (bool, optional): if ``True``, enables layer normalization
            for LSTM layers. (Default: ``False``)
        lstm_layer_norm_epsilon (float, optional): value of epsilon to use in
            LSTM layer normalization layers. (Default: 1e-5)
        lstm_dropout (float, optional): LSTM dropout probability. (Default: 0.0)
    """

    def __init__(
        self,
        num_symbols: int,
        output_dim: int,
        symbol_embedding_dim: int,
        num_lstm_layers: int,
        lstm_hidden_dim: int,
        lstm_layer_norm: bool = False,
        lstm_layer_norm_epsilon: float = 1e-5,
        lstm_dropout: float = 0.0,
    ) -> None:
        super().__init__()
        self.embedding = torch.nn.Embedding(num_symbols, symbol_embedding_dim)
        self.input_layer_norm = torch.nn.LayerNorm(symbol_embedding_dim)
        self.lstm_layers = torch.nn.ModuleList(
            [
                _CustomLSTM(
                    symbol_embedding_dim if idx == 0 else lstm_hidden_dim,
                    lstm_hidden_dim,
                    layer_norm=lstm_layer_norm,
                    layer_norm_epsilon=lstm_layer_norm_epsilon,
                )
                for idx in range(num_lstm_layers)
            ]
        )
        self.dropout = torch.nn.Dropout(p=lstm_dropout)
        self.linear = torch.nn.Linear(lstm_hidden_dim, output_dim)
        self.output_layer_norm = torch.nn.LayerNorm(output_dim)

        self.lstm_dropout = lstm_dropout

    def forward(
        self,
        input: torch.Tensor,
        lengths: torch.Tensor,
        state: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Forward pass.

        B: batch size;
        U: maximum sequence length in batch;
        D: feature dimension of each input sequence element.

        Args:
            input (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol, i.e. in range `[0, num_symbols)`.
            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid symbols for i-th batch element in ``input``.
            state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``forward``. (Default: ``None``)

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output encoding sequences, with shape `(B, U, output_dim)`
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output encoding sequences.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation of ``forward``.
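
        Example (illustrative only; the sizes below are arbitrary values chosen for demonstration):
            >>> predictor = _Predictor(
            ...     num_symbols=100,
            ...     output_dim=64,
            ...     symbol_embedding_dim=32,
            ...     num_lstm_layers=2,
            ...     lstm_hidden_dim=32,
            ... )
            >>> targets = torch.randint(0, 100, (3, 7))     # (B, U)
            >>> lengths = torch.tensor([7, 5, 6])
            >>> output, output_lengths, states = predictor(targets, lengths)
            >>> output.shape
            torch.Size([3, 7, 64])
            >>> len(states), len(states[0])                 # one [h, c] pair per LSTM layer
            (2, 2)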
""" | |
input_tb = input.permute(1, 0) | |
embedding_out = self.embedding(input_tb) | |
input_layer_norm_out = self.input_layer_norm(embedding_out) | |
lstm_out = input_layer_norm_out | |
state_out: List[List[torch.Tensor]] = [] | |
for layer_idx, lstm in enumerate(self.lstm_layers): | |
lstm_out, lstm_state_out = lstm(lstm_out, None if state is None else state[layer_idx]) | |
lstm_out = self.dropout(lstm_out) | |
state_out.append(lstm_state_out) | |
linear_out = self.linear(lstm_out) | |
output_layer_norm_out = self.output_layer_norm(linear_out) | |
return output_layer_norm_out.permute(1, 0, 2), lengths, state_out | |


class _Joiner(torch.nn.Module):
    r"""Recurrent neural network transducer (RNN-T) joint network.

    Args:
        input_dim (int): source and target input dimension.
        output_dim (int): output dimension.
        activation (str, optional): activation function to use in the joiner.
            Must be one of ("relu", "tanh"). (Default: "relu")
    """

    def __init__(self, input_dim: int, output_dim: int, activation: str = "relu") -> None:
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim, bias=True)
        if activation == "relu":
            self.activation = torch.nn.ReLU()
        elif activation == "tanh":
            self.activation = torch.nn.Tanh()
        else:
            raise ValueError(f"Unsupported activation {activation}")

    def forward(
        self,
        source_encodings: torch.Tensor,
        source_lengths: torch.Tensor,
        target_encodings: torch.Tensor,
        target_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""Forward pass for training.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: dimension of each source and target sequence encoding.

        Args:
            source_encodings (torch.Tensor): source encoding sequences, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``source_encodings``.
            target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``target_encodings``.

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor):
                torch.Tensor
                    joint network output, with shape `(B, T, U, output_dim)`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
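
        Example (illustrative only; the sizes below are arbitrary values chosen for demonstration):
            >>> joiner = _Joiner(input_dim=1024, output_dim=4097)
            >>> source_encodings = torch.rand(2, 6, 1024)       # (B, T, D)
            >>> target_encodings = torch.rand(2, 3, 1024)       # (B, U, D)
            >>> source_lengths = torch.tensor([6, 5])
            >>> target_lengths = torch.tensor([3, 2])
            >>> output, src_lengths, tgt_lengths = joiner(
            ...     source_encodings, source_lengths, target_encodings, target_lengths
            ... )
            >>> output.shape                                    # (B, T, U, output_dim)
            torch.Size([2, 6, 3, 4097])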
""" | |
joint_encodings = source_encodings.unsqueeze(2).contiguous() + target_encodings.unsqueeze(1).contiguous() | |
activation_out = self.activation(joint_encodings) | |
output = self.linear(activation_out) | |
return output, source_lengths, target_lengths | |


class RNNT(torch.nn.Module):
    r"""torchaudio.models.RNNT()

    Recurrent neural network transducer (RNN-T) model.

    Note:
        To build the model, please use one of the factory functions.

    See Also:
        :class:`torchaudio.pipelines.RNNTBundle`: ASR pipeline with pre-trained models.

    Args:
        transcriber (torch.nn.Module): transcription network.
        predictor (torch.nn.Module): prediction network.
        joiner (torch.nn.Module): joint network.
    """

    def __init__(self, transcriber: _Transcriber, predictor: _Predictor, joiner: _Joiner) -> None:
        super().__init__()
        self.transcriber = transcriber
        self.predictor = predictor
        self.joiner = joiner

    def forward(
        self,
        sources: torch.Tensor,
        source_lengths: torch.Tensor,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        predictor_state: Optional[List[List[torch.Tensor]]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Forward pass for training.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: feature dimension of each source sequence element.

        Args:
            sources (torch.Tensor): source frame sequences right-padded with right context, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.
            targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid symbols for i-th batch element in ``targets``.
            predictor_state (List[List[torch.Tensor]] or None, optional): list of lists of tensors
                representing prediction network internal state generated in preceding invocation
                of ``forward``. (Default: ``None``)

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    joint network output, with shape
                    `(B, max output source length, max output target length, output_dim (number of target symbols))`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing prediction network internal state generated in current invocation
                    of ``forward``.
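
        Example (illustrative only; the model is built with :func:`emformer_rnnt_base`, and the batch
        size, sequence lengths, and vocabulary size below are arbitrary values chosen for demonstration):
            >>> rnnt = emformer_rnnt_base(num_symbols=4097)
            >>> sources = torch.rand(2, 132, 80)            # 128 utterance frames + 4 right-context frames
            >>> source_lengths = torch.tensor([132, 132])
            >>> targets = torch.randint(0, 4097, (2, 10))   # (B, U)
            >>> target_lengths = torch.tensor([10, 10])
            >>> output, out_src_lengths, out_tgt_lengths, predictor_state = rnnt(
            ...     sources, source_lengths, targets, target_lengths
            ... )
            >>> # output has shape (B, number of output frames, U, num_symbols)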
""" | |
source_encodings, source_lengths = self.transcriber( | |
input=sources, | |
lengths=source_lengths, | |
) | |
target_encodings, target_lengths, predictor_state = self.predictor( | |
input=targets, | |
lengths=target_lengths, | |
state=predictor_state, | |
) | |
output, source_lengths, target_lengths = self.joiner( | |
source_encodings=source_encodings, | |
source_lengths=source_lengths, | |
target_encodings=target_encodings, | |
target_lengths=target_lengths, | |
) | |
return ( | |
output, | |
source_lengths, | |
target_lengths, | |
predictor_state, | |
) | |

    def transcribe_streaming(
        self,
        sources: torch.Tensor,
        source_lengths: torch.Tensor,
        state: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Applies transcription network to sources in streaming mode.

        B: batch size;
        T: maximum source sequence segment length in batch;
        D: feature dimension of each source sequence frame.

        Args:
            sources (torch.Tensor): source frame sequence segments right-padded with right context, with
                shape `(B, T + right context length, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing transcription network internal state generated in preceding invocation
                of ``transcribe_streaming``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing transcription network internal state generated in current invocation
                    of ``transcribe_streaming``.
        """
        return self.transcriber.infer(sources, source_lengths, state)

    def transcribe(
        self,
        sources: torch.Tensor,
        source_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""Applies transcription network to sources in non-streaming mode.

        B: batch size;
        T: maximum source sequence length in batch;
        D: feature dimension of each source sequence frame.

        Args:
            sources (torch.Tensor): source frame sequences right-padded with right context, with
                shape `(B, T + right context length, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid frames for i-th batch element in ``sources``.

        Returns:
            (torch.Tensor, torch.Tensor):
                torch.Tensor
                    output frame sequences, with
                    shape `(B, T // time_reduction_stride, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output frame sequences.
        """
        return self.transcriber(sources, source_lengths)

    def predict(
        self,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        state: Optional[List[List[torch.Tensor]]],
    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
        r"""Applies prediction network to targets.

        B: batch size;
        U: maximum target sequence length in batch;
        D: feature dimension of each target sequence frame.

        Args:
            targets (torch.Tensor): target sequences, with shape `(B, U)` and each element
                mapping to a target symbol, i.e. in range `[0, num_symbols)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                number of valid symbols for i-th batch element in ``targets``.
            state (List[List[torch.Tensor]] or None): list of lists of tensors
                representing internal state generated in preceding invocation
                of ``predict``.

        Returns:
            (torch.Tensor, torch.Tensor, List[List[torch.Tensor]]):
                torch.Tensor
                    output frame sequences, with shape `(B, U, output_dim)`.
                torch.Tensor
                    output lengths, with shape `(B,)` and i-th element representing
                    number of valid elements for i-th batch element in output.
                List[List[torch.Tensor]]
                    output states; list of lists of tensors
                    representing internal state generated in current invocation of ``predict``.
        """
        return self.predictor(input=targets, lengths=target_lengths, state=state)

    def join(
        self,
        source_encodings: torch.Tensor,
        source_lengths: torch.Tensor,
        target_encodings: torch.Tensor,
        target_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""Applies joint network to source and target encodings.

        B: batch size;
        T: maximum source sequence length in batch;
        U: maximum target sequence length in batch;
        D: dimension of each source and target sequence encoding.

        Args:
            source_encodings (torch.Tensor): source encoding sequences, with
                shape `(B, T, D)`.
            source_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``source_encodings``.
            target_encodings (torch.Tensor): target encoding sequences, with shape `(B, U, D)`.
            target_lengths (torch.Tensor): with shape `(B,)` and i-th element representing
                valid sequence length of i-th batch element in ``target_encodings``.

        Returns:
            (torch.Tensor, torch.Tensor, torch.Tensor):
                torch.Tensor
                    joint network output, with shape `(B, T, U, output_dim)`.
                torch.Tensor
                    output source lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 1 for i-th batch element in joint network output.
                torch.Tensor
                    output target lengths, with shape `(B,)` and i-th element representing
                    number of valid elements along dim 2 for i-th batch element in joint network output.
        """
        output, source_lengths, target_lengths = self.joiner(
            source_encodings=source_encodings,
            source_lengths=source_lengths,
            target_encodings=target_encodings,
            target_lengths=target_lengths,
        )
        return output, source_lengths, target_lengths


def emformer_rnnt_model(
    *,
    input_dim: int,
    encoding_dim: int,
    num_symbols: int,
    segment_length: int,
    right_context_length: int,
    time_reduction_input_dim: int,
    time_reduction_stride: int,
    transformer_num_heads: int,
    transformer_ffn_dim: int,
    transformer_num_layers: int,
    transformer_dropout: float,
    transformer_activation: str,
    transformer_left_context_length: int,
    transformer_max_memory_size: int,
    transformer_weight_init_scale_strategy: str,
    transformer_tanh_on_mem: bool,
    symbol_embedding_dim: int,
    num_lstm_layers: int,
    lstm_layer_norm: bool,
    lstm_layer_norm_epsilon: float,
    lstm_dropout: float,
) -> RNNT:
    r"""Builds Emformer-based :class:`~torchaudio.models.RNNT`.

    Note:
        For non-streaming inference, the expectation is for `transcribe` to be called on input
        sequences right-concatenated with `right_context_length` frames.

        For streaming inference, the expectation is for `transcribe_streaming` to be called
        on input chunks comprising `segment_length` frames right-concatenated with `right_context_length`
        frames.

    Args:
        input_dim (int): dimension of input sequence frames passed to transcription network.
        encoding_dim (int): dimension of transcription- and prediction-network-generated encodings
            passed to joint network.
        num_symbols (int): cardinality of set of target tokens.
        segment_length (int): length of input segment expressed as number of frames.
        right_context_length (int): length of right context expressed as number of frames.
        time_reduction_input_dim (int): dimension to scale each element in input sequences to
            prior to applying time reduction block.
        time_reduction_stride (int): factor by which to reduce length of input sequence.
        transformer_num_heads (int): number of attention heads in each Emformer layer.
        transformer_ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
        transformer_num_layers (int): number of Emformer layers to instantiate.
        transformer_left_context_length (int): length of left context considered by Emformer.
        transformer_dropout (float): Emformer dropout probability.
        transformer_activation (str): activation function to use in each Emformer layer's
            feedforward network. Must be one of ("relu", "gelu", "silu").
        transformer_max_memory_size (int): maximum number of memory elements to use.
        transformer_weight_init_scale_strategy (str): per-layer weight initialization scaling
            strategy. Must be one of ("depthwise", "constant", ``None``).
        transformer_tanh_on_mem (bool): if ``True``, applies tanh to memory elements.
        symbol_embedding_dim (int): dimension of each target token embedding.
        num_lstm_layers (int): number of LSTM layers to instantiate.
        lstm_layer_norm (bool): if ``True``, enables layer normalization for LSTM layers.
        lstm_layer_norm_epsilon (float): value of epsilon to use in LSTM layer normalization layers.
        lstm_dropout (float): LSTM dropout probability.

    Returns:
        RNNT:
            Emformer RNN-T model.
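
    Example (illustrative only; the configuration below is a small, arbitrary one chosen for
    demonstration, not the configuration used by :func:`emformer_rnnt_base`):
        >>> rnnt = emformer_rnnt_model(
        ...     input_dim=80,
        ...     encoding_dim=256,
        ...     num_symbols=500,
        ...     segment_length=16,
        ...     right_context_length=4,
        ...     time_reduction_input_dim=64,
        ...     time_reduction_stride=4,
        ...     transformer_num_heads=4,
        ...     transformer_ffn_dim=512,
        ...     transformer_num_layers=2,
        ...     transformer_dropout=0.1,
        ...     transformer_activation="relu",
        ...     transformer_left_context_length=30,
        ...     transformer_max_memory_size=0,
        ...     transformer_weight_init_scale_strategy="depthwise",
        ...     transformer_tanh_on_mem=False,
        ...     symbol_embedding_dim=128,
        ...     num_lstm_layers=2,
        ...     lstm_layer_norm=True,
        ...     lstm_layer_norm_epsilon=1e-5,
        ...     lstm_dropout=0.3,
        ... )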
""" | |
encoder = _EmformerEncoder( | |
input_dim=input_dim, | |
output_dim=encoding_dim, | |
segment_length=segment_length, | |
right_context_length=right_context_length, | |
time_reduction_input_dim=time_reduction_input_dim, | |
time_reduction_stride=time_reduction_stride, | |
transformer_num_heads=transformer_num_heads, | |
transformer_ffn_dim=transformer_ffn_dim, | |
transformer_num_layers=transformer_num_layers, | |
transformer_dropout=transformer_dropout, | |
transformer_activation=transformer_activation, | |
transformer_left_context_length=transformer_left_context_length, | |
transformer_max_memory_size=transformer_max_memory_size, | |
transformer_weight_init_scale_strategy=transformer_weight_init_scale_strategy, | |
transformer_tanh_on_mem=transformer_tanh_on_mem, | |
) | |
predictor = _Predictor( | |
num_symbols, | |
encoding_dim, | |
symbol_embedding_dim=symbol_embedding_dim, | |
num_lstm_layers=num_lstm_layers, | |
lstm_hidden_dim=symbol_embedding_dim, | |
lstm_layer_norm=lstm_layer_norm, | |
lstm_layer_norm_epsilon=lstm_layer_norm_epsilon, | |
lstm_dropout=lstm_dropout, | |
) | |
joiner = _Joiner(encoding_dim, num_symbols) | |
return RNNT(encoder, predictor, joiner) | |


def emformer_rnnt_base(num_symbols: int) -> RNNT:
    r"""Builds basic version of Emformer-based :class:`~torchaudio.models.RNNT`.

    Args:
        num_symbols (int): The size of target token lexicon.

    Returns:
        RNNT:
            Emformer RNN-T model.
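
    Example (illustrative streaming usage; the chunk size follows the note in
    :func:`emformer_rnnt_model`, and the vocabulary size below is an arbitrary choice):
        >>> rnnt = emformer_rnnt_base(num_symbols=4097)
        >>> state = None
        >>> chunk = torch.rand(1, 20, 80)       # segment_length (16) + right_context_length (4) frames
        >>> chunk_lengths = torch.tensor([20])
        >>> enc, enc_lengths, state = rnnt.transcribe_streaming(chunk, chunk_lengths, state)
        >>> # feed the next chunk, reusing the returned state
        >>> enc, enc_lengths, state = rnnt.transcribe_streaming(chunk, chunk_lengths, state)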
""" | |
return emformer_rnnt_model( | |
input_dim=80, | |
encoding_dim=1024, | |
num_symbols=num_symbols, | |
segment_length=16, | |
right_context_length=4, | |
time_reduction_input_dim=128, | |
time_reduction_stride=4, | |
transformer_num_heads=8, | |
transformer_ffn_dim=2048, | |
transformer_num_layers=20, | |
transformer_dropout=0.1, | |
transformer_activation="gelu", | |
transformer_left_context_length=30, | |
transformer_max_memory_size=0, | |
transformer_weight_init_scale_strategy="depthwise", | |
transformer_tanh_on_mem=True, | |
symbol_embedding_dim=512, | |
num_lstm_layers=3, | |
lstm_layer_norm=True, | |
lstm_layer_norm_epsilon=1e-3, | |
lstm_dropout=0.3, | |
) | |