GameServerX

Running

App Files Files Community

GameServerX / MLPY /Lib /site-packages /torchaudio /models /emformer.py

Kano001

Upload 462 files

864affd verified 10 months ago

raw

history blame

38.7 kB

	import math
	from typing import List, Optional, Tuple

	import torch


	__all__ = ["Emformer"]


	def _lengths_to_padding_mask(lengths: torch.Tensor) -> torch.Tensor:
	    r"""Convert per-sequence lengths into a boolean padding mask.

	    Args:
	        lengths (torch.Tensor): tensor with shape `(B,)` whose i-th element is the
	            number of valid positions of the i-th batch element.

	    Returns:
	        torch.Tensor: boolean tensor with shape `(B, max(lengths))` that is ``True``
	        at every position at or beyond the corresponding length.
	    """
	    num_sequences = lengths.shape[0]
	    longest = int(torch.max(lengths).item())
	    # One row of position indices, broadcast to every sequence in the batch.
	    positions = torch.arange(longest, device=lengths.device, dtype=lengths.dtype)
	    positions = positions.expand(num_sequences, longest)
	    return positions >= lengths.unsqueeze(1)


	def _gen_padding_mask(
	    utterance: torch.Tensor,
	    right_context: torch.Tensor,
	    summary: torch.Tensor,
	    lengths: torch.Tensor,
	    mems: torch.Tensor,
	    left_context_key: Optional[torch.Tensor] = None,
	) -> Optional[torch.Tensor]:
	    r"""Generate the key padding mask for attention.

	    Args:
	        utterance (torch.Tensor): utterance frames; dimension 0 is time.
	        right_context (torch.Tensor): right context frames; dimension 0 is time
	            and dimension 1 is read as the batch size.
	        summary (torch.Tensor): summary elements; dimension 0 is counted.
	        lengths (torch.Tensor): number of valid utterance frames per batch element.
	        mems (torch.Tensor): memory elements; dimension 0 is counted.
	        left_context_key (torch.Tensor or None, optional): left context key; its
	            dimension 0 is added to the key lengths when given. (Default: ``None``)

	    Returns:
	        torch.Tensor or None: ``None`` when the batch size is 1, otherwise the boolean
	        mask produced by ``_lengths_to_padding_mask`` for the per-sequence key lengths.
	    """
	    # A batch holding a single sequence gets no padding mask.
	    if right_context.size(1) == 1:
	        return None

	    summary_length = summary.size(0)
	    total_length = right_context.size(0) + utterance.size(0) + summary_length
	    right_context_blocks_length = total_length - torch.max(lengths).int() - summary_length

	    left_context_blocks_length = 0
	    if left_context_key is not None:
	        left_context_blocks_length = left_context_key.size(0)

	    # Each sequence's keys span memory, right context, left context and its valid frames.
	    key_lengths = lengths + mems.size(0) + right_context_blocks_length + left_context_blocks_length
	    return _lengths_to_padding_mask(lengths=key_lengths)


	def _get_activation_module(activation: str) -> torch.nn.Module:
	    r"""Instantiate the activation module named by ``activation``.

	    Args:
	        activation (str): one of "relu", "gelu" or "silu".

	    Returns:
	        torch.nn.Module: a new ``ReLU``, ``GELU`` or ``SiLU`` instance.

	    Raises:
	        ValueError: if ``activation`` is not one of the supported names.
	    """
	    supported = (
	        ("relu", torch.nn.ReLU),
	        ("gelu", torch.nn.GELU),
	        ("silu", torch.nn.SiLU),
	    )
	    for name, factory in supported:
	        if activation == name:
	            return factory()
	    raise ValueError(f"Unsupported activation {activation}")


	def _get_weight_init_gains(weight_init_scale_strategy: Optional[str], num_layers: int) -> List[Optional[float]]:
	    r"""Compute the per-layer weight initialization gains.

	    Args:
	        weight_init_scale_strategy (str or None): "depthwise", "constant" or ``None``.
	        num_layers (int): number of layers to produce a gain for.

	    Returns:
	        List[float or None]: one entry per layer; ``None`` entries for a ``None``
	        strategy, ``1 / sqrt(layer_idx + 1)`` for "depthwise" and ``1 / sqrt(2)``
	        for "constant".

	    Raises:
	        ValueError: if the strategy is not one of the supported values.
	    """
	    if weight_init_scale_strategy is None:
	        return [None] * num_layers
	    if weight_init_scale_strategy == "depthwise":
	        # Gain shrinks with the 1-based depth of the layer.
	        return [1.0 / math.sqrt(depth) for depth in range(1, num_layers + 1)]
	    if weight_init_scale_strategy == "constant":
	        return [1.0 / math.sqrt(2)] * num_layers
	    raise ValueError(f"Unsupported weight_init_scale_strategy value {weight_init_scale_strategy}")


	def _gen_attention_mask_block(
	    col_widths: List[int], col_mask: List[bool], num_rows: int, device: torch.device
	) -> torch.Tensor:
	    r"""Assemble one block of rows of the attention mask from column groups.

	    Args:
	        col_widths (List[int]): width of each column group.
	        col_mask (List[bool]): for each column group, ``True`` to fill it with ones
	            and ``False`` to fill it with zeros.
	        num_rows (int): number of rows in the block.
	        device (torch.device): device on which the block is created.

	    Returns:
	        torch.Tensor: tensor with shape `(num_rows, sum(col_widths))` of ones and zeros.

	    Raises:
	        ValueError: if ``col_widths`` and ``col_mask`` differ in length.
	    """
	    if len(col_widths) != len(col_mask):
	        raise ValueError("Length of col_widths must match that of col_mask")

	    column_groups: List[torch.Tensor] = []
	    for width, fill_with_ones in zip(col_widths, col_mask):
	        if fill_with_ones:
	            column_groups.append(torch.ones(num_rows, width, device=device))
	        else:
	            column_groups.append(torch.zeros(num_rows, width, device=device))
	    return torch.cat(column_groups, dim=1)


	class _EmformerAttention(torch.nn.Module):
	    r"""Multi-head attention block used by each Emformer layer.

	    Args:
	        input_dim (int): input dimension.
	        num_heads (int): number of attention heads in each Emformer layer.
	        dropout (float, optional): dropout probability. (Default: 0.0)
	        weight_init_gain (float or None, optional): gain used for Xavier-uniform
	            initialization of the query and key/value projection weights; the default
	            initialization is kept when this is falsy. (Default: ``None``)
	        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements;
	            otherwise they are clamped to ``[-10, 10]``. (Default: ``False``)
	        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)

	    Raises:
	        ValueError: if ``input_dim`` is not a multiple of ``num_heads``.
	    """

	    def __init__(
	        self,
	        input_dim: int,
	        num_heads: int,
	        dropout: float = 0.0,
	        weight_init_gain: Optional[float] = None,
	        tanh_on_mem: bool = False,
	        negative_inf: float = -1e8,
	    ):
	        super().__init__()

	        if input_dim % num_heads != 0:
	            raise ValueError(f"input_dim ({input_dim}) is not a multiple of num_heads ({num_heads}).")

	        self.input_dim = input_dim
	        self.num_heads = num_heads
	        self.dropout = dropout
	        self.tanh_on_mem = tanh_on_mem
	        self.negative_inf = negative_inf

	        # Queries are scaled by 1 / sqrt(per-head dimension).
	        self.scaling = (self.input_dim // self.num_heads) ** -0.5

	        self.emb_to_key_value = torch.nn.Linear(input_dim, 2 * input_dim, bias=True)
	        self.emb_to_query = torch.nn.Linear(input_dim, input_dim, bias=True)
	        self.out_proj = torch.nn.Linear(input_dim, input_dim, bias=True)

	        if weight_init_gain:
	            for projection in (self.emb_to_key_value, self.emb_to_query):
	                torch.nn.init.xavier_uniform_(projection.weight, gain=weight_init_gain)

	    def _gen_key_value(self, input: torch.Tensor, mems: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
	        r"""Project ``[mems, input without its trailing summary rows]`` into key and value.

	        The number of trailing rows dropped from ``input`` is ``mems.size(0) + 1``.
	        """
	        total_length, _, _ = input.shape
	        summary_length = mems.size(0) + 1
	        keep_length = total_length - summary_length
	        key_value_input = torch.cat([mems, input[:keep_length]])
	        key, value = self.emb_to_key_value(key_value_input).chunk(chunks=2, dim=2)
	        return key, value

	    def _gen_attention_probs(
	        self,
	        attention_weights: torch.Tensor,
	        attention_mask: torch.Tensor,
	        padding_mask: Optional[torch.Tensor],
	    ) -> torch.Tensor:
	        r"""Turn raw attention weights into attention probabilities.

	        Positions selected by ``attention_mask`` and, when given, ``padding_mask`` are
	        set to ``negative_inf`` before the softmax over the last dimension. Softmax is
	        computed in float32 and cast back to the dtype of ``attention_weights``;
	        dropout is applied to the result.
	        """
	        num_queries = attention_weights.size(1)
	        batch_size = attention_weights.size(0) // self.num_heads

	        scores = attention_weights.float()
	        scores = scores.masked_fill(attention_mask.unsqueeze(0), self.negative_inf)
	        if padding_mask is not None:
	            # Expose the batch dimension so the per-sequence mask broadcasts over heads and queries.
	            key_padding = padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool)
	            scores = scores.view(batch_size, self.num_heads, num_queries, -1)
	            scores = scores.masked_fill(key_padding, self.negative_inf)
	            scores = scores.view(batch_size * self.num_heads, num_queries, -1)

	        probs = torch.nn.functional.softmax(scores, dim=-1).type_as(attention_weights)
	        return torch.nn.functional.dropout(probs, p=float(self.dropout), training=self.training)

	    def _forward_impl(
	        self,
	        utterance: torch.Tensor,
	        lengths: torch.Tensor,
	        right_context: torch.Tensor,
	        summary: torch.Tensor,
	        mems: torch.Tensor,
	        attention_mask: torch.Tensor,
	        left_context_key: Optional[torch.Tensor] = None,
	        left_context_val: Optional[torch.Tensor] = None,
	    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	        r"""Shared attention computation behind ``forward`` and ``infer``.

	        Returns:
	            (Tensor, Tensor, Tensor, Tensor):
	                Tensor
	                    projected output rows for right context and utterance.
	                Tensor
	                    projected output rows for the summary positions, after tanh or clamping.
	                Tensor
	                    attention key, including left context when it was provided.
	                Tensor
	                    attention value, including left context when it was provided.

	        Raises:
	            AssertionError: if the computed attention does not have the expected shape.
	        """
	        batch_size = utterance.size(1)
	        summary_length = summary.size(0)
	        total_length = right_context.size(0) + utterance.size(0) + summary_length
	        head_dim = self.input_dim // self.num_heads

	        # Queries come from [right context, utterance, summary].
	        query = self.emb_to_query(torch.cat([right_context, utterance, summary]))

	        # Keys and values come from [mems, right context, utterance].
	        key, value = self.emb_to_key_value(torch.cat([mems, right_context, utterance])).chunk(chunks=2, dim=2)

	        if left_context_key is not None and left_context_val is not None:
	            # Insert the cached left context between the right-context blocks and the utterance.
	            right_context_blocks_length = total_length - torch.max(lengths).int() - summary_length
	            split_idx = mems.size(0) + right_context_blocks_length
	            key = torch.cat([key[:split_idx], left_context_key, key[split_idx:]])
	            value = torch.cat([value[:split_idx], left_context_val, value[split_idx:]])

	        # Fold the heads into the leading dimension: (B * num_heads, length, head_dim).
	        reshaped_query = query.contiguous().view(-1, batch_size * self.num_heads, head_dim).transpose(0, 1)
	        reshaped_key = key.contiguous().view(-1, batch_size * self.num_heads, head_dim).transpose(0, 1)
	        reshaped_value = value.contiguous().view(-1, batch_size * self.num_heads, head_dim).transpose(0, 1)
	        attention_weights = torch.bmm(reshaped_query * self.scaling, reshaped_key.transpose(1, 2))

	        padding_mask = _gen_padding_mask(utterance, right_context, summary, lengths, mems, left_context_key)
	        attention_probs = self._gen_attention_probs(attention_weights, attention_mask, padding_mask)

	        attention = torch.bmm(attention_probs, reshaped_value)
	        if attention.shape != (
	            batch_size * self.num_heads,
	            total_length,
	            head_dim,
	        ):
	            raise AssertionError("Computed attention has incorrect dimensions")
	        attention = attention.transpose(0, 1).contiguous().view(total_length, batch_size, self.input_dim)

	        projected = self.out_proj(attention)

	        # The trailing rows correspond to the summary queries and become the new memory.
	        boundary = total_length - summary_length
	        output_right_context = projected[:boundary]
	        output_mems = projected[boundary:]
	        if self.tanh_on_mem:
	            output_mems = torch.tanh(output_mems)
	        else:
	            output_mems = torch.clamp(output_mems, min=-10, max=10)

	        return output_right_context, output_mems, key, value

	    def forward(
	        self,
	        utterance: torch.Tensor,
	        lengths: torch.Tensor,
	        right_context: torch.Tensor,
	        summary: torch.Tensor,
	        mems: torch.Tensor,
	        attention_mask: torch.Tensor,
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        r"""Forward pass for training.

	        B: batch size;
	        D: feature dimension of each frame;
	        T: number of utterance frames;
	        R: number of right context frames;
	        S: number of summary elements;
	        M: number of memory elements.

	        Args:
	            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
	            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
	                number of valid frames for i-th batch element in ``utterance``.
	            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
	            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
	            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
	            attention_mask (torch.Tensor): attention mask for underlying attention module.

	        Returns:
	            (Tensor, Tensor):
	                Tensor
	                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
	                Tensor
	                    updated memory elements, with shape `(M, B, D)`.
	        """
	        frames, new_mems, _, _ = self._forward_impl(
	            utterance, lengths, right_context, summary, mems, attention_mask
	        )
	        # The last memory row is dropped before returning.
	        return frames, new_mems[:-1]

	    @torch.jit.export
	    def infer(
	        self,
	        utterance: torch.Tensor,
	        lengths: torch.Tensor,
	        right_context: torch.Tensor,
	        summary: torch.Tensor,
	        mems: torch.Tensor,
	        left_context_key: torch.Tensor,
	        left_context_val: torch.Tensor,
	    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	        r"""Forward pass for inference.

	        B: batch size;
	        D: feature dimension of each frame;
	        T: number of utterance frames;
	        R: number of right context frames;
	        S: number of summary elements;
	        M: number of memory elements.

	        Args:
	            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
	            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
	                number of valid frames for i-th batch element in ``utterance``.
	            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
	            summary (torch.Tensor): summary elements, with shape `(S, B, D)`.
	            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
	            left_context_key (torch.Tensor): left context attention key computed from preceding invocation.
	            left_context_val (torch.Tensor): left context attention value computed from preceding invocation.

	        Returns:
	            (Tensor, Tensor, Tensor, and Tensor):
	                Tensor
	                    output frames corresponding to utterance and right_context, with shape `(T + R, B, D)`.
	                Tensor
	                    updated memory elements, with shape `(M, B, D)`.
	                Tensor
	                    attention key computed for left context and utterance.
	                Tensor
	                    attention value computed for left context and utterance.
	        """
	        num_mems = mems.size(0)
	        num_right_context = right_context.size(0)
	        query_dim = num_right_context + utterance.size(0) + summary.size(0)
	        key_dim = num_right_context + utterance.size(0) + num_mems + left_context_key.size(0)

	        # Everything is visible except that the last query row may not attend to the memory columns.
	        attention_mask = torch.zeros(query_dim, key_dim, dtype=torch.bool, device=utterance.device)
	        attention_mask[-1, :num_mems] = True

	        output, output_mems, key, value = self._forward_impl(
	            utterance,
	            lengths,
	            right_context,
	            summary,
	            mems,
	            attention_mask,
	            left_context_key=left_context_key,
	            left_context_val=left_context_val,
	        )

	        # Skip the memory and right-context rows of key/value; the rest is returned for caching.
	        cache_start = num_mems + num_right_context
	        return output, output_mems, key[cache_start:], value[cache_start:]


	class _EmformerLayer(torch.nn.Module):
	    r"""Single Emformer layer: pre-norm attention followed by a feedforward block.

	    Args:
	        input_dim (int): input dimension.
	        num_heads (int): number of attention heads.
	        ffn_dim: (int): hidden layer dimension of feedforward network.
	        segment_length (int): length of each input segment.
	        dropout (float, optional): dropout probability. (Default: 0.0)
	        activation (str, optional): activation function to use in feedforward network.
	            Must be one of ("relu", "gelu", "silu"). (Default: "relu")
	        left_context_length (int, optional): length of left context. (Default: 0)
	        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
	        weight_init_gain (float or None, optional): scale factor to apply when initializing
	            attention module parameters. (Default: ``None``)
	        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
	        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)
	    """

	    def __init__(
	        self,
	        input_dim: int,
	        num_heads: int,
	        ffn_dim: int,
	        segment_length: int,
	        dropout: float = 0.0,
	        activation: str = "relu",
	        left_context_length: int = 0,
	        max_memory_size: int = 0,
	        weight_init_gain: Optional[float] = None,
	        tanh_on_mem: bool = False,
	        negative_inf: float = -1e8,
	    ):
	        super().__init__()

	        self.attention = _EmformerAttention(
	            input_dim=input_dim,
	            num_heads=num_heads,
	            dropout=dropout,
	            weight_init_gain=weight_init_gain,
	            tanh_on_mem=tanh_on_mem,
	            negative_inf=negative_inf,
	        )
	        self.dropout = torch.nn.Dropout(dropout)
	        # Average pooling over one segment yields that segment's summary element.
	        self.memory_op = torch.nn.AvgPool1d(kernel_size=segment_length, stride=segment_length, ceil_mode=True)

	        self.pos_ff = torch.nn.Sequential(
	            torch.nn.LayerNorm(input_dim),
	            torch.nn.Linear(input_dim, ffn_dim),
	            _get_activation_module(activation),
	            torch.nn.Dropout(dropout),
	            torch.nn.Linear(ffn_dim, input_dim),
	            torch.nn.Dropout(dropout),
	        )
	        self.layer_norm_input = torch.nn.LayerNorm(input_dim)
	        self.layer_norm_output = torch.nn.LayerNorm(input_dim)

	        self.left_context_length = left_context_length
	        self.segment_length = segment_length
	        self.max_memory_size = max_memory_size
	        self.input_dim = input_dim

	        self.use_mem = max_memory_size > 0

	    def _init_state(self, batch_size: int, device: Optional[torch.device]) -> List[torch.Tensor]:
	        r"""Create the all-zero streaming state.

	        Returns:
	            List[Tensor]: ``[memory, left context key, left context value, past length]``,
	            where past length is an int32 tensor with shape `(1, batch_size)`.
	        """
	        memory = torch.zeros(self.max_memory_size, batch_size, self.input_dim, device=device)
	        cached_key = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device)
	        cached_val = torch.zeros(self.left_context_length, batch_size, self.input_dim, device=device)
	        frames_seen = torch.zeros(1, batch_size, dtype=torch.int32, device=device)
	        return [memory, cached_key, cached_val, frames_seen]

	    def _unpack_state(self, state: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
	        r"""Slice the valid memory and left-context key/value out of ``state``.

	        The past length is read from the first entry of ``state[3]``; only the trailing
	        rows that past frames can have filled are returned.
	        """
	        frames_seen = state[3][0][0].item()
	        valid_left_context = min(self.left_context_length, frames_seen)
	        valid_mems = min(self.max_memory_size, math.ceil(frames_seen / self.segment_length))
	        mem_start = self.max_memory_size - valid_mems
	        left_context_start = self.left_context_length - valid_left_context
	        return state[0][mem_start:], state[1][left_context_start:], state[2][left_context_start:]

	    def _pack_state(
	        self,
	        next_k: torch.Tensor,
	        next_v: torch.Tensor,
	        update_length: int,
	        mems: torch.Tensor,
	        state: List[torch.Tensor],
	    ) -> List[torch.Tensor]:
	        r"""Append new key/value/memory rows to ``state`` and trim to the configured sizes.

	        ``state`` is updated in place and also returned; its past-length entry is
	        increased by ``update_length``.
	        """
	        extended_key = torch.cat([state[1], next_k])
	        extended_val = torch.cat([state[2], next_v])
	        extended_mem = torch.cat([state[0], mems])
	        state[0] = extended_mem[-self.max_memory_size :]
	        state[1] = extended_key[extended_key.shape[0] - self.left_context_length :]
	        state[2] = extended_val[extended_val.shape[0] - self.left_context_length :]
	        state[3] = state[3] + update_length
	        return state

	    def _process_attention_output(
	        self,
	        rc_output: torch.Tensor,
	        utterance: torch.Tensor,
	        right_context: torch.Tensor,
	    ) -> torch.Tensor:
	        r"""Apply the attention residual, the feedforward residual and the output layer norm."""
	        attended = self.dropout(rc_output) + torch.cat([right_context, utterance])
	        transformed = self.pos_ff(attended) + attended
	        return self.layer_norm_output(transformed)

	    def _apply_pre_attention_layer_norm(
	        self, utterance: torch.Tensor, right_context: torch.Tensor
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        r"""Layer-normalize ``[right_context, utterance]`` and return ``(utterance, right_context)``."""
	        num_right_context = right_context.size(0)
	        normalized = self.layer_norm_input(torch.cat([right_context, utterance]))
	        return normalized[num_right_context:], normalized[:num_right_context]

	    def _apply_post_attention_ffn(
	        self, rc_output: torch.Tensor, utterance: torch.Tensor, right_context: torch.Tensor
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        r"""Post-process the attention output and return ``(utterance, right_context)`` parts."""
	        num_right_context = right_context.size(0)
	        processed = self._process_attention_output(rc_output, utterance, right_context)
	        return processed[num_right_context:], processed[:num_right_context]

	    def _apply_attention_forward(
	        self,
	        utterance: torch.Tensor,
	        lengths: torch.Tensor,
	        right_context: torch.Tensor,
	        mems: torch.Tensor,
	        attention_mask: Optional[torch.Tensor],
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        r"""Run the attention module in training mode.

	        Raises:
	            ValueError: if ``attention_mask`` is ``None``.
	        """
	        if attention_mask is None:
	            raise ValueError("attention_mask must be not None when for_inference is False")

	        if self.use_mem:
	            # Pool over time: (T, B, D) -> (B, D, T) -> pooled -> (S, B, D).
	            summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
	        else:
	            summary = torch.empty(0, dtype=utterance.dtype, device=utterance.device)

	        return self.attention(
	            utterance=utterance,
	            lengths=lengths,
	            right_context=right_context,
	            summary=summary,
	            mems=mems,
	            attention_mask=attention_mask,
	        )

	    def _apply_attention_infer(
	        self,
	        utterance: torch.Tensor,
	        lengths: torch.Tensor,
	        right_context: torch.Tensor,
	        mems: torch.Tensor,
	        state: Optional[List[torch.Tensor]],
	    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor]]:
	        r"""Run the attention module in streaming mode and update the layer state.

	        A fresh zero state is created when ``state`` is ``None``.
	        """
	        if state is None:
	            state = self._init_state(utterance.size(1), device=utterance.device)
	        pre_mems, lc_key, lc_val = self._unpack_state(state)

	        if self.use_mem:
	            # Only the first pooled element is used as the summary.
	            summary = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:1]
	        else:
	            summary = torch.empty(0, dtype=utterance.dtype, device=utterance.device)

	        rc_output, next_m, next_k, next_v = self.attention.infer(
	            utterance=utterance,
	            lengths=lengths,
	            right_context=right_context,
	            summary=summary,
	            mems=pre_mems,
	            left_context_key=lc_key,
	            left_context_val=lc_val,
	        )
	        new_state = self._pack_state(next_k, next_v, utterance.size(0), mems, state)
	        return rc_output, next_m, new_state

	    def forward(
	        self,
	        utterance: torch.Tensor,
	        lengths: torch.Tensor,
	        right_context: torch.Tensor,
	        mems: torch.Tensor,
	        attention_mask: torch.Tensor,
	    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
	        r"""Forward pass for training.

	        B: batch size;
	        D: feature dimension of each frame;
	        T: number of utterance frames;
	        R: number of right context frames;
	        M: number of memory elements.

	        Args:
	            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
	            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
	                number of valid frames for i-th batch element in ``utterance``.
	            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
	            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.
	            attention_mask (torch.Tensor): attention mask for underlying attention module.

	        Returns:
	            (Tensor, Tensor, Tensor):
	                Tensor
	                    encoded utterance frames, with shape `(T, B, D)`.
	                Tensor
	                    updated right context frames, with shape `(R, B, D)`.
	                Tensor
	                    updated memory elements, with shape `(M, B, D)`.
	        """
	        normed_utterance, normed_right_context = self._apply_pre_attention_layer_norm(utterance, right_context)
	        rc_output, output_mems = self._apply_attention_forward(
	            normed_utterance,
	            lengths,
	            normed_right_context,
	            mems,
	            attention_mask,
	        )
	        # Residual connections use the un-normalized inputs.
	        output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context)
	        return output_utterance, output_right_context, output_mems

	    @torch.jit.export
	    def infer(
	        self,
	        utterance: torch.Tensor,
	        lengths: torch.Tensor,
	        right_context: torch.Tensor,
	        state: Optional[List[torch.Tensor]],
	        mems: torch.Tensor,
	    ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], torch.Tensor]:
	        r"""Forward pass for inference.

	        B: batch size;
	        D: feature dimension of each frame;
	        T: number of utterance frames;
	        R: number of right context frames;
	        M: number of memory elements.

	        Args:
	            utterance (torch.Tensor): utterance frames, with shape `(T, B, D)`.
	            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
	                number of valid frames for i-th batch element in ``utterance``.
	            right_context (torch.Tensor): right context frames, with shape `(R, B, D)`.
	            state (List[torch.Tensor] or None): list of tensors representing layer internal state
	                generated in preceding invocation of ``infer``.
	            mems (torch.Tensor): memory elements, with shape `(M, B, D)`.

	        Returns:
	            (Tensor, Tensor, List[torch.Tensor], Tensor):
	                Tensor
	                    encoded utterance frames, with shape `(T, B, D)`.
	                Tensor
	                    updated right context frames, with shape `(R, B, D)`.
	                List[Tensor]
	                    list of tensors representing layer internal state
	                    generated in current invocation of ``infer``.
	                Tensor
	                    updated memory elements, with shape `(M, B, D)`.
	        """
	        normed_utterance, normed_right_context = self._apply_pre_attention_layer_norm(utterance, right_context)
	        rc_output, output_mems, output_state = self._apply_attention_infer(
	            normed_utterance, lengths, normed_right_context, mems, state
	        )
	        # Residual connections use the un-normalized inputs.
	        output_utterance, output_right_context = self._apply_post_attention_ffn(rc_output, utterance, right_context)
	        return output_utterance, output_right_context, output_state, output_mems


	class _EmformerImpl(torch.nn.Module):
	    r"""Emformer driver that runs a stack of layers over segmented input.

	    Args:
	        emformer_layers (torch.nn.ModuleList): layers applied in sequence.
	        segment_length (int): length of each input segment.
	        left_context_length (int, optional): length of left context. (Default: 0)
	        right_context_length (int, optional): length of right context. (Default: 0)
	        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
	    """

	    def __init__(
	        self,
	        emformer_layers: torch.nn.ModuleList,
	        segment_length: int,
	        left_context_length: int = 0,
	        right_context_length: int = 0,
	        max_memory_size: int = 0,
	    ):
	        super().__init__()

	        self.use_mem = max_memory_size > 0
	        self.memory_op = torch.nn.AvgPool1d(
	            kernel_size=segment_length,
	            stride=segment_length,
	            ceil_mode=True,
	        )
	        self.emformer_layers = emformer_layers
	        self.left_context_length = left_context_length
	        self.right_context_length = right_context_length
	        self.segment_length = segment_length
	        self.max_memory_size = max_memory_size

	    def _gen_right_context(self, input: torch.Tensor) -> torch.Tensor:
	        r"""Collect the right-context frames of every segment, concatenated along time.

	        ``input`` is time-major and includes the trailing right context of the last segment.
	        """
	        total_length = input.shape[0]
	        num_segs = math.ceil((total_length - self.right_context_length) / self.segment_length)

	        blocks: List[torch.Tensor] = []
	        # Every segment but the last takes its right context from the start of the next segment.
	        for next_seg_idx in range(1, num_segs):
	            block_start = next_seg_idx * self.segment_length
	            blocks.append(input[block_start : block_start + self.right_context_length])
	        # The last segment uses the trailing frames of the input.
	        blocks.append(input[total_length - self.right_context_length :])
	        return torch.cat(blocks)

	    def _gen_attention_mask_col_widths(self, seg_idx: int, utterance_length: int) -> List[int]:
	        r"""Compute the widths of the column groups of the attention mask for one segment.

	        Returns:
	            List[int]: six widths (before/at/after right context, before/at/after query
	            segment), preceded by three memory widths (before/at/after memory) when
	            memory is in use.
	        """
	        num_segs = math.ceil(utterance_length / self.segment_length)
	        rc = self.right_context_length
	        rc_start = seg_idx * rc
	        rc_end = rc_start + rc
	        rc_total = rc * num_segs
	        # The query segment's key range extends back by the left context length.
	        seg_start = max(seg_idx * self.segment_length - self.left_context_length, 0)
	        seg_end = min((seg_idx + 1) * self.segment_length, utterance_length)

	        context_widths = [
	            rc_start,  # before right context
	            rc,  # right context
	            rc_total - rc_end,  # after right context
	            seg_start,  # before query segment
	            seg_end - seg_start,  # query segment
	            utterance_length - seg_end,  # after query segment
	        ]
	        if not self.use_mem:
	            return context_widths

	        m_start = max(seg_idx - self.max_memory_size, 0)
	        mem_total = num_segs - 1
	        memory_widths = [
	            m_start,  # before memory
	            seg_idx - m_start,  # memory
	            mem_total - seg_idx,  # after memory
	        ]
	        return memory_widths + context_widths

	    def _gen_attention_mask(self, input: torch.Tensor) -> torch.Tensor:
	        r"""Build the boolean attention mask for training; ``True`` marks blocked positions.

	        Rows are ordered as right-context rows, then query rows, then (with memory
	        enabled) one summary row per segment.
	        """
	        utterance_length = input.size(0)
	        num_segs = math.ceil(utterance_length / self.segment_length)

	        if self.use_mem:
	            # Visible groups: memory, right context, query segment.
	            rc_q_cols_mask = [False, True, False, False, True, False, False, True, False]
	            # Summary rows see only right context and query segment.
	            s_cols_mask = [False, False, False, False, True, False, False, True, False]
	        else:
	            # Visible groups: right context, query segment.
	            rc_q_cols_mask = [False, True, False, False, True, False]
	            s_cols_mask = None

	        rc_rows: List[torch.Tensor] = []
	        query_rows: List[torch.Tensor] = []
	        summary_rows: List[torch.Tensor] = []
	        for seg_idx in range(num_segs):
	            col_widths = self._gen_attention_mask_col_widths(seg_idx, utterance_length)

	            rc_rows.append(
	                _gen_attention_mask_block(col_widths, rc_q_cols_mask, self.right_context_length, input.device)
	            )

	            # The final segment may be shorter than segment_length.
	            num_query_rows = min(self.segment_length, utterance_length - seg_idx * self.segment_length)
	            query_rows.append(_gen_attention_mask_block(col_widths, rc_q_cols_mask, num_query_rows, input.device))

	            if s_cols_mask is not None:
	                summary_rows.append(_gen_attention_mask_block(col_widths, s_cols_mask, 1, input.device))

	        if self.use_mem:
	            row_groups = [rc_rows, query_rows, summary_rows]
	        else:
	            row_groups = [rc_rows, query_rows]

	        visible = torch.cat([torch.cat(group) for group in row_groups])
	        # Blocks hold 1 for visible positions; the mask flags the opposite.
	        return (1 - visible).to(torch.bool)

	    def forward(self, input: torch.Tensor, lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
	        r"""Forward pass for training and non-streaming inference.

	        B: batch size;
	        T: max number of input frames in batch;
	        D: feature dimension of each frame.

	        Args:
	            input (torch.Tensor): utterance frames right-padded with right context frames, with
	                shape `(B, T + right_context_length, D)`.
	            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
	                number of valid utterance frames for i-th batch element in ``input``.

	        Returns:
	            (Tensor, Tensor):
	                Tensor
	                    output frames, with shape `(B, T, D)`.
	                Tensor
	                    output lengths, with shape `(B,)` and i-th element representing
	                    number of valid frames for i-th batch element in output frames.
	        """
	        # Work time-major internally: (B, T, D) -> (T, B, D).
	        frames = input.permute(1, 0, 2)
	        right_context = self._gen_right_context(frames)
	        utterance = frames[: frames.size(0) - self.right_context_length]
	        attention_mask = self._gen_attention_mask(utterance)

	        if self.use_mem:
	            # One pooled element per segment; the last one is dropped.
	            mems = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)[:-1]
	        else:
	            mems = torch.empty(0, dtype=frames.dtype, device=frames.device)

	        output = utterance
	        for layer in self.emformer_layers:
	            output, right_context, mems = layer(output, lengths, right_context, mems, attention_mask)
	        return output.permute(1, 0, 2), lengths

	    @torch.jit.export
	    def infer(
	        self,
	        input: torch.Tensor,
	        lengths: torch.Tensor,
	        states: Optional[List[List[torch.Tensor]]] = None,
	    ) -> Tuple[torch.Tensor, torch.Tensor, List[List[torch.Tensor]]]:
	        r"""Forward pass for streaming inference.

	        B: batch size;
	        D: feature dimension of each frame.

	        Args:
	            input (torch.Tensor): utterance frames right-padded with right context frames, with
	                shape `(B, segment_length + right_context_length, D)`.
	            lengths (torch.Tensor): with shape `(B,)` and i-th element representing
	                number of valid frames for i-th batch element in ``input``.
	            states (List[List[torch.Tensor]] or None, optional): list of lists of tensors
	                representing internal state generated in preceding invocation of ``infer``. (Default: ``None``)

	        Returns:
	            (Tensor, Tensor, List[List[Tensor]]):
	                Tensor
	                    output frames, with shape `(B, segment_length, D)`.
	                Tensor
	                    output lengths, with shape `(B,)` and i-th element representing
	                    number of valid frames for i-th batch element in output frames.
	                List[List[Tensor]]
	                    output states; list of lists of tensors representing internal state
	                    generated in current invocation of ``infer``.

	        Raises:
	            ValueError: if dimension 1 of ``input`` is not ``segment_length + right_context_length``.
	        """
	        if input.size(1) != self.segment_length + self.right_context_length:
	            raise ValueError(
	                "Per configured segment_length and right_context_length"
	                f", expected size of {self.segment_length + self.right_context_length} for dimension 1 of input"
	                f", but got {input.size(1)}."
	            )

	        # Work time-major internally: (B, T, D) -> (T, B, D).
	        frames = input.permute(1, 0, 2)
	        num_utterance_frames = frames.size(0) - self.right_context_length
	        utterance = frames[:num_utterance_frames]
	        right_context = frames[num_utterance_frames:]
	        # Right-context frames do not count towards the output lengths.
	        output_lengths = torch.clamp(lengths - self.right_context_length, min=0)

	        if self.use_mem:
	            mems = self.memory_op(utterance.permute(1, 2, 0)).permute(2, 0, 1)
	        else:
	            mems = torch.empty(0, dtype=frames.dtype, device=frames.device)

	        output = utterance
	        output_states: List[List[torch.Tensor]] = []
	        for layer_idx, layer in enumerate(self.emformer_layers):
	            output, right_context, output_state, mems = layer.infer(
	                output,
	                output_lengths,
	                right_context,
	                None if states is None else states[layer_idx],
	                mems,
	            )
	            output_states.append(output_state)

	        return output.permute(1, 0, 2), output_lengths, output_states


	class Emformer(_EmformerImpl):
	    r"""Emformer architecture introduced in
	    *Emformer: Efficient Memory Transformer Based Acoustic Model for Low Latency Streaming Speech Recognition*
	    :cite:`shi2021emformer`.

	    See Also:
	        * :func:`~torchaudio.models.emformer_rnnt_model`,
	          :func:`~torchaudio.models.emformer_rnnt_base`: factory functions.
	        * :class:`torchaudio.pipelines.RNNTBundle`: ASR pipelines with pretrained model.

	    Args:
	        input_dim (int): input dimension.
	        num_heads (int): number of attention heads in each Emformer layer.
	        ffn_dim (int): hidden layer dimension of each Emformer layer's feedforward network.
	        num_layers (int): number of Emformer layers to instantiate.
	        segment_length (int): length of each input segment.
	        dropout (float, optional): dropout probability. (Default: 0.0)
	        activation (str, optional): activation function to use in each Emformer layer's
	            feedforward network. Must be one of ("relu", "gelu", "silu"). (Default: "relu")
	        left_context_length (int, optional): length of left context. (Default: 0)
	        right_context_length (int, optional): length of right context. (Default: 0)
	        max_memory_size (int, optional): maximum number of memory elements to use. (Default: 0)
	        weight_init_scale_strategy (str or None, optional): per-layer weight initialization scaling
	            strategy. Must be one of ("depthwise", "constant", ``None``). (Default: "depthwise")
	        tanh_on_mem (bool, optional): if ``True``, applies tanh to memory elements. (Default: ``False``)
	        negative_inf (float, optional): value to use for negative infinity in attention weights. (Default: -1e8)

	    Examples:
	        >>> emformer = Emformer(512, 8, 2048, 20, 4, right_context_length=1)
	        >>> input = torch.rand(128, 400, 512)  # batch, num_frames, feature_dim
	        >>> lengths = torch.randint(1, 200, (128,))  # batch
	        >>> output, lengths = emformer(input, lengths)
	        >>> input = torch.rand(128, 5, 512)
	        >>> lengths = torch.ones(128) * 5
	        >>> output, lengths, states = emformer.infer(input, lengths, None)
	    """

	    def __init__(
	        self,
	        input_dim: int,
	        num_heads: int,
	        ffn_dim: int,
	        num_layers: int,
	        segment_length: int,
	        dropout: float = 0.0,
	        activation: str = "relu",
	        left_context_length: int = 0,
	        right_context_length: int = 0,
	        max_memory_size: int = 0,
	        weight_init_scale_strategy: Optional[str] = "depthwise",
	        tanh_on_mem: bool = False,
	        negative_inf: float = -1e8,
	    ):
	        # One gain per layer, so iterating the gains yields exactly num_layers layers.
	        layer_gains = _get_weight_init_gains(weight_init_scale_strategy, num_layers)

	        emformer_layers = torch.nn.ModuleList()
	        for gain in layer_gains:
	            emformer_layers.append(
	                _EmformerLayer(
	                    input_dim,
	                    num_heads,
	                    ffn_dim,
	                    segment_length,
	                    dropout=dropout,
	                    activation=activation,
	                    left_context_length=left_context_length,
	                    max_memory_size=max_memory_size,
	                    weight_init_gain=gain,
	                    tanh_on_mem=tanh_on_mem,
	                    negative_inf=negative_inf,
	                )
	            )

	        super().__init__(
	            emformer_layers,
	            segment_length,
	            left_context_length=left_context_length,
	            right_context_length=right_context_length,
	            max_memory_size=max_memory_size,
	        )