import numpy as np
import torch
import torch.nn as nn

from utils.model_util import max_with_lens, mean_with_lens


def embedding_pooling(x, lens, pooling="mean"):
    """Pool a padded embedding sequence into one vector per sample.

    Args:
        x: [batch_size, max_len, hidden], padded embedding sequence
        lens: [batch_size,] LongTensor of valid lengths, on the same device as x
        pooling: "max", "mean", "mean+max" (element-wise sum of the two),
            or "last" (embedding at the final valid time step)
    """
    if pooling == "max":
        fc_embs = max_with_lens(x, lens)
    elif pooling == "mean":
        fc_embs = mean_with_lens(x, lens)
    elif pooling == "mean+max":
        x_mean = mean_with_lens(x, lens)
        x_max = max_with_lens(x, lens)
        fc_embs = x_mean + x_max
    elif pooling == "last":
        # Gather the embedding at index lens - 1 for each sample
        indices = (lens - 1).reshape(-1, 1, 1).repeat(1, 1, x.size(-1))
        # indices: [batch_size, 1, hidden]
        fc_embs = torch.gather(x, 1, indices).squeeze(1)
    else:
        raise ValueError(f"pooling method {pooling} is not supported")
    return fc_embs


class BaseEncoder(nn.Module):
    """Encode the given audio into an embedding.

    Base encoder class; cannot be used directly.
    All encoders should inherit from this class.
    """

    def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
        super().__init__()
        self.spec_dim = spec_dim
        self.fc_feat_dim = fc_feat_dim
        self.attn_feat_dim = attn_feat_dim

    def forward(self, x):
        """
        Arguments:
            `x`: {
                (may contain)
                wav: [batch_size, n_samples],
                spec: [batch_size, n_frames, spec_dim],
                fc: [batch_size, fc_feat_dim],
                attn: [batch_size, attn_max_len, attn_feat_dim],
                attn_len: [batch_size,]
                ......
            }

        Returns:
            `encoded`: {
                fc_emb: [batch_size, fc_emb_dim],
                attn_emb: [batch_size, attn_max_len, attn_emb_dim],
                attn_emb_lens: [batch_size,]
            }
        """
        raise NotImplementedError


class BaseDecoder(nn.Module):
    """Take word/audio embeddings and output next-word probabilities."""

    def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
                 dropout=0.2, tie_weights=False):
        super().__init__()
        self.emb_dim = emb_dim
        self.vocab_size = vocab_size
        self.fc_emb_dim = fc_emb_dim
        self.attn_emb_dim = attn_emb_dim
        self.tie_weights = tie_weights
        self.word_embedding = nn.Embedding(vocab_size, emb_dim)
        self.in_dropout = nn.Dropout(dropout)

    def forward(self, x):
        raise NotImplementedError

    def load_word_embedding(self, weight, freeze=True):
        embedding = np.load(weight)
        assert embedding.shape[0] == self.vocab_size, "vocabulary size mismatch"
        assert embedding.shape[1] == self.emb_dim, "embed size mismatch"
        # nn.Embedding.from_pretrained expects a FloatTensor, not a numpy array
        embedding = torch.as_tensor(embedding).float()
        self.word_embedding = nn.Embedding.from_pretrained(embedding,
                                                           freeze=freeze)
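

# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original module):
# random tensors stand in for real features. "last" pooling is demonstrated
# because it only needs torch; "mean"/"max" additionally require
# utils.model_util to be importable.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    x = torch.randn(4, 10, 256)           # [batch_size, max_len, hidden]
    lens = torch.tensor([10, 7, 3, 9])    # valid length of each sequence
    last_embs = embedding_pooling(x, lens, pooling="last")
    print(last_embs.shape)                # torch.Size([4, 256])

    # Round-trip check for load_word_embedding: save a random
    # [vocab_size, emb_dim] matrix to a temporary .npy file and load it
    # back frozen. The dimensions here are arbitrary.
    import os
    import tempfile

    vocab_size, emb_dim = 100, 64
    decoder = BaseDecoder(emb_dim, vocab_size,
                          fc_emb_dim=256, attn_emb_dim=256)
    with tempfile.TemporaryDirectory() as tmp_dir:
        weight_path = os.path.join(tmp_dir, "word_emb.npy")
        np.save(weight_path,
                np.random.randn(vocab_size, emb_dim).astype("float32"))
        decoder.load_word_embedding(weight_path, freeze=True)
    assert not decoder.word_embedding.weight.requires_grad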