import numpy as np
import torch
import torch.nn as nn

from utils.model_util import max_with_lens, mean_with_lens


def embedding_pooling(x, lens, pooling="mean"):
    """Pool a padded [batch, time, dim] tensor `x` into [batch, dim] embeddings."""
    if pooling == "max":
        fc_embs = max_with_lens(x, lens)
    elif pooling == "mean":
        fc_embs = mean_with_lens(x, lens)
    elif pooling == "mean+max":
        x_mean = mean_with_lens(x, lens)
        x_max = max_with_lens(x, lens)
        fc_embs = x_mean + x_max
    elif pooling == "last":
        # Pick the embedding at each sequence's final unpadded time step.
        # indices: [batch, 1, dim]; assumes lens is an integer tensor on
        # the same device as x (required by torch.gather).
        indices = (lens - 1).reshape(-1, 1, 1).repeat(1, 1, x.size(-1))
        fc_embs = torch.gather(x, 1, indices).squeeze(1)
    else:
        raise ValueError(f"pooling method {pooling} not supported")
    return fc_embs


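# A minimal usage sketch (the dummy shapes are illustrative; it assumes
# max_with_lens / mean_with_lens pool over the time axis while masking
# positions beyond each length in `lens`):
#
#   x = torch.randn(4, 10, 256)           # padded batch: [batch, time, dim]
#   lens = torch.tensor([10, 7, 5, 2])    # true lengths before padding
#   emb = embedding_pooling(x, lens, "mean+max")  # -> [4, 256]

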
class BaseEncoder(nn.Module):
    """
    Encodes the given audio into an embedding.

    Base encoder class; it cannot be used directly.
    All encoders should inherit from this class.
    """

    def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
        super().__init__()
        self.spec_dim = spec_dim
        self.fc_feat_dim = fc_feat_dim
        self.attn_feat_dim = attn_feat_dim

    def forward(self, x):
        raise NotImplementedError
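
# A minimal sketch of a concrete encoder subclass (hypothetical: the
# input/output dict keys and the linear projection are illustrative
# assumptions, not the repo's actual interface):
#
#   class MeanPoolEncoder(BaseEncoder):
#       def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
#           super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
#           self.proj = nn.Linear(attn_feat_dim, fc_feat_dim)
#
#       def forward(self, input_dict):
#           attn_embs = self.proj(input_dict["attn_feat"])
#           fc_embs = embedding_pooling(
#               attn_embs, input_dict["attn_feat_len"], "mean")
#           return {"fc_emb": fc_embs, "attn_emb": attn_embs}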


class BaseDecoder(nn.Module):
    """
    Takes word/audio embeddings and outputs the next-word probabilities.
    """
    def __init__(self, emb_dim, vocab_size, fc_emb_dim,
                 attn_emb_dim, dropout=0.2, tie_weights=False):
        super().__init__()
        self.emb_dim = emb_dim
        self.vocab_size = vocab_size
        self.fc_emb_dim = fc_emb_dim
        self.attn_emb_dim = attn_emb_dim
        # Subclasses may tie the output projection to the embedding matrix.
        self.tie_weights = tie_weights
        self.word_embedding = nn.Embedding(vocab_size, emb_dim)
        self.in_dropout = nn.Dropout(dropout)

    def forward(self, x):
        raise NotImplementedError
    def load_word_embedding(self, weight, freeze=True):
        embedding = np.load(weight)
        assert embedding.shape[0] == self.vocab_size, "vocabulary size mismatch"
        assert embedding.shape[1] == self.emb_dim, "embedding size mismatch"
        # from_pretrained expects a tensor, so convert the loaded numpy array.
        embedding = torch.as_tensor(embedding).float()
        self.word_embedding = nn.Embedding.from_pretrained(embedding,
                                                           freeze=freeze)
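
# Usage sketch for load_word_embedding (path and sizes are hypothetical;
# the .npy file must hold a float array of shape [vocab_size, emb_dim],
# e.g. pretrained word vectors exported with np.save):
#
#   decoder = SomeDecoder(emb_dim=300, vocab_size=5000,    # a concrete subclass
#                         fc_emb_dim=512, attn_emb_dim=512)
#   decoder.load_word_embedding("data/word_embedding.npy", freeze=True)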