import numpy as np
import torch
import torch.nn as nn

from utils.model_util import max_with_lens, mean_with_lens


def embedding_pooling(x, lens, pooling="mean"):
    """Pool a padded [batch, time, dim] tensor `x` into [batch, dim] embeddings."""
    if pooling == "max":
        fc_embs = max_with_lens(x, lens)
    elif pooling == "mean":
        fc_embs = mean_with_lens(x, lens)
    elif pooling == "mean+max":
        x_mean = mean_with_lens(x, lens)
        x_max = max_with_lens(x, lens)
        fc_embs = x_mean + x_max
    elif pooling == "last":
        # Pick the embedding at each sequence's final unpadded time step.
        # indices: [batch, 1, dim]; assumes lens is an integer tensor on
        # the same device as x (required by torch.gather).
        indices = (lens - 1).reshape(-1, 1, 1).repeat(1, 1, x.size(-1))
        fc_embs = torch.gather(x, 1, indices).squeeze(1)
    else:
        raise ValueError(f"pooling method {pooling} not supported")
    return fc_embs


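# A minimal usage sketch (the dummy shapes are illustrative; it assumes
# max_with_lens / mean_with_lens pool over the time axis while masking
# positions beyond each length in `lens`):
#
#   x = torch.randn(4, 10, 256)           # padded batch: [batch, time, dim]
#   lens = torch.tensor([10, 7, 5, 2])    # true lengths before padding
#   emb = embedding_pooling(x, lens, "mean+max")  # -> [4, 256]

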
class BaseEncoder(nn.Module):
    """
    Encodes the given audio into an embedding.

    Base encoder class; it cannot be used directly.
    All encoders should inherit from this class.
    """

    def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
        super().__init__()
        self.spec_dim = spec_dim
        self.fc_feat_dim = fc_feat_dim
        self.attn_feat_dim = attn_feat_dim

    def forward(self, x):
        raise NotImplementedError
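
# A minimal sketch of a concrete encoder subclass (hypothetical: the
# input/output dict keys and the linear projection are illustrative
# assumptions, not the repo's actual interface):
#
#   class MeanPoolEncoder(BaseEncoder):
#       def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
#           super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)
#           self.proj = nn.Linear(attn_feat_dim, fc_feat_dim)
#
#       def forward(self, input_dict):
#           attn_embs = self.proj(input_dict["attn_feat"])
#           fc_embs = embedding_pooling(
#               attn_embs, input_dict["attn_feat_len"], "mean")
#           return {"fc_emb": fc_embs, "attn_emb": attn_embs}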


class BaseDecoder(nn.Module):
    """
    Takes word/audio embeddings and outputs the next-word probabilities.
    """
    def __init__(self, emb_dim, vocab_size, fc_emb_dim,
                 attn_emb_dim, dropout=0.2, tie_weights=False):
        super().__init__()
        self.emb_dim = emb_dim
        self.vocab_size = vocab_size
        self.fc_emb_dim = fc_emb_dim
        self.attn_emb_dim = attn_emb_dim
        # Subclasses may tie the output projection to the embedding matrix.
        self.tie_weights = tie_weights
        self.word_embedding = nn.Embedding(vocab_size, emb_dim)
        self.in_dropout = nn.Dropout(dropout)

    def forward(self, x):
        raise NotImplementedError
    def load_word_embedding(self, weight, freeze=True):
        embedding = np.load(weight)
        assert embedding.shape[0] == self.vocab_size, "vocabulary size mismatch"
        assert embedding.shape[1] == self.emb_dim, "embedding size mismatch"
        # from_pretrained expects a tensor, so convert the loaded numpy array.
        embedding = torch.as_tensor(embedding).float()
        self.word_embedding = nn.Embedding.from_pretrained(embedding,
                                                           freeze=freeze)
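
# Usage sketch for load_word_embedding (path and sizes are hypothetical;
# the .npy file must hold a float array of shape [vocab_size, emb_dim],
# e.g. pretrained word vectors exported with np.save):
#
#   decoder = SomeDecoder(emb_dim=300, vocab_size=5000,    # a concrete subclass
#                         fc_emb_dim=512, attn_emb_dim=512)
#   decoder.load_word_embedding("data/word_embedding.npy", freeze=True)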