import numpy as np
import torch
import torch.nn as nn
from utils.model_util import max_with_lens, mean_with_lens


def embedding_pooling(x, lens, pooling="mean"):
    """Pool frame-level embeddings `x` [N, T, emb_dim] into clip-level
    embeddings [N, emb_dim], respecting the valid lengths in `lens`."""
    if pooling == "max":
        fc_embs = max_with_lens(x, lens)
    elif pooling == "mean":
        fc_embs = mean_with_lens(x, lens)
    elif pooling == "mean+max":
        x_mean = mean_with_lens(x, lens)
        x_max = max_with_lens(x, lens)
        fc_embs = x_mean + x_max
    elif pooling == "last":
        indices = (lens - 1).reshape(-1, 1, 1).repeat(1, 1, x.size(-1))
        # indices: [N, 1, hidden], pointing at the last valid frame
        fc_embs = torch.gather(x, 1, indices).squeeze(1)
    else:
        raise ValueError(f"pooling method {pooling} not supported")
    return fc_embs
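
# Example usage (a minimal sketch with made-up shapes; assumes
# `mean_with_lens` / `max_with_lens` perform length-masked pooling over dim 1):
# >>> x = torch.randn(4, 10, 256)          # [N, T, emb_dim]
# >>> lens = torch.tensor([10, 7, 5, 9])   # valid length of each sequence
# >>> embedding_pooling(x, lens, "mean+max").shape
# torch.Size([4, 256])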


class BaseEncoder(nn.Module):
    """
    Encode the given audio into an embedding.
    Base encoder class, cannot be called directly.
    All encoders should inherit from this class.
    """

    def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
        super(BaseEncoder, self).__init__()
        self.spec_dim = spec_dim
        self.fc_feat_dim = fc_feat_dim
        self.attn_feat_dim = attn_feat_dim

    def forward(self, x):
        #########################
        # Arguments:
        #   `x`: {
        #       (may contain)
        #       wav: [batch_size, n_samples],
        #       spec: [batch_size, n_frames, spec_dim],
        #       fc: [batch_size, fc_feat_dim],
        #       attn: [batch_size, attn_max_len, attn_feat_dim],
        #       attn_len: [batch_size,]
        #       ......
        #   }
        #
        # Returns:
        #   `encoded`: {
        #       fc_emb: [batch_size, fc_emb_dim],
        #       attn_emb: [batch_size, attn_max_len, attn_emb_dim],
        #       attn_emb_lens: [batch_size,]
        #   }
        #########################
        raise NotImplementedError
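

# A minimal concrete encoder, included purely to illustrate the interface
# above (hypothetical, not part of the original file): it passes the
# precomputed attention features through unchanged and mean-pools them into
# the clip-level `fc_emb`.
class ExamplePoolingEncoder(BaseEncoder):

    def __init__(self, spec_dim, fc_feat_dim, attn_feat_dim):
        super().__init__(spec_dim, fc_feat_dim, attn_feat_dim)

    def forward(self, x):
        attn_emb = x["attn"]
        attn_emb_lens = x["attn_len"]
        fc_emb = embedding_pooling(attn_emb, attn_emb_lens, pooling="mean")
        return {
            "fc_emb": fc_emb,
            "attn_emb": attn_emb,
            "attn_emb_lens": attn_emb_lens
        }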


class BaseDecoder(nn.Module):
    """
    Take word/audio embeddings and output the next-word probabilities.
    """

    def __init__(self, emb_dim, vocab_size, fc_emb_dim,
                 attn_emb_dim, dropout=0.2, tie_weights=False):
        super().__init__()
        self.emb_dim = emb_dim
        self.vocab_size = vocab_size
        self.fc_emb_dim = fc_emb_dim
        self.attn_emb_dim = attn_emb_dim
        self.tie_weights = tie_weights
        self.word_embedding = nn.Embedding(vocab_size, emb_dim)
        self.in_dropout = nn.Dropout(dropout)

    def forward(self, x):
        raise NotImplementedError

    def load_word_embedding(self, weight, freeze=True):
        embedding = np.load(weight)
        assert embedding.shape[0] == self.vocab_size, "vocabulary size mismatch"
        assert embedding.shape[1] == self.emb_dim, "embed size mismatch"
        # `nn.Embedding.from_pretrained` expects a tensor, so convert the
        # loaded numpy array before replacing the embedding layer
        embedding = torch.as_tensor(embedding).float()
        self.word_embedding = nn.Embedding.from_pretrained(embedding,
                                                           freeze=freeze)
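

if __name__ == "__main__":
    # Smoke test for the embedding loader (a sketch, not part of the original
    # file): a toy [vocab_size, emb_dim] array is written to a temporary
    # .npy file and loaded back into the decoder.
    import os
    import tempfile

    decoder = BaseDecoder(emb_dim=256, vocab_size=100,
                          fc_emb_dim=512, attn_emb_dim=512)
    weight_path = os.path.join(tempfile.gettempdir(), "toy_word_embedding.npy")
    np.save(weight_path, np.random.randn(100, 256).astype("float32"))
    decoder.load_word_embedding(weight_path, freeze=True)
    print(decoder.word_embedding.weight.shape)  # torch.Size([100, 256])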