import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import einsum

from tortoise.models.arch_util import AttentionBlock
from tortoise.models.xtransformers import ContinuousTransformerWrapper, Encoder


def exists(val):
    return val is not None


def masked_mean(t, mask):
    # Average over the sequence dimension, counting only unmasked positions.
    t = t.masked_fill(~mask, 0.)
    return t.sum(dim=1) / mask.sum(dim=1)
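# Illustration (assumption: added for clarity, not in the original file): for t of
# shape (B, T, C) and a boolean mask of the same shape, masked_mean collapses the
# sequence axis, e.g.
#   masked_mean(torch.ones(2, 4, 3), torch.ones(2, 4, 3).bool())  # -> (2, 3) tensor of ones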
class CollapsingTransformer(nn.Module):
    """Encodes a sequence and collapses it into a single vector per batch element
    via masked mean pooling."""

    def __init__(self, model_dim, output_dims, heads, dropout, depth, mask_percentage=0, **encoder_kwargs):
        super().__init__()
        self.transformer = ContinuousTransformerWrapper(
            max_seq_len=-1,
            use_pos_emb=False,
            attn_layers=Encoder(
                dim=model_dim,
                depth=depth,
                heads=heads,
                ff_dropout=dropout,
                ff_mult=1,
                attn_dropout=dropout,
                use_rmsnorm=True,
                ff_glu=True,
                rotary_pos_emb=True,
                **encoder_kwargs,
            ))
        self.pre_combiner = nn.Sequential(
            nn.Conv1d(model_dim, output_dims, 1),
            AttentionBlock(output_dims, num_heads=heads, do_checkpoint=False),
            nn.Conv1d(output_dims, output_dims, 1))
        self.mask_percentage = mask_percentage

    def forward(self, x, **transformer_kwargs):
        h = self.transformer(x, **transformer_kwargs)
        h = h.permute(0, 2, 1)
        h = self.pre_combiner(h).permute(0, 2, 1)
        if self.training:
            # Randomly exclude a fraction of positions from the pooled mean during training.
            mask = torch.rand_like(h.float()) > self.mask_percentage
        else:
            mask = torch.ones_like(h.float()).bool()
        return masked_mean(h, mask)
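# Usage sketch (assumption: illustrative only, not part of the original module):
#   ct = CollapsingTransformer(model_dim=512, output_dims=512, heads=8, dropout=0.1, depth=2)
#   pooled = ct(torch.randn(2, 100, 512))  # (batch, seq_len, model_dim) -> (2, 512)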
class ConvFormatEmbedding(nn.Module):
    """Embedding layer whose output is permuted to (batch, channels, seq_len),
    matching the layout produced by the Conv1d-based speech embedding path."""

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.emb = nn.Embedding(*args, **kwargs)

    def forward(self, x):
        y = self.emb(x)
        return y.permute(0, 2, 1)
class CVVP(nn.Module):
    """Contrastive model that aligns a conditioning mel clip with a speech mel clip
    in a shared latent space (CLIP-style)."""

    def __init__(
            self,
            model_dim=512,
            transformer_heads=8,
            dropout=.1,
            conditioning_enc_depth=8,
            cond_mask_percentage=0,
            mel_channels=80,
            mel_codes=None,
            speech_enc_depth=8,
            speech_mask_percentage=0,
            latent_multiplier=1,
    ):
        super().__init__()
        latent_dim = latent_multiplier * model_dim
        self.temperature = nn.Parameter(torch.tensor(1.))

        # Conditioning branch: downsample the mel spectrogram 4x, then encode and collapse.
        self.cond_emb = nn.Sequential(
            nn.Conv1d(mel_channels, model_dim // 2, kernel_size=5, stride=2, padding=2),
            nn.Conv1d(model_dim // 2, model_dim, kernel_size=3, stride=2, padding=1))
        self.conditioning_transformer = CollapsingTransformer(
            model_dim, model_dim, transformer_heads, dropout, conditioning_enc_depth, cond_mask_percentage)
        self.to_conditioning_latent = nn.Linear(
            latent_dim, latent_dim, bias=False)

        # Speech branch: embed either raw mel frames or discrete mel codes.
        if mel_codes is None:
            self.speech_emb = nn.Conv1d(
                mel_channels, model_dim, kernel_size=5, padding=2)
        else:
            self.speech_emb = ConvFormatEmbedding(mel_codes, model_dim)
        self.speech_transformer = CollapsingTransformer(
            model_dim, latent_dim, transformer_heads, dropout, speech_enc_depth, speech_mask_percentage)
        self.to_speech_latent = nn.Linear(
            latent_dim, latent_dim, bias=False)

    def get_grad_norm_parameter_groups(self):
        return {
            'conditioning': list(self.conditioning_transformer.parameters()),
            'speech': list(self.speech_transformer.parameters()),
        }
    def forward(
            self,
            mel_cond,
            mel_input,
            return_loss=False
    ):
        # Encode both clips into L2-normalized latents.
        cond_emb = self.cond_emb(mel_cond).permute(0, 2, 1)
        enc_cond = self.conditioning_transformer(cond_emb)
        cond_latents = self.to_conditioning_latent(enc_cond)

        speech_emb = self.speech_emb(mel_input).permute(0, 2, 1)
        enc_speech = self.speech_transformer(speech_emb)
        speech_latents = self.to_speech_latent(enc_speech)

        cond_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1),
                                           (cond_latents, speech_latents))
        temp = self.temperature.exp()

        if not return_loss:
            # Inference: per-pair cosine similarity, scaled by the learned temperature.
            sim = einsum('n d, n d -> n', cond_latents, speech_latents) * temp
            return sim

        # Training: symmetric cross-entropy (InfoNCE) over all pairings in the batch.
        sim = einsum('i d, j d -> i j', cond_latents, speech_latents) * temp
        labels = torch.arange(cond_latents.shape[0], device=mel_input.device)
        loss = (F.cross_entropy(sim, labels) +
                F.cross_entropy(sim.t(), labels)) / 2
        return loss
if __name__ == '__main__':
    cvvp = CVVP()
    # Training-style call: a batch of conditioning and speech mels -> scalar contrastive loss.
    cvvp(torch.randn(2, 80, 100),
         torch.randn(2, 80, 95),
         return_loss=True)
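    # Inference sketch (assumption: not part of the original demo): with return_loss=False
    # the forward pass returns one temperature-scaled cosine similarity per batch element,
    # which can be used to rank candidate clips against a conditioning clip.
    cvvp.eval()
    with torch.no_grad():
        scores = cvvp(torch.randn(2, 80, 100), torch.randn(2, 80, 95))
    print(scores.shape)  # torch.Size([2])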