import math

import torch
import torch.nn as nn
import torch.nn.functional as F
class CNNPrenet(nn.Module):
    """Convolutional prenet: three Conv1d blocks with batch norm, ReLU, and
    dropout, followed by a tanh squashing the output into [-1, 1]."""

    def __init__(self):
        super().__init__()
        # Define the layers using a Sequential container
        self.conv_layers = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
        )

    def forward(self, x):
        # Add a channel dimension: (batch, time) -> (batch, 1, time)
        x = x.unsqueeze(1)
        # Pass the input through the convolutional layers: (batch, 512, time)
        x = self.conv_layers(x)
        # squeeze(1) only removes the channel dimension when it has size 1;
        # with 512 output channels the shape stays (batch, 512, time)
        x = x.squeeze(1)
        # Squash the output into the range [-1, 1]
        x = torch.tanh(x)
        return x
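
# Minimal usage sketch for CNNPrenet: it assumes the raw input is a
# (batch, time) tensor, since the first convolution expects a single
# input channel (this shape is an assumption, not documented above).
def _demo_cnn_prenet():
    prenet = CNNPrenet()
    x = torch.randn(4, 100)            # (batch, time), assumed layout
    out = prenet(x)                    # (batch, 512, time)
    assert out.shape == (4, 512, 100)
    return out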
class CNNDecoderPrenet(nn.Module):
    """Decoder prenet: two fully connected layers with ReLU and dropout,
    followed by a linear projection up to the model dimension."""

    def __init__(self, input_dim=80, hidden_dim=256, output_dim=256,
                 final_dim=512, dropout_rate=0.5):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, output_dim)
        # Linear projection up to the model dimension
        self.linear_projection = nn.Linear(output_dim, final_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # Move the feature dimension last:
        # (batch, features, time) -> (batch, time, features)
        x = x.transpose(1, 2)
        # Apply the fully connected layers with dropout
        x = F.relu(self.layer1(x))
        x = self.dropout(x)
        x = F.relu(self.layer2(x))
        x = self.dropout(x)
        # Project up to the model dimension
        x = self.linear_projection(x)
        # Restore the channels-first layout: (batch, final_dim, time)
        x = x.transpose(1, 2)
        return x
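
# Usage sketch for CNNDecoderPrenet: the default input_dim=80 suggests mel
# features, so a (batch, input_dim, time) input is assumed here.
def _demo_cnn_decoder_prenet():
    prenet = CNNDecoderPrenet()
    mel = torch.randn(4, 80, 50)       # (batch, input_dim, time), assumed
    out = prenet(mel)                  # (batch, final_dim, time)
    assert out.shape == (4, 512, 50)
    return out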
class CNNPostNet(nn.Module):
    """
    Convolutional postnet.

    Arguments
    ---------
    n_mel_channels: int
        input feature dimension for the convolution layers
    postnet_embedding_dim: int
        output feature dimension for the convolution layers
    postnet_kernel_size: int
        postnet convolution kernel size
    postnet_n_convolutions: int
        number of convolution layers
    postnet_dropout: float
        dropout probability for the postnet
    """

    def __init__(
        self,
        n_mel_channels=80,
        postnet_embedding_dim=512,
        postnet_kernel_size=5,
        postnet_n_convolutions=5,
        postnet_dropout=0.1,
    ):
        super().__init__()
        self.conv_pre = nn.Conv1d(
            in_channels=n_mel_channels,
            out_channels=postnet_embedding_dim,
            kernel_size=postnet_kernel_size,
            padding="same",
        )
        # One pre conv + (n - 2) intermediate convs + one post conv
        # gives postnet_n_convolutions layers in total
        self.convs_intermedite = nn.ModuleList()
        for _ in range(1, postnet_n_convolutions - 1):
            self.convs_intermedite.append(
                nn.Conv1d(
                    in_channels=postnet_embedding_dim,
                    out_channels=postnet_embedding_dim,
                    kernel_size=postnet_kernel_size,
                    padding="same",
                )
            )
        self.conv_post = nn.Conv1d(
            in_channels=postnet_embedding_dim,
            out_channels=n_mel_channels,
            kernel_size=postnet_kernel_size,
            padding="same",
        )
        self.tanh = nn.Tanh()
        self.ln1 = nn.LayerNorm(postnet_embedding_dim)
        self.ln2 = nn.LayerNorm(postnet_embedding_dim)
        self.ln3 = nn.LayerNorm(n_mel_channels)
        self.dropout1 = nn.Dropout(postnet_dropout)
        self.dropout2 = nn.Dropout(postnet_dropout)
        self.dropout3 = nn.Dropout(postnet_dropout)
    def forward(self, x):
        """Computes the forward pass.

        Arguments
        ---------
        x: torch.Tensor
            a (batch, n_mel_channels, time_steps) input tensor

        Returns
        -------
        output: torch.Tensor
            the predicted spectrogram features,
            shape (batch, n_mel_channels, time_steps)
        """
        x = self.conv_pre(x)
        # LayerNorm normalizes the last dimension, so permute to
        # (batch, time, features), normalize, and permute back
        x = self.ln1(x.permute(0, 2, 1)).permute(0, 2, 1)
        x = self.tanh(x)
        x = self.dropout1(x)
        # Stacked intermediate convolutions; normalization and activation
        # are applied once after the stack
        for conv in self.convs_intermedite:
            x = conv(x)
        x = self.ln2(x.permute(0, 2, 1)).permute(0, 2, 1)
        x = self.tanh(x)
        x = self.dropout2(x)
        # No tanh after the final convolution
        x = self.conv_post(x)
        x = self.ln3(x.permute(0, 2, 1)).permute(0, 2, 1)
        x = self.dropout3(x)
        return x
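
# Usage sketch for CNNPostNet. In Tacotron-style models the postnet output is
# typically added back to its input as a residual correction; that residual
# usage is an assumption here, not something this file states.
def _demo_cnn_postnet():
    postnet = CNNPostNet()
    mel = torch.randn(4, 80, 50)       # (batch, n_mel_channels, time)
    residual = postnet(mel)            # same shape as the input
    refined = mel + residual           # assumed residual refinement
    assert refined.shape == mel.shape
    return refined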
class ScaledPositionalEncoding(nn.Module):
    """
    This class implements the absolute sinusoidal positional encoding function
    with an adaptive weight parameter alpha.

    PE(pos, 2i)   = sin(pos / (10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos / (10000^(2i/dmodel)))

    Arguments
    ---------
    input_size: int
        Embedding dimension.
    max_len : int, optional
        Max length of the input sequences (default 2500).

    Example
    -------
    >>> a = torch.rand((8, 120, 512))
    >>> enc = ScaledPositionalEncoding(input_size=a.shape[-1])
    >>> b = enc(a)
    >>> b.shape
    torch.Size([1, 120, 512])
    """

    def __init__(self, input_size, max_len=2500):
        super().__init__()
        if input_size % 2 != 0:
            raise ValueError(
                f"Cannot use sin/cos positional encoding with odd channels (got channels={input_size})"
            )
        self.max_len = max_len
        # alpha is a trainable scale for the fixed sinusoidal encoding
        self.alpha = nn.Parameter(torch.ones(1))
        pe = torch.zeros(self.max_len, input_size)
        positions = torch.arange(0, self.max_len).unsqueeze(1).float()
        denominator = torch.exp(
            torch.arange(0, input_size, 2).float()
            * -(math.log(10000.0) / input_size)
        )
        pe[:, 0::2] = torch.sin(positions * denominator)
        pe[:, 1::2] = torch.cos(positions * denominator)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        Arguments
        ---------
        x : torch.Tensor
            Input feature of shape (batch, time, features); only its time
            dimension is used to slice the encoding.

        Returns
        -------
        The scaled positional encoding of shape (1, time, features); note
        that x itself is not added to the result.
        """
        # pe is a registered buffer (no gradient); alpha still receives
        # gradients through the multiplication
        return self.pe[:, : x.size(1)] * self.alpha
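
# Usage sketch for ScaledPositionalEncoding: since forward returns only the
# scaled encoding, the caller is assumed to add it to its features.
def _demo_scaled_positional_encoding():
    enc = ScaledPositionalEncoding(input_size=512)
    feats = torch.rand(8, 120, 512)    # (batch, time, features)
    out = feats + enc(feats)           # encoding broadcasts over the batch
    assert out.shape == (8, 120, 512)
    return out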