Spaces:

myhanhhyugen
/

TTSDemoApp

Paused

File size: 6,582 Bytes

dc9eaa3


import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class CNNPrenet(torch.nn.Module):
    """
    Convolutional pre-net: three Conv1d stages followed by a tanh.

    Every stage is Conv1d(kernel_size=3, padding=1) -> BatchNorm1d -> ReLU ->
    Dropout(0.1) with 512 output channels. The first stage takes a single
    input channel, the other two take 512. Kernel size 3 with padding 1
    keeps the length of the last dimension unchanged.
    """

    def __init__(self):
        super().__init__()

        # The stages are appended in order, so the Sequential indices (0..11)
        # and therefore the state_dict keys match a hand-written layer list.
        stages = []
        in_channels = 1
        for _ in range(3):
            stages.extend(
                [
                    nn.Conv1d(in_channels=in_channels, out_channels=512, kernel_size=3, padding=1),
                    nn.BatchNorm1d(512),
                    nn.ReLU(),
                    nn.Dropout(0.1),
                ]
            )
            in_channels = 512
        self.conv_layers = nn.Sequential(*stages)

    def forward(self, x):
        """Computes the forward pass

        Arguments
        ---------
        x: torch.Tensor
            a (batch, length) input tensor; a singleton channel axis is
            inserted at dim 1 before the convolutions.
        Returns
        -------
        output: torch.Tensor
            a (batch, 512, length) tensor. The tanh input comes out of
            ReLU + Dropout and is therefore non-negative, so the values lie
            in [0, 1).
        """
        # Conv1d expects (batch, channels, length); add the single input channel.
        x = x.unsqueeze(1)

        x = self.conv_layers(x)

        # No squeeze here: dim 1 now holds the 512 feature channels, so a
        # squeeze(1) could never remove it.
        return torch.tanh(x)



class CNNDecoderPrenet(nn.Module):
    """
    Decoder pre-net: two ReLU + dropout linear layers and a linear projection.

    Arguments
    ---------
    input_dim: int
        size of the feature axis (dim 1) of the input
    hidden_dim: int
        output size of the first linear layer
    output_dim: int
        output size of the second linear layer
    final_dim: int
        output size of the final linear projection
    dropout_rate: float
        dropout probability applied after each of the two ReLU layers
    """

    def __init__(self, input_dim=80, hidden_dim=256, output_dim=256, final_dim=512, dropout_rate=0.5):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, output_dim)
        # Maps the pre-net output up to the final feature size.
        self.linear_projection = nn.Linear(output_dim, final_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Computes the forward pass

        Arguments
        ---------
        x: torch.Tensor
            a (batch, input_dim, time_steps) input tensor
        Returns
        -------
        output: torch.Tensor
            a (batch, final_dim, time_steps) tensor
        """
        # nn.Linear acts on the last axis, so move the features there first.
        hidden = x.transpose(1, 2)

        for linear in (self.layer1, self.layer2):
            hidden = self.dropout(F.relu(linear(hidden)))

        projected = self.linear_projection(hidden)

        # Restore the channels-first layout of the input.
        return projected.transpose(1, 2)




class CNNPostNet(torch.nn.Module):
    """
    Conv Postnet

    A stack of "same"-padded 1-D convolutions: one input convolution,
    ``postnet_n_convolutions - 2`` intermediate convolutions and one output
    convolution that maps back to ``n_mel_channels``.

    Arguments
    ---------
    n_mel_channels: int
       input feature dimension for convolution layers
    postnet_embedding_dim: int
       output feature dimension for convolution layers
    postnet_kernel_size: int
       postnet convolution kernel size
    postnet_n_convolutions: int
       number of convolution layers
    postnet_dropout: float
        dropout probability for postnet
    """

    def __init__(
        self,
        n_mel_channels=80,
        postnet_embedding_dim=512,
        postnet_kernel_size=5,
        postnet_n_convolutions=5,
        postnet_dropout=0.1,
    ):
        super().__init__()

        self.conv_pre = nn.Conv1d(
            in_channels=n_mel_channels,
            out_channels=postnet_embedding_dim,
            kernel_size=postnet_kernel_size,
            padding="same",
        )

        # The attribute name (including its spelling) is part of the
        # state_dict keys, so it is kept as-is.
        # The first and last convolutions are conv_pre / conv_post, which
        # leaves postnet_n_convolutions - 2 layers in between.
        self.convs_intermedite = nn.ModuleList(
            [
                nn.Conv1d(
                    in_channels=postnet_embedding_dim,
                    out_channels=postnet_embedding_dim,
                    kernel_size=postnet_kernel_size,
                    padding="same",
                )
                for _ in range(1, postnet_n_convolutions - 1)
            ]
        )

        self.conv_post = nn.Conv1d(
            in_channels=postnet_embedding_dim,
            out_channels=n_mel_channels,
            kernel_size=postnet_kernel_size,
            padding="same",
        )

        self.tanh = nn.Tanh()
        self.ln1 = nn.LayerNorm(postnet_embedding_dim)
        self.ln2 = nn.LayerNorm(postnet_embedding_dim)
        self.ln3 = nn.LayerNorm(n_mel_channels)
        self.dropout1 = nn.Dropout(postnet_dropout)
        self.dropout2 = nn.Dropout(postnet_dropout)
        self.dropout3 = nn.Dropout(postnet_dropout)

    @staticmethod
    def _channel_layer_norm(norm, x):
        """Applies a LayerNorm over the channel axis of a (batch, channels, time) tensor."""
        # LayerNorm normalizes the last axis, so move channels there and back.
        return norm(x.permute(0, 2, 1)).permute(0, 2, 1)

    def forward(self, x):
        """Computes the forward pass

        Arguments
        ---------
        x: torch.Tensor
            a (batch, n_mel_channels, time_steps) input tensor; the
            convolutions treat dim 1 as the channel axis.
        Returns
        -------
        output: torch.Tensor (the spectrogram predicted)
            a (batch, n_mel_channels, time_steps) tensor
        """
        x = self.conv_pre(x)
        x = self._channel_layer_norm(self.ln1, x)
        x = self.tanh(x)
        x = self.dropout1(x)

        # The intermediate convolutions run back-to-back; normalization,
        # tanh and dropout are applied once, after the whole stack.
        # NOTE(review): per-layer norm/activation may have been intended, but
        # changing it would alter the output of already-trained weights.
        for conv in self.convs_intermedite:
            x = conv(x)
        x = self._channel_layer_norm(self.ln2, x)
        x = self.tanh(x)
        x = self.dropout2(x)

        # No tanh on the output stage.
        x = self.conv_post(x)
        x = self._channel_layer_norm(self.ln3, x)
        x = self.dropout3(x)

        return x


class ScaledPositionalEncoding(nn.Module):
    """
    This class implements the absolute sinusoidal positional encoding function
    with an adaptive weight parameter alpha.

    PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))

    The module returns only ``alpha * PE`` for the first ``time`` positions;
    it does not add the encoding to its input.

    Arguments
    ---------
    input_size: int
        Embedding dimension. Must be even.
    max_len : int, optional
        Max length of the input sequences (default 2500).
    Example
    -------
    >>> a = torch.rand((8, 120, 512))
    >>> enc = ScaledPositionalEncoding(input_size=a.shape[-1])
    >>> b = enc(a)
    >>> b.shape
    torch.Size([1, 120, 512])
    """

    def __init__(self, input_size, max_len=2500):
        super().__init__()
        if input_size % 2 != 0:
            raise ValueError(
                f"Cannot use sin/cos positional encoding with odd channels (got channels={input_size})"
            )
        self.max_len = max_len
        # Trainable scalar weight of the positional encoding, initialized to 1.
        self.alpha = nn.Parameter(torch.ones(1))

        positions = torch.arange(0, self.max_len).unsqueeze(1).float()
        # exp(-2i * ln(10000) / d) == 1 / 10000^(2i/d), one value per sin/cos pair.
        denominator = torch.exp(
            torch.arange(0, input_size, 2).float()
            * -(math.log(10000.0) / input_size)
        )

        pe = torch.zeros(self.max_len, input_size)
        pe[:, 0::2] = torch.sin(positions * denominator)
        pe[:, 1::2] = torch.cos(positions * denominator)

        # Stored as a (1, max_len, input_size) buffer: it moves with the
        # module across devices and is saved in the state_dict, but is not a
        # trainable parameter.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """
        Arguments
        ---------
        x : tensor
            Input feature shape (batch, time, fea). Only ``x.size(1)`` is
            read; the values of ``x`` are not used.
        Returns
        -------
        pe_scaled : tensor
            Tensor of shape (1, min(time, max_len), fea) holding the encoding
            scaled by alpha. Sequences longer than ``max_len`` get an
            encoding truncated to ``max_len`` positions.
        """
        # The multiplication already allocates a new tensor, so the buffer
        # does not need to be cloned first.
        return self.pe[:, : x.size(1)].detach() * self.alpha