import math

import torch
import torch.nn as nn
import torch.nn.functional as F
class CNNPrenet(nn.Module):
    """Convolutional prenet: three Conv1d blocks with batch norm, ReLU, and
    dropout, followed by a tanh squashing the output into [-1, 1]."""

    def __init__(self):
        super().__init__()
        # Define the layers using a Sequential container
        self.conv_layers = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Conv1d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
        )

    def forward(self, x):
        # Add a channel dimension: (batch, time) -> (batch, 1, time)
        x = x.unsqueeze(1)
        # Pass the input through the convolutional layers: (batch, 512, time)
        x = self.conv_layers(x)
        # squeeze(1) only removes the channel dimension when it has size 1;
        # with 512 output channels the shape stays (batch, 512, time)
        x = x.squeeze(1)
        # Squash the output into the range [-1, 1]
        x = torch.tanh(x)
        return x
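
# Minimal usage sketch for CNNPrenet: it assumes the raw input is a
# (batch, time) tensor, since the first convolution expects a single
# input channel (this shape is an assumption, not documented above).
def _demo_cnn_prenet():
    prenet = CNNPrenet()
    x = torch.randn(4, 100)            # (batch, time), assumed layout
    out = prenet(x)                    # (batch, 512, time)
    assert out.shape == (4, 512, 100)
    return out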
class CNNDecoderPrenet(nn.Module):
    """Decoder prenet: two fully connected layers with ReLU and dropout,
    followed by a linear projection up to the model dimension."""

    def __init__(self, input_dim=80, hidden_dim=256, output_dim=256,
                 final_dim=512, dropout_rate=0.5):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, output_dim)
        # Linear projection up to the model dimension
        self.linear_projection = nn.Linear(output_dim, final_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # Move the feature dimension last:
        # (batch, features, time) -> (batch, time, features)
        x = x.transpose(1, 2)
        # Apply the fully connected layers with dropout
        x = F.relu(self.layer1(x))
        x = self.dropout(x)
        x = F.relu(self.layer2(x))
        x = self.dropout(x)
        # Project up to the model dimension
        x = self.linear_projection(x)
        # Restore the channels-first layout: (batch, final_dim, time)
        x = x.transpose(1, 2)
        return x
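
# Usage sketch for CNNDecoderPrenet: the default input_dim=80 suggests mel
# features, so a (batch, input_dim, time) input is assumed here.
def _demo_cnn_decoder_prenet():
    prenet = CNNDecoderPrenet()
    mel = torch.randn(4, 80, 50)       # (batch, input_dim, time), assumed
    out = prenet(mel)                  # (batch, final_dim, time)
    assert out.shape == (4, 512, 50)
    return out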
class CNNPostNet(nn.Module):
    """
    Convolutional postnet.

    Arguments
    ---------
    n_mel_channels: int
        input feature dimension for the convolution layers
    postnet_embedding_dim: int
        output feature dimension for the convolution layers
    postnet_kernel_size: int
        postnet convolution kernel size
    postnet_n_convolutions: int
        number of convolution layers
    postnet_dropout: float
        dropout probability for the postnet
    """

    def __init__(
        self,
        n_mel_channels=80,
        postnet_embedding_dim=512,
        postnet_kernel_size=5,
        postnet_n_convolutions=5,
        postnet_dropout=0.1,
    ):
        super().__init__()
        self.conv_pre = nn.Conv1d(
            in_channels=n_mel_channels,
            out_channels=postnet_embedding_dim,
            kernel_size=postnet_kernel_size,
            padding="same",
        )
        # One pre conv + (n - 2) intermediate convs + one post conv
        # gives postnet_n_convolutions layers in total
        self.convs_intermedite = nn.ModuleList()
        for _ in range(1, postnet_n_convolutions - 1):
            self.convs_intermedite.append(
                nn.Conv1d(
                    in_channels=postnet_embedding_dim,
                    out_channels=postnet_embedding_dim,
                    kernel_size=postnet_kernel_size,
                    padding="same",
                )
            )
        self.conv_post = nn.Conv1d(
            in_channels=postnet_embedding_dim,
            out_channels=n_mel_channels,
            kernel_size=postnet_kernel_size,
            padding="same",
        )
        self.tanh = nn.Tanh()
        self.ln1 = nn.LayerNorm(postnet_embedding_dim)
        self.ln2 = nn.LayerNorm(postnet_embedding_dim)
        self.ln3 = nn.LayerNorm(n_mel_channels)
        self.dropout1 = nn.Dropout(postnet_dropout)
        self.dropout2 = nn.Dropout(postnet_dropout)
        self.dropout3 = nn.Dropout(postnet_dropout)
    def forward(self, x):
        """Computes the forward pass.

        Arguments
        ---------
        x: torch.Tensor
            a (batch, n_mel_channels, time_steps) input tensor

        Returns
        -------
        output: torch.Tensor
            the predicted spectrogram features,
            shape (batch, n_mel_channels, time_steps)
        """
        x = self.conv_pre(x)
        # LayerNorm normalizes the last dimension, so permute to
        # (batch, time, features), normalize, and permute back
        x = self.ln1(x.permute(0, 2, 1)).permute(0, 2, 1)
        x = self.tanh(x)
        x = self.dropout1(x)
        # Stacked intermediate convolutions; normalization and activation
        # are applied once after the stack
        for conv in self.convs_intermedite:
            x = conv(x)
        x = self.ln2(x.permute(0, 2, 1)).permute(0, 2, 1)
        x = self.tanh(x)
        x = self.dropout2(x)
        # No tanh after the final convolution
        x = self.conv_post(x)
        x = self.ln3(x.permute(0, 2, 1)).permute(0, 2, 1)
        x = self.dropout3(x)
        return x
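
# Usage sketch for CNNPostNet. In Tacotron-style models the postnet output is
# typically added back to its input as a residual correction; that residual
# usage is an assumption here, not something this file states.
def _demo_cnn_postnet():
    postnet = CNNPostNet()
    mel = torch.randn(4, 80, 50)       # (batch, n_mel_channels, time)
    residual = postnet(mel)            # same shape as the input
    refined = mel + residual           # assumed residual refinement
    assert refined.shape == mel.shape
    return refined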
class ScaledPositionalEncoding(nn.Module):
    """
    This class implements the absolute sinusoidal positional encoding function
    with an adaptive weight parameter alpha.

    PE(pos, 2i)   = sin(pos / (10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos / (10000^(2i/dmodel)))

    Arguments
    ---------
    input_size: int
        Embedding dimension.
    max_len : int, optional
        Max length of the input sequences (default 2500).

    Example
    -------
    >>> a = torch.rand((8, 120, 512))
    >>> enc = ScaledPositionalEncoding(input_size=a.shape[-1])
    >>> b = enc(a)
    >>> b.shape
    torch.Size([1, 120, 512])
    """

    def __init__(self, input_size, max_len=2500):
        super().__init__()
        if input_size % 2 != 0:
            raise ValueError(
                f"Cannot use sin/cos positional encoding with odd channels (got channels={input_size})"
            )
        self.max_len = max_len
        # alpha is a trainable scale for the fixed sinusoidal encoding
        self.alpha = nn.Parameter(torch.ones(1))
        pe = torch.zeros(self.max_len, input_size)
        positions = torch.arange(0, self.max_len).unsqueeze(1).float()
        denominator = torch.exp(
            torch.arange(0, input_size, 2).float()
            * -(math.log(10000.0) / input_size)
        )
        pe[:, 0::2] = torch.sin(positions * denominator)
        pe[:, 1::2] = torch.cos(positions * denominator)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        Arguments
        ---------
        x : torch.Tensor
            Input feature of shape (batch, time, features); only its time
            dimension is used to slice the encoding.

        Returns
        -------
        The scaled positional encoding of shape (1, time, features); note
        that x itself is not added to the result.
        """
        # pe is a registered buffer (no gradient); alpha still receives
        # gradients through the multiplication
        return self.pe[:, : x.size(1)] * self.alpha
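
# Usage sketch for ScaledPositionalEncoding: since forward returns only the
# scaled encoding, the caller is assumed to add it to its features.
def _demo_scaled_positional_encoding():
    enc = ScaledPositionalEncoding(input_size=512)
    feats = torch.rand(8, 120, 512)    # (batch, time, features)
    out = feats + enc(feats)           # encoding broadcasts over the batch
    assert out.shape == (8, 120, 512)
    return out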