# -*- coding: utf-8 -*-

# Copyright 2019 Tomoki Hayashi
# MIT License (https://opensource.org/licenses/MIT)

"""Parallel WaveGAN Modules."""

import logging
import math

import torch
from torch import nn

from modules.vocoder.parallel_wavegan.layers import Conv1d
from modules.vocoder.parallel_wavegan.layers import Conv1d1x1
from modules.vocoder.parallel_wavegan.layers import ResidualBlock
from modules.vocoder.parallel_wavegan.layers import upsample
from modules.vocoder.parallel_wavegan import models
from modules.vocoder.parallel_wavegan.models import SourceModuleCycNoise_v1
from utils.commons.hparams import hparams
import numpy as np


class ParallelWaveGANGenerator(torch.nn.Module):
    """Parallel WaveGAN Generator module."""

    def __init__(self,
                 in_channels=1,
                 out_channels=1,
                 kernel_size=3,
                 layers=30,
                 stacks=3,
                 residual_channels=64,
                 gate_channels=128,
                 skip_channels=64,
                 aux_channels=80,
                 aux_context_window=2,
                 dropout=0.0,
                 bias=True,
                 use_weight_norm=True,
                 use_causal_conv=False,
                 upsample_conditional_features=True,
                 upsample_net="ConvInUpsampleNetwork",
                 upsample_params={"upsample_scales": [4, 4, 4, 4]},
                 use_pitch_embed=False,
                 use_nsf=False,
                 sample_rate=22050,
                 ):
"""Initialize Parallel WaveGAN Generator module. | |
Args: | |
in_channels (int): Number of input channels. | |
out_channels (int): Number of output channels. | |
kernel_size (int): Kernel size of dilated convolution. | |
layers (int): Number of residual block layers. | |
stacks (int): Number of stacks i.e., dilation cycles. | |
residual_channels (int): Number of channels in residual conv. | |
gate_channels (int): Number of channels in gated conv. | |
skip_channels (int): Number of channels in skip conv. | |
aux_channels (int): Number of channels for auxiliary feature conv. | |
aux_context_window (int): Context window size for auxiliary feature. | |
dropout (float): Dropout rate. 0.0 means no dropout applied. | |
bias (bool): Whether to use bias parameter in conv layer. | |
use_weight_norm (bool): Whether to use weight norm. | |
If set to true, it will be applied to all of the conv layers. | |
use_causal_conv (bool): Whether to use causal structure. | |
upsample_conditional_features (bool): Whether to use upsampling network. | |
upsample_net (str): Upsampling network architecture. | |
upsample_params (dict): Upsampling network parameters. | |
""" | |
        super(ParallelWaveGANGenerator, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.aux_channels = aux_channels
        self.layers = layers
        self.stacks = stacks
        self.kernel_size = kernel_size

        # check the number of layers and stacks
        assert layers % stacks == 0
        layers_per_stack = layers // stacks

        # define first convolution
        self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)

        # define conv + upsampling network
        self.aux_context_window = aux_context_window
        if upsample_conditional_features:
            upsample_params.update({
                "use_causal_conv": use_causal_conv,
            })
            if upsample_net == "MelGANGenerator":
                assert aux_context_window == 0
                upsample_params.update({
                    "use_weight_norm": False,  # not to apply twice
                    "use_final_nonlinear_activation": False,
                })
                self.upsample_net = getattr(models, upsample_net)(**upsample_params)
            else:
                if upsample_net == "ConvInUpsampleNetwork":
                    upsample_params.update({
                        "aux_channels": aux_channels,
                        "aux_context_window": aux_context_window,
                    })
                self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
        else:
            self.upsample_net = None

        # define residual blocks
        self.conv_layers = torch.nn.ModuleList()
        for layer in range(layers):
            dilation = 2 ** (layer % layers_per_stack)
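            # the dilation above cycles through 1, 2, 4, ..., 2 ** (layers_per_stack - 1)
            # within each stack (1..512 with the default 30 layers / 3 stacks)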
            conv = ResidualBlock(
                kernel_size=kernel_size,
                residual_channels=residual_channels,
                gate_channels=gate_channels,
                skip_channels=skip_channels,
                aux_channels=aux_channels,
                dilation=dilation,
                dropout=dropout,
                bias=bias,
                use_causal_conv=use_causal_conv,
            )
            self.conv_layers += [conv]

        # define output layers
        self.last_conv_layers = torch.nn.ModuleList([
            torch.nn.ReLU(inplace=True),
            Conv1d1x1(skip_channels, skip_channels, bias=True),
            torch.nn.ReLU(inplace=True),
            Conv1d1x1(skip_channels, out_channels, bias=True),
        ])

        self.use_pitch_embed = use_pitch_embed
        if use_pitch_embed:
            self.pitch_embed = nn.Embedding(300, aux_channels, 0)
            self.c_proj = nn.Linear(2 * aux_channels, aux_channels)
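            # pitch ids (vocabulary of 300, index 0 = padding) are embedded to aux_channels,
            # concatenated with the mel condition and projected back to aux_channels in forward()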
        self.use_nsf = use_nsf
        if use_nsf:
            self.harmonic_num = 8
            hop_size = np.prod(upsample_params['upsample_scales'])
            self.f0_upsamp = torch.nn.Upsample(scale_factor=hop_size)
            self.m_source = SourceModuleCycNoise_v1(sample_rate, 0.003)
            self.nsf_conv = nn.Sequential(nn.Conv1d(1, aux_channels, 1), torch.nn.Tanh())
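            # NSF-style conditioning: in forward(), f0 is upsampled by hop_size to sample
            # resolution and projected to aux_channels by nsf_conv, then added to the
            # upsampled condition; m_source is a cyclic-noise source module kept for NSF
            # variants and is not called in this forward pass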

        # apply weight norm
        if use_weight_norm:
            self.apply_weight_norm()

    def forward(self, x, c=None, pitch=None, f0=None, **kwargs):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input noise signal (B, C_in, T). If None, it is sampled from a standard Gaussian.
            c (Tensor): Local conditioning auxiliary features (B, C, T').
            pitch (Tensor): Local conditioning pitch ids (B, T'), used when use_pitch_embed is True.
            f0 (Tensor): Local conditioning f0 (B, T'), used when use_nsf is True.

        Returns:
            Tensor: Output tensor (B, C_out, T)

        """
        # perform upsampling
        if c is not None and self.upsample_net is not None:
            if self.use_pitch_embed:
                p = self.pitch_embed(pitch)
                c = self.c_proj(torch.cat([c.transpose(1, 2), p], -1)).transpose(1, 2)
            c = self.upsample_net(c)
            if self.use_nsf:
                f0_upsample = self.f0_upsamp(
                    f0[:, None, :][:, :, self.aux_context_window:-self.aux_context_window])
                f0_upsample = self.nsf_conv(f0_upsample)
                c = c + f0_upsample
            if x is None:
                x = torch.randn([c.size(0), 1, c.size(-1)]).to(c.device)
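            # after upsampling, the condition must match the waveform length sample-for-sample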
            assert c.size(-1) == x.size(-1), (c.size(-1), x.size(-1))

        # encode to hidden representation
        x = self.first_conv(x)
        skips = 0
        for f in self.conv_layers:
            x, h = f(x, c)
            skips += h
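        # scale by sqrt(1/N) so the variance of the summed skip activations stays roughly
        # independent of the number of residual blocks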
        skips *= math.sqrt(1.0 / len(self.conv_layers))

        # apply final layers
        x = skips
        for f in self.last_conv_layers:
            x = f(x)

        return x

    def remove_weight_norm(self):
        """Remove weight normalization module from all of the layers."""
        def _remove_weight_norm(m):
            try:
                logging.debug(f"Weight norm is removed from {m}.")
                torch.nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return

        self.apply(_remove_weight_norm)

    def apply_weight_norm(self):
        """Apply weight normalization module to all of the layers."""
        def _apply_weight_norm(m):
            if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
                torch.nn.utils.weight_norm(m)
                logging.debug(f"Weight norm is applied to {m}.")

        self.apply(_apply_weight_norm)

    @staticmethod
    def _get_receptive_field_size(layers, stacks, kernel_size,
                                  dilation=lambda x: 2 ** x):
        assert layers % stacks == 0
        layers_per_cycle = layers // stacks
        dilations = [dilation(i % layers_per_cycle) for i in range(layers)]
        return (kernel_size - 1) * sum(dilations) + 1
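
    # Worked example with the default generator settings (layers=30, stacks=3, kernel_size=3):
    # each 10-layer cycle uses dilations 1, 2, 4, ..., 512 summing to 1023, so the total
    # dilation sum is 3 * 1023 = 3069 and the receptive field is 2 * 3069 + 1 = 6139 samples.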

    def receptive_field_size(self):
        """Return receptive field size."""
        return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)


class ParallelWaveGANDiscriminator(torch.nn.Module):
    """Parallel WaveGAN Discriminator module."""

    def __init__(self,
                 in_channels=1,
                 out_channels=1,
                 kernel_size=3,
                 layers=10,
                 conv_channels=64,
                 dilation_factor=1,
                 nonlinear_activation="LeakyReLU",
                 nonlinear_activation_params={"negative_slope": 0.2},
                 bias=True,
                 use_weight_norm=True,
                 ):
"""Initialize Parallel WaveGAN Discriminator module. | |
Args: | |
in_channels (int): Number of input channels. | |
out_channels (int): Number of output channels. | |
kernel_size (int): Number of output channels. | |
layers (int): Number of conv layers. | |
conv_channels (int): Number of chnn layers. | |
dilation_factor (int): Dilation factor. For example, if dilation_factor = 2, | |
the dilation will be 2, 4, 8, ..., and so on. | |
nonlinear_activation (str): Nonlinear function after each conv. | |
nonlinear_activation_params (dict): Nonlinear function parameters | |
bias (bool): Whether to use bias parameter in conv. | |
use_weight_norm (bool) Whether to use weight norm. | |
If set to true, it will be applied to all of the conv layers. | |
""" | |
        super(ParallelWaveGANDiscriminator, self).__init__()
        assert (kernel_size - 1) % 2 == 0, "Even kernel sizes are not supported."
        assert dilation_factor > 0, "Dilation factor must be > 0."

        self.conv_layers = torch.nn.ModuleList()
        conv_in_channels = in_channels
        for i in range(layers - 1):
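            # first layer uses dilation 1; later layers dilate linearly with the layer
            # index i when dilation_factor == 1, or geometrically as dilation_factor ** i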
            if i == 0:
                dilation = 1
            else:
                dilation = i if dilation_factor == 1 else dilation_factor ** i
                conv_in_channels = conv_channels
            padding = (kernel_size - 1) // 2 * dilation
            conv_layer = [
                Conv1d(conv_in_channels, conv_channels,
                       kernel_size=kernel_size, padding=padding,
                       dilation=dilation, bias=bias),
                getattr(torch.nn, nonlinear_activation)(inplace=True, **nonlinear_activation_params)
            ]
            self.conv_layers += conv_layer
        padding = (kernel_size - 1) // 2
        last_conv_layer = Conv1d(
            conv_in_channels, out_channels,
            kernel_size=kernel_size, padding=padding, bias=bias)
        self.conv_layers += [last_conv_layer]

        # apply weight norm
        if use_weight_norm:
            self.apply_weight_norm()

    def forward(self, x, cond=None):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input waveform (B, 1, T).
            cond (Tensor): Frame-level conditioning features (B, H, T_frame).

        Returns:
            Tensor: Output tensor (B, 1, T)

        """
        cond_layer_i = len(self.conv_layers) // 2
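        # the frame-level condition is injected once, at the middle conv layer: it is trimmed
        # by the auxiliary context window and each frame is repeated hop_size times so that it
        # matches the sample-level resolution of x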
        for i, f in enumerate(self.conv_layers):
            if i == cond_layer_i and cond is not None:
                aux_context_window = hparams['aux_context_window']
                cond = cond[:, :, aux_context_window:-aux_context_window]
                cond = cond[:, :, :, None].repeat([1, 1, 1, hparams['hop_size']]).reshape(
                    cond.shape[0], cond.shape[1], -1)
                x = x + cond
            x = f(x)
        return x

    def apply_weight_norm(self):
        """Apply weight normalization module to all of the layers."""
        def _apply_weight_norm(m):
            if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
                torch.nn.utils.weight_norm(m)
                logging.debug(f"Weight norm is applied to {m}.")

        self.apply(_apply_weight_norm)

    def remove_weight_norm(self):
        """Remove weight normalization module from all of the layers."""
        def _remove_weight_norm(m):
            try:
                logging.debug(f"Weight norm is removed from {m}.")
                torch.nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return

        self.apply(_remove_weight_norm)


class ResidualParallelWaveGANDiscriminator(torch.nn.Module):
    """Residual Parallel WaveGAN Discriminator module."""

    def __init__(self,
                 in_channels=1,
                 out_channels=1,
                 kernel_size=3,
                 layers=30,
                 stacks=3,
                 residual_channels=64,
                 gate_channels=128,
                 skip_channels=64,
                 dropout=0.0,
                 bias=True,
                 use_weight_norm=True,
                 use_causal_conv=False,
                 nonlinear_activation="LeakyReLU",
                 nonlinear_activation_params={"negative_slope": 0.2},
                 ):
"""Initialize Parallel WaveGAN Discriminator module. | |
Args: | |
in_channels (int): Number of input channels. | |
out_channels (int): Number of output channels. | |
kernel_size (int): Kernel size of dilated convolution. | |
layers (int): Number of residual block layers. | |
stacks (int): Number of stacks i.e., dilation cycles. | |
residual_channels (int): Number of channels in residual conv. | |
gate_channels (int): Number of channels in gated conv. | |
skip_channels (int): Number of channels in skip conv. | |
dropout (float): Dropout rate. 0.0 means no dropout applied. | |
bias (bool): Whether to use bias parameter in conv. | |
use_weight_norm (bool): Whether to use weight norm. | |
If set to true, it will be applied to all of the conv layers. | |
use_causal_conv (bool): Whether to use causal structure. | |
nonlinear_activation_params (dict): Nonlinear function parameters | |
""" | |
super(ResidualParallelWaveGANDiscriminator, self).__init__() | |
assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." | |
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.layers = layers
        self.stacks = stacks
        self.kernel_size = kernel_size

        # check the number of layers and stacks
        assert layers % stacks == 0
        layers_per_stack = layers // stacks

        # define first convolution
        self.first_conv = torch.nn.Sequential(
            Conv1d1x1(in_channels, residual_channels, bias=True),
            getattr(torch.nn, nonlinear_activation)(
                inplace=True, **nonlinear_activation_params),
        )

        # define residual blocks
        self.conv_layers = torch.nn.ModuleList()
        for layer in range(layers):
            dilation = 2 ** (layer % layers_per_stack)
            conv = ResidualBlock(
                kernel_size=kernel_size,
                residual_channels=residual_channels,
                gate_channels=gate_channels,
                skip_channels=skip_channels,
                aux_channels=-1,
                dilation=dilation,
                dropout=dropout,
                bias=bias,
                use_causal_conv=use_causal_conv,
            )
            self.conv_layers += [conv]

        # define output layers
        self.last_conv_layers = torch.nn.ModuleList([
            getattr(torch.nn, nonlinear_activation)(
                inplace=True, **nonlinear_activation_params),
            Conv1d1x1(skip_channels, skip_channels, bias=True),
            getattr(torch.nn, nonlinear_activation)(
                inplace=True, **nonlinear_activation_params),
            Conv1d1x1(skip_channels, out_channels, bias=True),
        ])

        # apply weight norm
        if use_weight_norm:
            self.apply_weight_norm()

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input waveform (B, 1, T).

        Returns:
            Tensor: Output tensor (B, 1, T)

        """
        x = self.first_conv(x)

        skips = 0
        for f in self.conv_layers:
            x, h = f(x, None)
            skips += h
        skips *= math.sqrt(1.0 / len(self.conv_layers))

        # apply final layers
        x = skips
        for f in self.last_conv_layers:
            x = f(x)
        return x

    def apply_weight_norm(self):
        """Apply weight normalization module to all of the layers."""
        def _apply_weight_norm(m):
            if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
                torch.nn.utils.weight_norm(m)
                logging.debug(f"Weight norm is applied to {m}.")

        self.apply(_apply_weight_norm)

    def remove_weight_norm(self):
        """Remove weight normalization module from all of the layers."""
        def _remove_weight_norm(m):
            try:
                logging.debug(f"Weight norm is removed from {m}.")
                torch.nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return

        self.apply(_remove_weight_norm)
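

# A minimal smoke-test sketch (not part of the original module). It assumes the package's
# layer/upsample modules behave as in the upstream parallel_wavegan implementation and that
# `hparams` is not needed for the unconditional discriminator path exercised here.
if __name__ == "__main__":
    # Generator: condition on a dummy 80-dim mel spectrogram; x=None lets the model draw noise.
    generator = ParallelWaveGANGenerator()
    mel = torch.randn(1, 80, 40)  # (B, aux_channels, T_frame)
    wav = generator(None, c=mel)
    print("generator output:", wav.shape)  # (B, 1, T_samples)

    # Discriminator: score the generated waveform without frame-level conditioning.
    discriminator = ParallelWaveGANDiscriminator()
    score = discriminator(wav)
    print("discriminator output:", score.shape)  # (B, 1, T_samples)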