Spaces:

KdaiP
/

StableTTS1.1

Runtime error

App Files Files Community

StableTTS1.1 / vocoders /ffgan /backbone.py

KdaiP

Upload 80 files

3dd84f8 verified 2 months ago

raw

history blame contribute delete

7.7 kB

	import torch
	import torch.nn.functional as F
	from torch import nn


	# DropPath copied from timm library
	def drop_path(
	x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
	):
	"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

	This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
	the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
	See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
	changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
	'survival rate' as the argument.

	""" # noqa: E501

	if drop_prob == 0.0 or not training:
	return x
	keep_prob = 1 - drop_prob
	shape = (x.shape[0],) + (1,) * (
	x.ndim - 1
	) # work with diff dim tensors, not just 2D ConvNets
	random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
	if keep_prob > 0.0 and scale_by_keep:
	random_tensor.div_(keep_prob)
	return x * random_tensor


	class DropPath(nn.Module):
	"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" # noqa: E501

	def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
	super(DropPath, self).__init__()
	self.drop_prob = drop_prob
	self.scale_by_keep = scale_by_keep

	def forward(self, x):
	return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

	def extra_repr(self):
	return f"drop_prob={round(self.drop_prob,3):0.3f}"


	class LayerNorm(nn.Module):
	r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
	The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
	shape (batch_size, height, width, channels) while channels_first corresponds to inputs
	with shape (batch_size, channels, height, width).
	""" # noqa: E501

	def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
	super().__init__()
	self.weight = nn.Parameter(torch.ones(normalized_shape))
	self.bias = nn.Parameter(torch.zeros(normalized_shape))
	self.eps = eps
	self.data_format = data_format
	if self.data_format not in ["channels_last", "channels_first"]:
	raise NotImplementedError
	self.normalized_shape = (normalized_shape,)

	def forward(self, x):
	if self.data_format == "channels_last":
	return F.layer_norm(
	x, self.normalized_shape, self.weight, self.bias, self.eps
	)
	elif self.data_format == "channels_first":
	u = x.mean(1, keepdim=True)
	s = (x - u).pow(2).mean(1, keepdim=True)
	x = (x - u) / torch.sqrt(s + self.eps)
	x = self.weight[:, None] * x + self.bias[:, None]
	return x


	# ConvNeXt Block copied from https://github.com/fishaudio/fish-diffusion/blob/main/fish_diffusion/modules/convnext.py
	class ConvNeXtBlock(nn.Module):
	r"""ConvNeXt Block. There are two equivalent implementations:
	(1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
	(2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
	We use (2) as we find it slightly faster in PyTorch

	Args:
	dim (int): Number of input channels.
	drop_path (float): Stochastic depth rate. Default: 0.0
	layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
	mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
	kernel_size (int): Kernel size for depthwise conv. Default: 7.
	dilation (int): Dilation for depthwise conv. Default: 1.
	""" # noqa: E501

	def __init__(
	self,
	dim: int,
	drop_path: float = 0.0,
	layer_scale_init_value: float = 1e-6,
	mlp_ratio: float = 4.0,
	kernel_size: int = 7,
	dilation: int = 1,
	):
	super().__init__()

	self.dwconv = nn.Conv1d(
	dim,
	dim,
	kernel_size=kernel_size,
	padding=int(dilation * (kernel_size - 1) / 2),
	groups=dim,
	) # depthwise conv
	self.norm = LayerNorm(dim, eps=1e-6)
	self.pwconv1 = nn.Linear(
	dim, int(mlp_ratio * dim)
	) # pointwise/1x1 convs, implemented with linear layers
	self.act = nn.GELU()
	self.pwconv2 = nn.Linear(int(mlp_ratio * dim), dim)
	self.gamma = (
	nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
	if layer_scale_init_value > 0
	else None
	)
	self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

	def forward(self, x, apply_residual: bool = True):
	input = x

	x = self.dwconv(x)
	x = x.permute(0, 2, 1) # (N, C, L) -> (N, L, C)
	x = self.norm(x)
	x = self.pwconv1(x)
	x = self.act(x)
	x = self.pwconv2(x)

	if self.gamma is not None:
	x = self.gamma * x

	x = x.permute(0, 2, 1) # (N, L, C) -> (N, C, L)
	x = self.drop_path(x)

	if apply_residual:
	x = input + x

	return x


	class ConvNeXtEncoder(nn.Module):
	def __init__(
	self,
	input_channels: int = 3,
	depths: list[int] = [3, 3, 9, 3],
	dims: list[int] = [96, 192, 384, 768],
	drop_path_rate: float = 0.0,
	layer_scale_init_value: float = 1e-6,
	kernel_size: int = 7,
	):
	super().__init__()
	assert len(depths) == len(dims)

	self.downsample_layers = nn.ModuleList()
	stem = nn.Sequential(
	nn.Conv1d(
	input_channels,
	dims[0],
	kernel_size=kernel_size,
	padding=kernel_size // 2,
	padding_mode="zeros",
	),
	LayerNorm(dims[0], eps=1e-6, data_format="channels_first"),
	)
	self.downsample_layers.append(stem)

	for i in range(len(depths) - 1):
	mid_layer = nn.Sequential(
	LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
	nn.Conv1d(dims[i], dims[i + 1], kernel_size=1),
	)
	self.downsample_layers.append(mid_layer)

	self.stages = nn.ModuleList()
	dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

	cur = 0
	for i in range(len(depths)):
	stage = nn.Sequential(
	*[
	ConvNeXtBlock(
	dim=dims[i],
	drop_path=dp_rates[cur + j],
	layer_scale_init_value=layer_scale_init_value,
	kernel_size=kernel_size,
	)
	for j in range(depths[i])
	]
	)
	self.stages.append(stage)
	cur += depths[i]

	self.norm = LayerNorm(dims[-1], eps=1e-6, data_format="channels_first")
	self.apply(self._init_weights)

	def _init_weights(self, m):
	if isinstance(m, (nn.Conv1d, nn.Linear)):
	nn.init.trunc_normal_(m.weight, std=0.02)
	nn.init.constant_(m.bias, 0)

	def forward(
	self,
	x: torch.Tensor,
	) -> torch.Tensor:
	for i in range(len(self.downsample_layers)):
	x = self.downsample_layers[i](x)
	x = self.stages[i](x)

	return self.norm(x)