Spaces:

Pendrokar
/

xVASynth-TTS

Running on CPU Upgrade

App Files Files Community

xVASynth-TTS / python /xvapitch /audio.py

Pendrokar

xVASynth v3 code for English

19c8b95 9 months ago

raw

history blame

33.8 kB

	from typing import Dict, Tuple

	import librosa
	import numpy as np
	import scipy.io.wavfile
	import scipy.signal
	import soundfile as sf
	import torch
	from torch import nn

	# from TTS.tts.utils.helpers import StandardScaler
	class StandardScaler:
	    """Mean/scale normalizer for NumPy arrays.

	    ``transform`` computes ``(X - mean_) / scale_`` and ``inverse_transform``
	    computes ``X * scale_ + mean_``. Both work on a private copy of the input,
	    so the caller's array is never modified, and non-float inputs (e.g. integer
	    arrays) are promoted to ``float64`` instead of raising a casting error.

	    Args:
	        mean (np.ndarray, optional): Value subtracted by ``transform``.
	        scale (np.ndarray, optional): Value ``transform`` divides by.
	    """

	    def __init__(self, mean: np.ndarray = None, scale: np.ndarray = None) -> None:
	        self.mean_ = mean
	        self.scale_ = scale

	    def set_stats(self, mean, scale):
	        """Replace the stored mean and scale."""
	        self.mean_ = mean
	        self.scale_ = scale

	    def reset_stats(self):
	        """Remove the stored statistics (the attributes are deleted, not set to None)."""
	        delattr(self, "mean_")
	        delattr(self, "scale_")

	    @staticmethod
	    def _as_float_copy(X):
	        """Return a writable copy of ``X``; keep float/complex dtypes, promote others to float64."""
	        X = np.asarray(X)
	        dtype = X.dtype if np.issubdtype(X.dtype, np.inexact) else np.float64
	        return np.array(X, dtype=dtype, copy=True)

	    def transform(self, X):
	        """Return ``(X - mean_) / scale_`` as a new array."""
	        X = self._as_float_copy(X)
	        X -= self.mean_
	        X /= self.scale_
	        return X

	    def inverse_transform(self, X):
	        """Return ``X * scale_ + mean_`` as a new array."""
	        X = self._as_float_copy(X)
	        X *= self.scale_
	        X += self.mean_
	        return X


	class TorchSTFT(nn.Module):  # pylint: disable=abstract-method
	    """Torch-based magnitude (optionally mel / log) spectrogram extraction for batches.

	    TODO: Merge this with audio.py

	    Args:
	        n_fft (int):
	            FFT window size for STFT.

	        hop_length (int):
	            Number of samples between STFT columns.

	        win_length (int):
	            STFT window length.

	        pad_wav (bool, optional):
	            If True, reflect-pad the audio with ``(n_fft - hop_length) / 2`` samples on each side
	            before the STFT. Defaults to False.

	        window (str, optional):
	            Name of the ``torch`` function that creates the window tensor. Defaults to "hann_window".

	        sample_rate (int, optional):
	            Audio sampling rate, used only to build the mel filter bank. Defaults to None.

	        mel_fmin (int, optional):
	            Minimum filter frequency for the mel filter bank. Defaults to 0.

	        mel_fmax (int, optional):
	            Maximum filter frequency for the mel filter bank. Defaults to None.

	        n_mels (int, optional):
	            Number of mel bands. Defaults to 80.

	        use_mel (bool, optional):
	            If True, project the spectrogram onto the mel filter bank. Defaults to False.

	        do_amp_to_db (bool, optional):
	            If True, apply ``log(clamp(S, 1e-5) * spec_gain)`` to the output. Defaults to False.

	        spec_gain (float, optional):
	            Gain multiplied in before the log. Defaults to 1.0.

	        power (float, optional):
	            Exponent applied to the magnitude spectrogram; skipped when None. Defaults to None.

	        use_htk (bool, optional):
	            Use the HTK formula in the mel filter bank instead of Slaney. Defaults to False.

	        mel_norm (None, 'slaney', or number, optional):
	            Passed to ``librosa.filters.mel`` as ``norm``. Defaults to "slaney".
	    """

	    def __init__(
	        self,
	        n_fft,
	        hop_length,
	        win_length,
	        pad_wav=False,
	        window="hann_window",
	        sample_rate=None,
	        mel_fmin=0,
	        mel_fmax=None,
	        n_mels=80,
	        use_mel=False,
	        do_amp_to_db=False,
	        spec_gain=1.0,
	        power=None,
	        use_htk=False,
	        mel_norm="slaney",
	    ):
	        super().__init__()
	        self.n_fft = n_fft
	        self.hop_length = hop_length
	        self.win_length = win_length
	        self.pad_wav = pad_wav
	        self.sample_rate = sample_rate
	        self.mel_fmin = mel_fmin
	        self.mel_fmax = mel_fmax
	        self.n_mels = n_mels
	        self.use_mel = use_mel
	        self.do_amp_to_db = do_amp_to_db
	        self.spec_gain = spec_gain
	        self.power = power
	        self.use_htk = use_htk
	        self.mel_norm = mel_norm
	        # Stored as a frozen parameter so it follows the module across devices.
	        self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False)
	        self.mel_basis = None
	        if use_mel:
	            self._build_mel_basis()

	    def __call__(self, x):
	        """Compute spectrogram frames with ``torch.stft``.

	        Args:
	            x (Tensor): input waveform

	        Returns:
	            Tensor: spectrogram frames, moved back to the device ``x`` arrived on.

	        Shapes:
	            x: :math:`[B, T]` or :math:`[B, 1, T]`
	        """
	        if x.ndim == 2:
	            x = x.unsqueeze(1)
	        if self.pad_wav:
	            padding = int((self.n_fft - self.hop_length) / 2)
	            x = torch.nn.functional.pad(x, (padding, padding), mode="reflect")
	        x_device = x.device
	        # return_complex=False is deprecated in torch.stft; request the complex
	        # result and view it as (..., 2) real/imag pairs, which is equivalent.
	        o = torch.stft(
	            x.squeeze(1).to(self.window.device),
	            self.n_fft,
	            self.hop_length,
	            self.win_length,
	            self.window,
	            center=True,
	            pad_mode="reflect",  # compatible with audio.py
	            normalized=False,
	            onesided=True,
	            return_complex=True,
	        )
	        o = torch.view_as_real(o)
	        M = o[..., 0]
	        P = o[..., 1]
	        # Clamp before the square root so the result stays finite and positive.
	        S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8))

	        if self.power is not None:
	            S = S ** self.power

	        if self.use_mel:
	            S = torch.matmul(self.mel_basis.to(self.window.device), S)
	        if self.do_amp_to_db:
	            S = self._amp_to_db(S, spec_gain=self.spec_gain)
	        return S.to(x_device)

	    def _build_mel_basis(self):
	        """Create the mel filter bank and store it as a float tensor in ``self.mel_basis``."""
	        # Keyword arguments: librosa >= 0.10 no longer accepts sr/n_fft positionally.
	        mel_basis = librosa.filters.mel(
	            sr=self.sample_rate,
	            n_fft=self.n_fft,
	            n_mels=self.n_mels,
	            fmin=self.mel_fmin,
	            fmax=self.mel_fmax,
	            htk=self.use_htk,
	            norm=self.mel_norm,
	        )
	        self.mel_basis = torch.from_numpy(mel_basis).float()

	    @staticmethod
	    def _amp_to_db(x, spec_gain=1.0):
	        """Natural log of ``x`` (floored at 1e-5) times ``spec_gain``."""
	        return torch.log(torch.clamp(x, min=1e-5) * spec_gain)

	    @staticmethod
	    def _db_to_amp(x, spec_gain=1.0):
	        """Inverse of ``_amp_to_db`` for values above the floor."""
	        return torch.exp(x) / spec_gain


	# pylint: disable=too-many-public-methods
	class AudioProcessor(object):
	"""Audio Processor for TTS used by all the data pipelines.

	TODO: Make this a dataclass to replace `BaseAudioConfig`.

	Note:
	All the class arguments are set to default values to enable a flexible initialization
	of the class with the model config. They are not meaningful for all the arguments.

	Args:
	sample_rate (int, optional):
	target audio sampling rate. Defaults to 22050.

	resample (bool, optional):
	enable/disable resampling of the audio clips when the target sampling rate does not match the original sampling rate. Defaults to False.

	num_mels (int, optional):
	number of melspectrogram dimensions. Defaults to None.

	log_func (str, optional):
	name of the log function used for converting spectrogram amplitude to DB, either "np.log" or "np.log10". Defaults to "np.log10".

	min_level_db (int, optional):
	minimum db threshold for the computed melspectrograms. Defaults to None.

	frame_shift_ms (int, optional):
	milliseconds of frames between STFT columns. Defaults to None.

	frame_length_ms (int, optional):
	milliseconds of STFT window length. Defaults to None.

	hop_length (int, optional):
	number of frames between STFT columns. Used if ```frame_shift_ms``` is None. Defaults to None.

	win_length (int, optional):
	STFT window length. Used if ```frame_length_ms``` is None. Defaults to None.

	ref_level_db (int, optional):
	reference DB level to avoid background noise. In general <20DB corresponds to the air noise. Defaults to None.

	fft_size (int, optional):
	FFT window size for STFT. Defaults to 1024.

	power (int, optional):
	Exponent value applied to the spectrogram before GriffinLim. Defaults to None.

	preemphasis (float, optional):
	Preemphasis coefficient. Preemphasis is disabled if == 0.0. Defaults to 0.0.

	signal_norm (bool, optional):
	enable/disable signal normalization. Defaults to None.

	symmetric_norm (bool, optional):
	enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else [0, k], Defaults to None.

	max_norm (float, optional):
	```k``` defining the normalization range. Defaults to None.

	mel_fmin (int, optional):
	minimum filter frequency for computing melspectrograms. Defaults to None.

	mel_fmax (int, optional):
	maximum filter frequency for computing melspectrograms. Defaults to None.

	spec_gain (int, optional):
	gain applied when converting amplitude to DB. Defaults to 20.

	stft_pad_mode (str, optional):
	Padding mode for STFT. Defaults to 'reflect'.

	clip_norm (bool, optional):
	enable/disable clipping the out of range values in the normalized audio signal. Defaults to True.

	griffin_lim_iters (int, optional):
	Number of GriffinLim iterations. Defaults to None.

	do_trim_silence (bool, optional):
	enable/disable silence trimming when loading the audio signal. Defaults to False.

	trim_db (int, optional):
	DB threshold used for silence trimming. Defaults to 60.

	do_sound_norm (bool, optional):
	enable/disable signal normalization. Defaults to False.

	do_amp_to_db_linear (bool, optional):
	enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.

	do_amp_to_db_mel (bool, optional):
	enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

	do_rms_norm (bool, optional):
	enable/disable RMS volume normalization when loading an audio file. Defaults to False.

	db_level (int, optional):
	dB level used for rms normalization. The range is -99 to 0. Defaults to None.

	stats_path (str, optional):
	Path to the computed stats file. Defaults to None.

	verbose (bool, optional):
	enable/disable logging. Defaults to True.

	"""

	def __init__(
	    self,
	    # sample_rate=None,
	    sample_rate=22050,
	    resample=False,
	    num_mels=None,
	    log_func="np.log10",
	    min_level_db=None,
	    frame_shift_ms=None,
	    frame_length_ms=None,
	    # hop_length=None,
	    hop_length=256,
	    # win_length=None,
	    win_length=1024,
	    ref_level_db=None,
	    fft_size=1024,
	    power=None,
	    preemphasis=0.0,
	    signal_norm=None,
	    symmetric_norm=None,
	    max_norm=None,
	    mel_fmin=None,
	    mel_fmax=None,
	    spec_gain=20,
	    stft_pad_mode="reflect",
	    clip_norm=True,
	    griffin_lim_iters=None,
	    do_trim_silence=False,
	    trim_db=60,
	    do_sound_norm=False,
	    do_amp_to_db_linear=True,
	    do_amp_to_db_mel=True,
	    do_rms_norm=False,
	    db_level=None,
	    stats_path=None,
	    verbose=True,
	    **_,
	):
	    """Store the audio configuration and precompute the mel filter bank.

	    Unknown keyword arguments are accepted and ignored. ``verbose`` is accepted
	    but not used: the parameter dump it used to control is disabled.

	    Raises:
	        ValueError: ``log_func`` is neither "np.log" nor "np.log10".
	        AssertionError: ``min_level_db`` is 0, or ``win_length`` exceeds ``fft_size``.
	    """
	    # set up class attributes
	    self.sample_rate = sample_rate
	    self.resample = resample
	    self.num_mels = num_mels
	    self.log_func = log_func
	    self.min_level_db = min_level_db or 0
	    self.frame_shift_ms = frame_shift_ms
	    self.frame_length_ms = frame_length_ms
	    self.ref_level_db = ref_level_db
	    self.fft_size = fft_size
	    self.power = power
	    self.preemphasis = preemphasis
	    self.griffin_lim_iters = griffin_lim_iters
	    self.signal_norm = signal_norm
	    self.symmetric_norm = symmetric_norm
	    self.mel_fmin = mel_fmin or 0
	    self.mel_fmax = mel_fmax
	    self.spec_gain = float(spec_gain)
	    self.stft_pad_mode = stft_pad_mode
	    self.max_norm = 1.0 if max_norm is None else float(max_norm)
	    self.clip_norm = clip_norm
	    self.do_trim_silence = do_trim_silence
	    self.trim_db = trim_db
	    self.do_sound_norm = do_sound_norm
	    self.do_amp_to_db_linear = do_amp_to_db_linear
	    self.do_amp_to_db_mel = do_amp_to_db_mel
	    self.do_rms_norm = do_rms_norm
	    self.db_level = db_level
	    self.stats_path = stats_path
	    # log base used by the dB <-> amplitude conversions
	    if log_func == "np.log":
	        self.base = np.e
	    elif log_func == "np.log10":
	        self.base = 10
	    else:
	        raise ValueError(" [!] unknown `log_func` value.")
	    # set up STFT parameters
	    if hop_length is None:
	        # derive them from frame_shift_ms / frame_length_ms
	        self.hop_length, self.win_length = self._stft_parameters()
	    else:
	        # use the sample-based values as given
	        self.hop_length = hop_length
	        self.win_length = win_length
	    assert min_level_db != 0.0, " [!] min_level_db is 0"
	    assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
	    # Build the filter bank once and derive its pseudo-inverse from the stored
	    # copy instead of constructing an identical bank a second time.
	    self.mel_basis = self._build_mel_basis()
	    self.inv_mel_basis = np.linalg.pinv(self.mel_basis)
	    # set up mean/std scalers when stats are provided
	    if stats_path and signal_norm:
	        mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path)
	        self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)
	        self.signal_norm = True
	        # range-normalization settings do not apply with mean/std scaling
	        self.max_norm = None
	        self.clip_norm = None
	        self.symmetric_norm = None

	### setting up the parameters ###
	def _build_mel_basis(
	    self,
	) -> np.ndarray:
	    """Build the mel filter bank from the processor's settings.

	    Returns:
	        np.ndarray: mel filter bank as returned by ``librosa.filters.mel``.

	    Raises:
	        AssertionError: ``mel_fmax`` is set and exceeds ``sample_rate // 2``.
	    """
	    if self.mel_fmax is not None:
	        assert self.mel_fmax <= self.sample_rate // 2
	    # sr / n_fft are passed by keyword: librosa >= 0.10 rejects positional use,
	    # and the keyword form is accepted by older releases as well.
	    return librosa.filters.mel(
	        sr=self.sample_rate,
	        n_fft=self.fft_size,
	        n_mels=self.num_mels,
	        fmin=self.mel_fmin,
	        fmax=self.mel_fmax,
	    )

	def _stft_parameters(
	self,
	) -> Tuple[int, int]:
	"""Compute the real STFT parameters from the time values.

	Returns:
	Tuple[int, int]: hop length and window length for STFT.
	"""
	factor = self.frame_length_ms / self.frame_shift_ms
	assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
	hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
	win_length = int(hop_length * factor)
	return hop_length, win_length

	### normalization ###
	def normalize(self, S: np.ndarray) -> np.ndarray:
	"""Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]`

	Args:
	S (np.ndarray): Spectrogram to normalize.

	Raises:
	RuntimeError: Mean and variance is computed from incompatible parameters.

	Returns:
	np.ndarray: Normalized spectrogram.
	"""
	# pylint: disable=no-else-return
	return S

	# S = S.copy()
	# if self.signal_norm:
	# # mean-var scaling
	# if hasattr(self, "mel_scaler"):
	# if S.shape[0] == self.num_mels:
	# return self.mel_scaler.transform(S.T).T
	# elif S.shape[0] == self.fft_size / 2:
	# return self.linear_scaler.transform(S.T).T
	# else:
	# raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.")
	# # range normalization
	# S -= self.ref_level_db # discard certain range of DB assuming it is air noise
	# S_norm = (S - self.min_level_db) / (-self.min_level_db)
	# if self.symmetric_norm:
	# S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
	# if self.clip_norm:
	# S_norm = np.clip(
	# S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type
	# )
	# return S_norm
	# else:
	# S_norm = self.max_norm * S_norm
	# if self.clip_norm:
	# S_norm = np.clip(S_norm, 0, self.max_norm)
	# return S_norm
	# else:
	# return S

	def denormalize(self, S: np.ndarray) -> np.ndarray:
	"""Denormalize spectrogram values.

	Args:
	S (np.ndarray): Spectrogram to denormalize.

	Raises:
	RuntimeError: Mean and variance are incompatible.

	Returns:
	np.ndarray: Denormalized spectrogram.
	"""
	# pylint: disable=no-else-return
	S_denorm = S.copy()
	if self.signal_norm:
	# mean-var scaling
	if hasattr(self, "mel_scaler"):
	if S_denorm.shape[0] == self.num_mels:
	return self.mel_scaler.inverse_transform(S_denorm.T).T
	elif S_denorm.shape[0] == self.fft_size / 2:
	return self.linear_scaler.inverse_transform(S_denorm.T).T
	else:
	raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.")
	if self.symmetric_norm:
	if self.clip_norm:
	S_denorm = np.clip(
	S_denorm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type
	)
	S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
	return S_denorm + self.ref_level_db
	else:
	if self.clip_norm:
	S_denorm = np.clip(S_denorm, 0, self.max_norm)
	S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db
	return S_denorm + self.ref_level_db
	else:
	return S_denorm

	### Mean-STD scaling ###
	def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]:
	"""Loading mean and variance statistics from a `npy` file.

	Args:
	stats_path (str): Path to the `npy` file containing

	Returns:
	Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to
	compute them.
	"""
	stats = np.load(stats_path, allow_pickle=True).item() # pylint: disable=unexpected-keyword-arg
	mel_mean = stats["mel_mean"]
	mel_std = stats["mel_std"]
	linear_mean = stats["linear_mean"]
	linear_std = stats["linear_std"]
	stats_config = stats["audio_config"]
	# check all audio parameters used for computing stats
	skip_parameters = ["griffin_lim_iters", "stats_path", "do_trim_silence", "ref_level_db", "power"]
	for key in stats_config.keys():
	if key in skip_parameters:
	continue
	if key not in ["sample_rate", "trim_db"]:
	assert (
	stats_config[key] == self.__dict__[key]
	), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}"
	return mel_mean, mel_std, linear_mean, linear_std, stats_config

	# pylint: disable=attribute-defined-outside-init
	def setup_scaler(
	    self, mel_mean: np.ndarray, mel_std: np.ndarray, linear_mean: np.ndarray, linear_std: np.ndarray
	) -> None:
	    """Attach the mean/std scalers for mel and linear spectrograms.

	    Args:
	        mel_mean (np.ndarray): Mean for melspectrograms.
	        mel_std (np.ndarray): STD for melspectrograms.
	        linear_mean (np.ndarray): Mean for full scale spectrograms.
	        linear_std (np.ndarray): STD for full scale spectrograms.
	    """
	    # The constructor stores mean and scale exactly like set_stats does.
	    self.mel_scaler = StandardScaler(mel_mean, mel_std)
	    self.linear_scaler = StandardScaler(linear_mean, linear_std)

	### DB and AMP conversion ###
	# pylint: disable=no-self-use
	def _amp_to_db(self, x: np.ndarray) -> np.ndarray:
	    """Map amplitudes to decibels: ``spec_gain * log_base(max(x, 1e-5))``.

	    Args:
	        x (np.ndarray): Amplitude spectrogram.

	    Returns:
	        np.ndarray: Decibels spectrogram.
	    """
	    # floor the amplitudes so the logarithm never sees zero
	    floored = np.maximum(1e-5, x)
	    return self.spec_gain * _log(floored, self.base)

	# pylint: disable=no-self-use
	def _db_to_amp(self, x: np.ndarray) -> np.ndarray:
	    """Map decibels back to amplitudes: ``base ** (x / spec_gain)``.

	    Args:
	        x (np.ndarray): Decibels spectrogram.

	    Returns:
	        np.ndarray: Amplitude spectrogram.
	    """
	    exponent = x / self.spec_gain
	    return _exp(exponent, self.base)

	### Preemphasis ###
	def apply_preemphasis(self, x: np.ndarray) -> np.ndarray:
	"""Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values.

	Args:
	x (np.ndarray): Audio signal.

	Raises:
	RuntimeError: Preemphasis coeff is set to 0.

	Returns:
	np.ndarray: Decorrelated audio signal.
	"""
	if self.preemphasis == 0:
	raise RuntimeError(" [!] Preemphasis is set 0.0.")
	return scipy.signal.lfilter([1, -self.preemphasis], [1], x)

	def apply_inv_preemphasis(self, x: np.ndarray) -> np.ndarray:
	"""Reverse pre-emphasis."""
	if self.preemphasis == 0:
	raise RuntimeError(" [!] Preemphasis is set 0.0.")
	return scipy.signal.lfilter([1], [1, -self.preemphasis], x)

	### SPECTROGRAMs ###
	def _linear_to_mel(self, spectrogram: np.ndarray) -> np.ndarray:
	"""Project a full scale spectrogram to a melspectrogram.

	Args:
	spectrogram (np.ndarray): Full scale spectrogram.

	Returns:
	np.ndarray: Melspectrogram
	"""
	return np.dot(self.mel_basis, spectrogram)

	def _mel_to_linear(self, mel_spec: np.ndarray) -> np.ndarray:
	"""Convert a melspectrogram to full scale spectrogram."""
	return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec))

	def spectrogram(self, y: np.ndarray) -> np.ndarray:
	    """Compute the magnitude spectrogram of a waveform.

	    Pre-emphasis and amplitude-to-dB conversion are not applied here,
	    whatever ``preemphasis`` and ``do_amp_to_db_linear`` are set to.

	    Args:
	        y (np.ndarray): Waveform.

	    Returns:
	        np.ndarray: float32 magnitude spectrogram.
	    """
	    magnitude = np.abs(self._stft(y))
	    normalized = self.normalize(magnitude)
	    return normalized.astype(np.float32)

	def melspectrogram(self, y: np.ndarray) -> np.ndarray:
	    """Compute the dB-scaled melspectrogram of a waveform.

	    Pre-emphasis is not applied, and the amplitude-to-dB conversion always
	    runs, whatever ``preemphasis`` and ``do_amp_to_db_mel`` are set to.

	    Args:
	        y (np.ndarray): Waveform.

	    Returns:
	        np.ndarray: float32 melspectrogram.
	    """
	    magnitude = np.abs(self._stft(y))
	    mel_db = self._amp_to_db(self._linear_to_mel(magnitude))
	    normalized = self.normalize(mel_db)
	    return normalized.astype(np.float32)

	def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray:
	    """Turn a (normalized, dB) spectrogram into a waveform with Griffin-Lim.

	    Inverse pre-emphasis is applied to the result when ``preemphasis`` is non-zero.
	    """
	    amplitude = self._db_to_amp(self.denormalize(spectrogram))
	    # Griffin-Lim reconstructs the phase
	    waveform = self._griffin_lim(amplitude ** self.power)
	    if self.preemphasis != 0:
	        waveform = self.apply_inv_preemphasis(waveform)
	    return waveform

	def inv_melspectrogram(self, mel_spectrogram: np.ndarray) -> np.ndarray:
	    """Turn a (normalized, dB) melspectrogram into a waveform with Griffin-Lim.

	    Inverse pre-emphasis is applied to the result when ``preemphasis`` is non-zero.
	    """
	    mel_amplitude = self._db_to_amp(self.denormalize(mel_spectrogram))
	    # back to the linear frequency scale before phase reconstruction
	    linear_amplitude = self._mel_to_linear(mel_amplitude)
	    waveform = self._griffin_lim(linear_amplitude ** self.power)
	    if self.preemphasis != 0:
	        waveform = self.apply_inv_preemphasis(waveform)
	    return waveform

	def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray:
	    """Convert a network's full scale linear spectrogram output to a melspectrogram.

	    Steps: denormalize, dB to amplitude, mel projection, amplitude to dB, normalize.

	    Args:
	        linear_spec (np.ndarray): Normalized full scale linear spectrogram.

	    Returns:
	        np.ndarray: Normalized melspectrogram.
	    """
	    amplitude = self._db_to_amp(self.denormalize(linear_spec))
	    mel_amplitude = self._linear_to_mel(np.abs(amplitude))
	    return self.normalize(self._amp_to_db(mel_amplitude))

	### STFT and ISTFT ###
	def _stft(self, y: np.ndarray) -> np.ndarray:
	    """Run ``librosa.stft`` with this processor's settings (Hann window, centered frames).

	    Args:
	        y (np.ndarray): Audio signal.

	    Returns:
	        np.ndarray: Complex number array.
	    """
	    stft_kwargs = dict(
	        n_fft=self.fft_size,
	        hop_length=self.hop_length,
	        win_length=self.win_length,
	        pad_mode=self.stft_pad_mode,
	        window="hann",
	        center=True,
	    )
	    return librosa.stft(y=y, **stft_kwargs)

	def _istft(self, y: np.ndarray) -> np.ndarray:
	    """Run ``librosa.istft`` with this processor's hop and window lengths."""
	    hop, win = self.hop_length, self.win_length
	    return librosa.istft(y, hop_length=hop, win_length=win)

	def _griffin_lim(self, S):
	    """Estimate a waveform from a magnitude spectrogram with the Griffin-Lim iteration.

	    Starts from random phases, then repeats ``griffin_lim_iters`` times:
	    take the phase of the STFT of the current waveform, combine it with the
	    given magnitudes, and invert.

	    Args:
	        S (np.ndarray): Magnitude spectrogram.

	    Returns:
	        np.ndarray: Estimated waveform, or ``np.array([0.0])`` when the first
	        inversion contains non-finite values.
	    """
	    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
	    # `np.complex` was removed in NumPy 1.24; the builtin `complex` is the same dtype.
	    S_complex = np.abs(S).astype(complex)
	    y = self._istft(S_complex * angles)
	    if not np.isfinite(y).all():
	        print(" [!] Waveform is not finite everywhere. Skipping the GL.")
	        return np.array([0.0])
	    for _ in range(self.griffin_lim_iters):
	        angles = np.exp(1j * np.angle(self._stft(y)))
	        y = self._istft(S_complex * angles)
	    return y

	def compute_stft_paddings(self, x, pad_sides=1):
	"""Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding
	(first and final frames)"""
	assert pad_sides in (1, 2)
	pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0]
	if pad_sides == 1:
	return 0, pad
	return pad // 2, pad // 2 + pad % 2

	def compute_f0(self, x: np.ndarray) -> np.ndarray:
	import pyworld as pw
	"""Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.

	Args:
	x (np.ndarray): Waveform.

	Returns:
	np.ndarray: Pitch.

	Examples:
	>>> WAV_FILE = filename = librosa.util.example_audio_file()
	>>> from TTS.config import BaseAudioConfig
	>>> from TTS.utils.audio import AudioProcessor
	>>> conf = BaseAudioConfig(mel_fmax=8000)
	>>> ap = AudioProcessor(**conf)
	>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
	>>> pitch = ap.compute_f0(wav)
	"""
	# align F0 length to the spectrogram length
	# if len(x) % self.hop_length == 0:
	# x = np.pad(x, (0, self.hop_length // 2), mode="reflect")

	# f0, t = pw.dio(
	# x.astype(np.double),
	# fs=self.sample_rate,
	# f0_ceil=self.mel_fmax,
	# frame_period=1000 * self.hop_length / self.sample_rate,
	# )
	# f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
	# return f0
	pass

	### Audio Processing ###
	def find_endpoint(self, wav: np.ndarray, min_silence_sec=0.8) -> int:
	    """Locate the start of the first long-enough silent stretch in an audio signal.

	    A window of ``min_silence_sec`` seconds slides over the signal in quarter-window
	    steps. The first window whose maximum sample is below the amplitude that
	    corresponds to ``-trim_db`` marks the endpoint.

	    Args:
	        wav (np.ndarray): Audio signal.
	        min_silence_sec (float, optional): Ignore silences shorter than this many seconds. Defaults to 0.8.

	    Returns:
	        int: Window start plus one step for the first silent window, or ``len(wav)`` if none is found.
	    """
	    span = int(self.sample_rate * min_silence_sec)
	    step = int(span / 4)
	    floor = self._db_to_amp(-self.trim_db)
	    candidates = range(step, len(wav) - span, step)
	    first_silent = next((start for start in candidates if np.max(wav[start : start + span]) < floor), None)
	    if first_silent is None:
	        return len(wav)
	    return first_silent + step

	def trim_silence(self, wav):
	    """Drop a 0.01 s margin from both ends, then trim silence with ``librosa.effects.trim``."""
	    margin = int(self.sample_rate * 0.01)
	    core = wav[margin:-margin]
	    trimmed, _ = librosa.effects.trim(
	        core, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length
	    )
	    return trimmed

	@staticmethod
	def sound_norm(x: np.ndarray) -> np.ndarray:
	"""Normalize the volume of an audio signal.

	Args:
	x (np.ndarray): Raw waveform.

	Returns:
	np.ndarray: Volume normalized waveform.
	"""
	return x / abs(x).max() * 0.95

	@staticmethod
	def _rms_norm(wav, db_level=-27):
	r = 10 ** (db_level / 20)
	a = np.sqrt((len(wav) * (r 2)) / np.sum(wav 2))
	return wav * a

	def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray:
	"""Normalize the volume based on RMS of the signal.

	Args:
	x (np.ndarray): Raw waveform.

	Returns:
	np.ndarray: RMS normalized waveform.
	"""
	if db_level is None:
	db_level = self.db_level
	assert -99 <= db_level <= 0, " [!] db_level should be between -99 and 0"
	wav = self._rms_norm(x, db_level)
	return wav

	### save and load ###
	def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
	    """Load an audio file and apply the configured post-processing steps.

	    Loading strategy:
	      * ``self.resample`` set: librosa loads and resamples to ``self.sample_rate`` (slow).
	      * otherwise, no ``sr`` given: soundfile reads the file, and its rate must equal
	        ``self.sample_rate``.
	      * otherwise: librosa loads at the requested ``sr``.

	    Silence trimming, peak normalization and RMS normalization follow, each only
	    when its flag is enabled.

	    Args:
	        filename (str): Path to the wav file.
	        sr (int, optional): Sampling rate for resampling. Defaults to None.

	    Returns:
	        np.ndarray: Loaded waveform, or ``None`` when silence trimming raised ``ValueError``.
	    """
	    if self.resample:
	        # loading with resampling. It is significantly slower.
	        x, sr = librosa.load(filename, sr=self.sample_rate)
	    elif sr is not None:
	        x, sr = librosa.load(filename, sr=sr)
	    else:
	        # SF is faster than librosa for loading files
	        x, sr = sf.read(filename)
	        assert self.sample_rate == sr, "%s vs %s (%s)" % (self.sample_rate, sr, filename)

	    if self.do_trim_silence:
	        try:
	            x = self.trim_silence(x)
	        except ValueError as e:
	            print(f" [!] File cannot be trimmed for silence - {filename}:", e)
	            return None
	    if self.do_sound_norm:
	        x = self.sound_norm(x)
	    if self.do_rms_norm:
	        x = self.rms_volume_norm(x, self.db_level)
	    return x

	def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None:
	"""Save a waveform to a file using Scipy.

	Args:
	wav (np.ndarray): Waveform to save.
	path (str): Path to a output file.
	sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
	"""
	wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
	scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))

	def get_duration(self, filename: str) -> float:
	    """Get the duration of a wav file using Librosa.

	    The file is passed by keyword: the first positional parameter of
	    ``librosa.get_duration`` is the audio array ``y``, not a path.

	    Args:
	        filename (str): Path to the wav file.

	    Returns:
	        float: Value returned by ``librosa.get_duration``.
	    """
	    try:
	        # librosa >= 0.10 names the argument `path`
	        return librosa.get_duration(path=filename)
	    except TypeError:
	        # older librosa releases only know `filename`
	        return librosa.get_duration(filename=filename)

	@staticmethod
	def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray:
	mu = 2 ** qc - 1
	# wav_abs = np.minimum(np.abs(wav), 1.0)
	signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu)
	# Quantize signal to the specified number of levels.
	signal = (signal + 1) / 2 * mu + 0.5
	return np.floor(
	signal,
	)

	@staticmethod
	def mulaw_decode(wav, qc):
	"""Recovers waveform from quantized values."""
	mu = 2 ** qc - 1
	x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
	return x

	@staticmethod
	def encode_16bits(x):
	return np.clip(x * 2 15, -(2 15), 2 ** 15 - 1).astype(np.int16)

	@staticmethod
	def quantize(x: np.ndarray, bits: int) -> np.ndarray:
	"""Quantize a waveform to a given number of bits.

	Args:
	x (np.ndarray): Waveform to quantize. Must be normalized into the range `[-1, 1]`.
	bits (int): Number of quantization bits.

	Returns:
	np.ndarray: Quantized waveform.
	"""
	return (x + 1.0) * (2 ** bits - 1) / 2

	@staticmethod
	def dequantize(x, bits):
	"""Dequantize a waveform from the given number of bits."""
	return 2 * x / (2 ** bits - 1) - 1


	def _log(x, base):
	    """Return ``log10(x)`` when ``base`` is 10, otherwise the natural logarithm."""
	    return np.log10(x) if base == 10 else np.log(x)


	def _exp(x, base):
	    """Return ``10 ** x`` when ``base`` is 10, otherwise ``e ** x``."""
	    return np.power(10, x) if base == 10 else np.exp(x)