Spaces:
Running
Running
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch | |
import numpy as np | |
from tqdm import tqdm | |
from utils.util import pad_mels_to_tensors, pad_f0_to_tensors | |
def vocoder_inference(cfg, model, mels, f0s=None, device=None, fast_inference=False):
    """Generate waveforms from mel-spectrograms by iterative denoising.

    Starting from Gaussian noise, the audio is refined once per entry of the
    inference noise schedule (last entry first), calling ``model`` at every
    step, and is finally clamped to ``[-1, 1]``.

    Args:
        cfg: Config object. Reads ``cfg.model.diffwave.noise_schedule``,
            ``cfg.model.diffwave.inference_noise_schedule`` (only when
            ``fast_inference`` is true) and ``cfg.preprocess.hop_size``.
        model: Network called as ``model(audio, step, mels)``; dimension 1 of
            its output is squeezed away. It is switched to eval mode.
        mels: A tensor of mel-specs with the shape
            (batch_size, num_mels, frames).
        f0s: Not used by this function.
        device: Device to run on. When ``None``, the device ``mels`` already
            lives on is used so that the noise and the conditioning never end
            up on different devices.
        fast_inference: If true, sample with the (separately configured)
            inference noise schedule instead of the training one.

    Returns:
        audios: A CPU tensor of audios with the shape (batch_size, seq_len),
            where ``seq_len = cfg.preprocess.hop_size * frames``.
    """
    model.eval()

    with torch.no_grad():
        training_noise_schedule = np.array(cfg.model.diffwave.noise_schedule)
        inference_noise_schedule = (
            np.array(cfg.model.diffwave.inference_noise_schedule)
            if fast_inference
            else np.array(cfg.model.diffwave.noise_schedule)
        )

        talpha = 1 - training_noise_schedule
        talpha_cum = np.cumprod(talpha)

        beta = inference_noise_schedule
        alpha = 1 - beta
        alpha_cum = np.cumprod(alpha)

        # Map every inference step onto a (fractional) training step: locate
        # the pair of neighbouring training steps whose cumulative alphas
        # bracket the inference value and interpolate between them.
        # NOTE: an inference value outside the training range contributes no
        # entry, which leaves T shorter than the schedule and makes the
        # T[n] lookup in the sampling loop fail.
        T = []
        for inference_level in alpha_cum:
            for t in range(len(training_noise_schedule) - 1):
                if talpha_cum[t + 1] <= inference_level <= talpha_cum[t]:
                    twiddle = (talpha_cum[t] ** 0.5 - inference_level ** 0.5) / (
                        talpha_cum[t] ** 0.5 - talpha_cum[t + 1] ** 0.5
                    )
                    T.append(t + twiddle)
                    break
        T = np.array(T, dtype=np.float32)

        # Without an explicit device the noise below would be allocated on
        # torch's default device, which may differ from where `mels` lives.
        if device is None:
            device = mels.device

        mels = mels.to(device)
        audio = torch.randn(
            mels.shape[0],
            cfg.preprocess.hop_size * mels.shape[-1],
            device=device,
        )

        # Walk the inference schedule backwards, from the last step to step 0.
        for n in tqdm(range(len(alpha) - 1, -1, -1)):
            c1 = 1 / alpha[n] ** 0.5
            c2 = beta[n] / (1 - alpha_cum[n]) ** 0.5
            step = torch.tensor([T[n]], device=audio.device)
            audio = c1 * (audio - c2 * model(audio, step, mels).squeeze(1))

            # Fresh noise is re-injected after every step except the final one.
            if n > 0:
                noise = torch.randn_like(audio)
                sigma = (
                    (1.0 - alpha_cum[n - 1]) / (1.0 - alpha_cum[n]) * beta[n]
                ) ** 0.5
                audio += sigma * noise

        audio = torch.clamp(audio, -1.0, 1.0)

    return audio.detach().cpu()
def _vocode_padded_batch(
    cfg, model, mel_batch, mel_frame, f0_batch, device, fast_inference
):
    """Vocode each item of one padded batch and trim it to its real length."""
    audios = []
    for i in range(mel_batch.shape[0]):
        f0 = None if f0_batch is None else f0_batch[i].unsqueeze(0)
        # Items are vocoded one at a time, each as a batch of size one.
        audio = vocoder_inference(
            cfg,
            model,
            mel_batch[i].unsqueeze(0),
            f0s=f0,
            device=device,
            fast_inference=fast_inference,
        ).squeeze(0)

        # Drop the samples that correspond to padded mel frames.
        audio_length = mel_frame[i] * cfg.preprocess.hop_size
        audios.append(audio[:audio_length])
    return audios


def synthesis_audios(cfg, model, mels, f0s=None, batch_size=None, fast_inference=False):
    """Synthesize one waveform per mel-spectrogram in ``mels``.

    The inputs are padded into batches with ``pad_mels_to_tensors`` (and
    ``pad_f0_to_tensors`` when ``f0s`` is given). Every item is then passed to
    ``vocoder_inference`` individually and the result is cut to
    ``frames * cfg.preprocess.hop_size`` samples, using the per-item frame
    counts returned by ``pad_mels_to_tensors``.

    Args:
        cfg: Config object; ``cfg.preprocess.hop_size`` is read here and the
            object is forwarded to ``vocoder_inference``.
        model: The vocoder; inference runs on the device of its first
            parameter.
        mels: A list of mel-specs.
        f0s: Optional f0 sequences matching ``mels``; forwarded to
            ``vocoder_inference`` when provided.
        batch_size: Forwarded to the padding helpers.
        fast_inference: Forwarded to ``vocoder_inference``.

    Returns:
        audios: A list of audios
    """
    # Get the device
    device = next(model.parameters()).device

    # Pad the given list into tensors
    mel_batches, mel_frames = pad_mels_to_tensors(mels, batch_size)

    audios = []
    # Identity check: `== None` would be evaluated elementwise (and be
    # ambiguous in a boolean context) if f0s were an array-like object.
    if f0s is None:
        for mel_batch, mel_frame in zip(mel_batches, mel_frames):
            audios.extend(
                _vocode_padded_batch(
                    cfg, model, mel_batch, mel_frame, None, device, fast_inference
                )
            )
    else:
        f0_batches = pad_f0_to_tensors(f0s, batch_size)
        for mel_batch, f0_batch, mel_frame in zip(
            mel_batches, f0_batches, mel_frames
        ):
            audios.extend(
                _vocode_padded_batch(
                    cfg, model, mel_batch, mel_frame, f0_batch, device, fast_inference
                )
            )

    return audios