import json
import os
import random
import shutil
import time
from glob import glob
from pathlib import Path

import accelerate
import json5
import numpy as np
import torch
from accelerate.logging import get_logger
from torch.utils.data import DataLoader
from tqdm import tqdm

from models.vocoders.vocoder_dataset import (
    VocoderDataset,
    VocoderCollator,
    VocoderConcatDataset,
)
from models.vocoders.gan.generator import bigvgan, hifigan, melgan, nsfhifigan, apnet
from models.vocoders.flow.waveglow import waveglow
from models.vocoders.diffusion.diffwave import diffwave
from models.vocoders.autoregressive.wavenet import wavenet
from models.vocoders.autoregressive.wavernn import wavernn
from models.vocoders.gan import gan_vocoder_inference
from models.vocoders.diffusion import diffusion_vocoder_inference
from utils.io import save_audio


# Maps the ``cfg.model.generator`` name to its model class.
_vocoders = {
    "diffwave": diffwave.DiffWave,
    "wavernn": wavernn.WaveRNN,
    "wavenet": wavenet.WaveNet,
    "waveglow": waveglow.WaveGlow,
    "nsfhifigan": nsfhifigan.NSFHiFiGAN,
    "bigvgan": bigvgan.BigVGAN,
    "hifigan": hifigan.HiFiGAN,
    "melgan": melgan.MelGAN,
    "apnet": apnet.APNet,
}

# Batch-level forward functions used by ``VocoderInference.inference``.
_vocoder_forward_funcs = {
    "diffwave": diffusion_vocoder_inference.vocoder_inference,
    "nsfhifigan": gan_vocoder_inference.vocoder_inference,
    "bigvgan": gan_vocoder_inference.vocoder_inference,
    "melgan": gan_vocoder_inference.vocoder_inference,
    "hifigan": gan_vocoder_inference.vocoder_inference,
    "apnet": gan_vocoder_inference.vocoder_inference,
}

# List-level synthesis functions used by ``synthesis`` below.
_vocoder_infer_funcs = {
    "diffwave": diffusion_vocoder_inference.synthesis_audios,
    "nsfhifigan": gan_vocoder_inference.synthesis_audios,
    "bigvgan": gan_vocoder_inference.synthesis_audios,
    "melgan": gan_vocoder_inference.synthesis_audios,
    "hifigan": gan_vocoder_inference.synthesis_audios,
    "apnet": gan_vocoder_inference.synthesis_audios,
}
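
# Note that "wavernn", "wavenet", and "waveglow" are registered in ``_vocoders``
# but have no entry in the two dispatch tables above, so selecting them as
# ``cfg.model.generator`` raises a KeyError at inference time.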


class VocoderInference(object):
    def __init__(self, args=None, cfg=None, infer_type="infer_from_dataset"):
        super().__init__()

        self.args = args
        self.cfg = cfg
        self.infer_type = infer_type

        # Init accelerator
        self.accelerator = accelerate.Accelerator()
        self.accelerator.wait_for_everyone()

        # Get logger
        with self.accelerator.main_process_first():
            self.logger = get_logger("inference", log_level=args.log_level)

        # Log some info
        self.logger.info("=" * 56)
        self.logger.info("||\t\t" + "New inference process started." + "\t\t||")
        self.logger.info("=" * 56)
        self.logger.info("\n")

        self.vocoder_dir = args.vocoder_dir
        self.logger.debug(f"Vocoder dir: {args.vocoder_dir}")

        # Reset the output folders for predicted and ground-truth audios
        os.makedirs(args.output_dir, exist_ok=True)
        if os.path.exists(os.path.join(args.output_dir, "pred")):
            shutil.rmtree(os.path.join(args.output_dir, "pred"))
        if os.path.exists(os.path.join(args.output_dir, "gt")):
            shutil.rmtree(os.path.join(args.output_dir, "gt"))
        os.makedirs(os.path.join(args.output_dir, "pred"), exist_ok=True)
        os.makedirs(os.path.join(args.output_dir, "gt"), exist_ok=True)

        # Set random seed
        with self.accelerator.main_process_first():
            start = time.monotonic_ns()
            self._set_random_seed(self.cfg.train.random_seed)
            end = time.monotonic_ns()
            self.logger.debug(
                f"Setting random seed done in {(end - start) / 1e6:.2f}ms"
            )
            self.logger.debug(f"Random seed: {self.cfg.train.random_seed}")

        # Resolve the dataset(s) to infer on; feature- and audio-based inference
        # first build a temporary "tmp" dataset under cfg.preprocess.processed_dir.
        if self.infer_type == "infer_from_dataset":
            self.cfg.dataset = self.args.infer_datasets
        elif self.infer_type == "infer_from_feature":
            self._build_tmp_dataset_from_feature()
            self.cfg.dataset = ["tmp"]
        elif self.infer_type == "infer_from_audio":
            self._build_tmp_dataset_from_audio()
            self.cfg.dataset = ["tmp"]

        # Build dataloader
        with self.accelerator.main_process_first():
            self.logger.info("Building dataset...")
            start = time.monotonic_ns()
            self.test_dataloader = self._build_dataloader()
            end = time.monotonic_ns()
            self.logger.info(f"Building dataset done in {(end - start) / 1e6:.2f}ms")

        # Build model
        with self.accelerator.main_process_first():
            self.logger.info("Building model...")
            start = time.monotonic_ns()
            self.model = self._build_model()
            end = time.monotonic_ns()
            self.logger.info(f"Building model done in {(end - start) / 1e6:.2f}ms")

        # Wrap the model and dataloader with accelerate
        self.logger.info("Initializing accelerate...")
        start = time.monotonic_ns()
        (self.model, self.test_dataloader) = self.accelerator.prepare(
            self.model, self.test_dataloader
        )
        end = time.monotonic_ns()
        self.accelerator.wait_for_everyone()
        self.logger.info(f"Initializing accelerate done in {(end - start) / 1e6:.2f}ms")

        # Load checkpoint: prefer a "checkpoint" subfolder if one exists
        with self.accelerator.main_process_first():
            self.logger.info("Loading checkpoint...")
            start = time.monotonic_ns()
            if os.path.isdir(os.path.join(args.vocoder_dir, "checkpoint")):
                self._load_model(os.path.join(args.vocoder_dir, "checkpoint"))
            else:
                self._load_model(args.vocoder_dir)
            end = time.monotonic_ns()
            self.logger.info(f"Loading checkpoint done in {(end - start) / 1e6:.2f}ms")

        self.model.eval()
        self.accelerator.wait_for_everyone()

    def _build_tmp_dataset_from_feature(self):
        """Build a temporary "tmp" dataset from a folder of pre-extracted features."""
        if os.path.exists(os.path.join(self.cfg.preprocess.processed_dir, "tmp")):
            shutil.rmtree(os.path.join(self.cfg.preprocess.processed_dir, "tmp"))

        utts = []
        mels = glob(os.path.join(self.args.feature_folder, "mels", "*.npy"))
        for i, mel in enumerate(mels):
            uid = os.path.splitext(os.path.basename(mel))[0]
            utts.append({"Dataset": "tmp", "Uid": uid, "index": i})

        os.makedirs(os.path.join(self.cfg.preprocess.processed_dir, "tmp"))
        with open(
            os.path.join(self.cfg.preprocess.processed_dir, "tmp", "test.json"), "w"
        ) as f:
            json.dump(utts, f)

        meta_info = {"dataset": "tmp", "test": {"size": len(utts)}}

        with open(
            os.path.join(self.cfg.preprocess.processed_dir, "tmp", "meta_info.json"),
            "w",
        ) as f:
            json.dump(meta_info, f)

        # Copy every feature subfolder under feature_folder into the tmp dataset.
        features = glob(os.path.join(self.args.feature_folder, "*"))
        for feature in features:
            if os.path.isfile(feature):
                continue
            feature_name = os.path.basename(feature)
            shutil.copytree(
                feature,
                os.path.join(self.cfg.preprocess.processed_dir, "tmp", feature_name),
            )
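
    # Expected layout of ``args.feature_folder``, inferred from the globs above
    # (the "mels" subfolder is what test.json is built from; any other feature
    # subfolders are copied over verbatim):
    #
    #   <feature_folder>/
    #       mels/
    #           <uid>.npy
    #       <other_feature>/...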

    def _build_tmp_dataset_from_audio(self):
        """Build a temporary "tmp" dataset from a folder of raw audio files."""
        if os.path.exists(os.path.join(self.cfg.preprocess.processed_dir, "tmp")):
            shutil.rmtree(os.path.join(self.cfg.preprocess.processed_dir, "tmp"))

        utts = []
        audios = glob(os.path.join(self.args.audio_folder, "*"))
        for i, audio in enumerate(audios):
            uid = os.path.splitext(os.path.basename(audio))[0]
            utts.append({"Dataset": "tmp", "Uid": uid, "index": i, "Path": audio})

        os.makedirs(os.path.join(self.cfg.preprocess.processed_dir, "tmp"))
        with open(
            os.path.join(self.cfg.preprocess.processed_dir, "tmp", "test.json"), "w"
        ) as f:
            json.dump(utts, f)

        meta_info = {"dataset": "tmp", "test": {"size": len(utts)}}

        with open(
            os.path.join(self.cfg.preprocess.processed_dir, "tmp", "meta_info.json"),
            "w",
        ) as f:
            json.dump(meta_info, f)

        # Local import: the extractor is only needed for this inference mode.
        from processors import acoustic_extractor

        acoustic_extractor.extract_utt_acoustic_features_serial(
            utts, os.path.join(self.cfg.preprocess.processed_dir, "tmp"), self.cfg
        )

    def _build_test_dataset(self):
        return VocoderDataset, VocoderCollator

    def _build_model(self):
        model = _vocoders[self.cfg.model.generator](self.cfg)
        return model

    def _build_dataloader(self):
        """Build a dataloader that merges a series of datasets."""
        Dataset, Collator = self._build_test_dataset()

        datasets_list = []
        for dataset in self.cfg.dataset:
            subdataset = Dataset(self.cfg, dataset, is_valid=True)
            datasets_list.append(subdataset)
        test_dataset = VocoderConcatDataset(datasets_list, full_audio_inference=False)
        test_collate = Collator(self.cfg)
        test_batch_size = min(self.cfg.inference.batch_size, len(test_dataset))
        test_dataloader = DataLoader(
            test_dataset,
            collate_fn=test_collate,
            num_workers=1,
            batch_size=test_batch_size,
            shuffle=False,
        )
        self.test_batch_size = test_batch_size
        self.test_dataset = test_dataset
        return test_dataloader

    def _load_model(self, checkpoint_dir, from_multi_gpu=False):
        """Load the model from a checkpoint.

        If a folder is given, the latest checkpoint inside it is loaded; if a
        file path is given, that specific checkpoint is loaded.
        **Only use this method after** ``accelerator.prepare()``.
        """
        if os.path.isdir(checkpoint_dir):
            if "epoch" in checkpoint_dir and "step" in checkpoint_dir:
                # The path already points at a specific "epoch-xxxx_step-xxxx" folder.
                checkpoint_path = checkpoint_dir
            else:
                # Pick the checkpoint folder with the highest epoch number.
                ls = [
                    str(i)
                    for i in Path(checkpoint_dir).glob("*")
                    if "audio" not in str(i)
                ]
                ls.sort(
                    key=lambda x: int(x.split("/")[-1].split("_")[0].split("-")[-1]),
                    reverse=True,
                )
                checkpoint_path = ls[0]
            accelerate.load_checkpoint_and_dispatch(
                self.accelerator.unwrap_model(self.model),
                os.path.join(checkpoint_path, "pytorch_model.bin"),
            )
            return str(checkpoint_path)
        else:
            # A plain checkpoint file (.pt/.bin).
            if self.cfg.model.generator in [
                "bigvgan",
                "hifigan",
                "melgan",
                "nsfhifigan",
            ]:
                ckpt = torch.load(
                    checkpoint_dir,
                    map_location=(
                        torch.device("cuda")
                        if torch.cuda.is_available()
                        else torch.device("cpu")
                    ),
                )
                if from_multi_gpu:
                    # Strip the "module." prefix added by DataParallel/DDP and
                    # keep only tensors whose shapes match the current model.
                    pretrained_generator_dict = ckpt["generator_state_dict"]
                    generator_dict = self.model.state_dict()

                    new_generator_dict = {
                        k.split("module.")[-1]: v
                        for k, v in pretrained_generator_dict.items()
                        if (
                            k.split("module.")[-1] in generator_dict
                            and v.shape == generator_dict[k.split("module.")[-1]].shape
                        )
                    }

                    generator_dict.update(new_generator_dict)

                    self.model.load_state_dict(generator_dict)
                else:
                    self.model.load_state_dict(ckpt["generator_state_dict"])
            else:
                self.model.load_state_dict(torch.load(checkpoint_dir)["state_dict"])
            return str(checkpoint_dir)

    def inference(self):
        """Inference via batches."""
        for i, batch in tqdm(enumerate(self.test_dataloader)):
            if self.cfg.preprocess.use_frame_pitch:
                audio_pred = _vocoder_forward_funcs[self.cfg.model.generator](
                    self.cfg,
                    self.model,
                    batch["mel"].transpose(-1, -2),
                    f0s=batch["frame_pitch"].float(),
                    device=next(self.model.parameters()).device,
                )
            else:
                audio_pred = _vocoder_forward_funcs[self.cfg.model.generator](
                    self.cfg,
                    self.model,
                    batch["mel"].transpose(-1, -2),
                    device=next(self.model.parameters()).device,
                )
            audio_ls = audio_pred.chunk(self.test_batch_size)
            audio_gt_ls = batch["audio"].cpu().chunk(self.test_batch_size)
            length_ls = batch["target_len"].cpu().chunk(self.test_batch_size)
            j = 0
            for it, it_gt, l in zip(audio_ls, audio_gt_ls, length_ls):
                # Trim each sample to its true length (frames * hop size).
                l = l.item()
                it = it.squeeze(0).squeeze(0)[: l * self.cfg.preprocess.hop_size]
                it_gt = it_gt.squeeze(0)[: l * self.cfg.preprocess.hop_size]
                uid = self.test_dataset.metadata[i * self.test_batch_size + j]["Uid"]
                save_audio(
                    os.path.join(self.args.output_dir, "pred", f"{uid}.wav"),
                    it,
                    self.cfg.preprocess.sample_rate,
                )
                save_audio(
                    os.path.join(self.args.output_dir, "gt", f"{uid}.wav"),
                    it_gt,
                    self.cfg.preprocess.sample_rate,
                )
                j += 1

        # Clean up the temporary dataset built for feature/audio inference.
        if os.path.exists(os.path.join(self.cfg.preprocess.processed_dir, "tmp")):
            shutil.rmtree(os.path.join(self.cfg.preprocess.processed_dir, "tmp"))

    def _set_random_seed(self, seed):
        """Set the random seed for all relevant random modules."""
        random.seed(seed)
        np.random.seed(seed)
        # torch.manual_seed also seeds the CUDA RNGs on all devices.
        torch.manual_seed(seed)

    def _count_parameters(self, model):
        return sum(p.numel() for p in model.parameters())

    def _dump_cfg(self, path):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as f:
            json5.dump(
                self.cfg,
                f,
                indent=4,
                sort_keys=True,
                ensure_ascii=False,
                quote_keys=True,
            )
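
# A minimal usage sketch (illustrative only). ``args`` is normally built by the
# caller's CLI parser; the attribute values below are hypothetical.
#
#   from argparse import Namespace
#
#   args = Namespace(
#       log_level="info",                      # hypothetical
#       vocoder_dir="ckpts/vocoder/hifigan",   # hypothetical path
#       output_dir="result/vocoder_test",      # hypothetical path
#       feature_folder="data/tmp_features",    # hypothetical path
#       infer_datasets=None,
#   )
#   VocoderInference(args, cfg, infer_type="infer_from_feature").inference()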


def load_nnvocoder(
    cfg,
    vocoder_name,
    weights_file,
    from_multi_gpu=False,
):
    """Load the specified vocoder.

    cfg: the vocoder config.
    vocoder_name: a key of ``_vocoders``.
    weights_file: a folder or a .pt path.
    from_multi_gpu: if True, strip the "module." prefix that multi-GPU training
        adds to state-dict keys.
    """
    print("Loading Vocoder from Weights file: {}".format(weights_file))

    model = _vocoders[vocoder_name](cfg)
    if not os.path.isdir(weights_file):
        # A plain checkpoint file (.pt/.bin).
        if vocoder_name in ["bigvgan", "hifigan", "melgan", "nsfhifigan"]:
            ckpt = torch.load(
                weights_file,
                map_location=(
                    torch.device("cuda")
                    if torch.cuda.is_available()
                    else torch.device("cpu")
                ),
            )
            if from_multi_gpu:
                # Strip the "module." prefix and keep only shape-matching tensors.
                pretrained_generator_dict = ckpt["generator_state_dict"]
                generator_dict = model.state_dict()

                new_generator_dict = {
                    k.split("module.")[-1]: v
                    for k, v in pretrained_generator_dict.items()
                    if (
                        k.split("module.")[-1] in generator_dict
                        and v.shape == generator_dict[k.split("module.")[-1]].shape
                    )
                }

                generator_dict.update(new_generator_dict)

                model.load_state_dict(generator_dict)
            else:
                model.load_state_dict(ckpt["generator_state_dict"])
        else:
            model.load_state_dict(torch.load(weights_file)["state_dict"])
    else:
        # An accelerate checkpoint folder: pick the latest checkpoint inside it.
        weights_file = os.path.join(weights_file, "checkpoint")
        ls = [str(i) for i in Path(weights_file).glob("*") if "audio" not in str(i)]
        ls.sort(key=lambda x: int(x.split("_")[-3].split("-")[-1]), reverse=True)
        checkpoint_path = ls[0]
        accelerator = accelerate.Accelerator()
        model = accelerator.prepare(model)
        accelerator.load_state(checkpoint_path)

    if torch.cuda.is_available():
        model = model.cuda()

    model = model.eval()
    return model
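
# For reference, a direct call mirroring how ``VocoderInference.inference``
# drives the forward functions (the checkpoint path is hypothetical):
#
#   vocoder = load_nnvocoder(cfg, "hifigan", "ckpts/hifigan/g_00100000.pt")
#   wav = gan_vocoder_inference.vocoder_inference(
#       cfg, vocoder, mel_batch, device=next(vocoder.parameters()).device
#   )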


def tensorize(data, device, n_samples):
    """Move a list of numpy arrays onto ``device`` as tensors.

    data: a list of numpy arrays.
    n_samples: if truthy, keep only the first ``n_samples`` entries.
    """
    assert isinstance(data, list)
    if n_samples:
        data = data[:n_samples]
    data = [torch.as_tensor(x, device=device) for x in data]
    return data


def synthesis(
    cfg,
    vocoder_weight_file,
    n_samples,
    pred,
    f0s=None,
    batch_size=64,
    fast_inference=False,
):
    """Synthesize audios with a given vocoder from a series of given features.

    cfg: vocoder config.
    vocoder_weight_file: a folder of accelerate state dicts or a path to a .pt file.
    pred: a list of numpy arrays, [(seq_len1, acoustic_features_dim),
        (seq_len2, acoustic_features_dim), ...].
    """
    vocoder_name = cfg.model.generator

    print("Synthesis audios using {} vocoder...".format(vocoder_name))

    # Load the vocoder; checkpoints are assumed to come from multi-GPU training.
    vocoder = load_nnvocoder(
        cfg, vocoder_name, weights_file=vocoder_weight_file, from_multi_gpu=True
    )
    device = next(vocoder.parameters()).device

    # Transpose each (seq_len, dim) feature to (dim, seq_len) and move it to device.
    mels_pred = tensorize([p.T for p in pred], device, n_samples)
    print("For predicted mels, #sample = {}...".format(len(mels_pred)))
    audios_pred = _vocoder_infer_funcs[vocoder_name](
        cfg,
        vocoder,
        mels_pred,
        f0s=f0s,
        batch_size=batch_size,
        fast_inference=fast_inference,
    )
    return audios_pred
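
# A hedged end-to-end sketch feeding acoustic-model outputs through ``synthesis``.
# ``mel_outputs`` and the checkpoint folder are hypothetical; the shape of each
# returned waveform depends on the chosen vocoder's synthesis function.
#
#   mel_outputs = [m.cpu().numpy() for m in acoustic_model_outputs]  # (T, n_mels)
#   audios = synthesis(cfg, "ckpts/vocoder/bigvgan", n_samples=None, pred=mel_outputs)
#   for i, wav in enumerate(audios):
#       save_audio(f"pred_{i}.wav", wav.squeeze(), cfg.preprocess.sample_rate)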