# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import argparse
import warnings
from collections import OrderedDict

import torch
import torchaudio
import librosa
import yaml

warnings.simplefilter("ignore")

from .modules.commons import *  # provides build_model()


class FAcodecInference(object):
    def __init__(self, args=None, cfg=None):
        self.args = args
        self.cfg = cfg
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self._build_model()
        self._load_checkpoint()

    def _build_model(self):
        model = build_model(self.cfg.model_params)
        _ = [model[key].to(self.device) for key in model]
        return model

    def _load_checkpoint(self):
        sd = torch.load(self.args.checkpoint_path, map_location="cpu")
        sd = sd["net"] if "net" in sd else sd
        # Strip any "module." prefix left over from DataParallel/DDP training
        # so the weights load into the bare (unwrapped) submodules.
        new_params = dict()
        for key, state_dict in sd.items():
            new_state_dict = OrderedDict()
            for k, v in state_dict.items():
                if k.startswith("module."):
                    k = k[len("module.") :]
                new_state_dict[k] = v
            new_params[key] = new_state_dict
        for key in new_params:
            if key in self.model:
                self.model[key].load_state_dict(new_params[key])
        _ = [self.model[key].eval() for key in self.model]

    @torch.no_grad()
    def inference(self, source, output_dir):
        """Encode, quantize, and decode `source`, saving the reconstruction.

        Returns the quantized representations and their discrete codes.
        """
        source_audio = librosa.load(source, sr=self.cfg.preprocess_params.sr)[0]
        source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(self.device)

        # source_audio is (1, samples); the encoder expects (batch, 1, samples).
        z = self.model.encoder(source_audio[None, ...])
        (
            z,
            quantized,
            commitment_loss,
            codebook_loss,
            timbre,
            codes,
        ) = self.model.quantizer(
            z,
            source_audio[None, ...],
            n_c=self.cfg.model_params.n_c_codebooks,
            return_codes=True,
        )

        full_pred_wave = self.model.decoder(z)

        os.makedirs(output_dir, exist_ok=True)
        source_name = os.path.splitext(os.path.basename(source))[0]
        out_path = f"{output_dir}/reconstructed_{source_name}.wav"
        torchaudio.save(
            out_path,
            full_pred_wave[0].cpu(),
            self.cfg.preprocess_params.sr,
        )
        print(f"Reconstructed audio saved as: {out_path}")

        return quantized, codes

    @torch.no_grad()
    def voice_conversion(self, source, reference, output_dir):
        """Convert `source` to the voice of `reference` and save the result."""
        source_audio = librosa.load(source, sr=self.cfg.preprocess_params.sr)[0]
        source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(self.device)

        reference_audio = librosa.load(reference, sr=self.cfg.preprocess_params.sr)[0]
        reference_audio = (
            torch.tensor(reference_audio).unsqueeze(0).float().to(self.device)
        )

        # Encode and quantize the source utterance.
        z = self.model.encoder(source_audio[None, ...])
        z, quantized, commitment_loss, codebook_loss, timbre = self.model.quantizer(
            z,
            source_audio[None, ...],
            n_c=self.cfg.model_params.n_c_codebooks,
        )

        # Run the reference through the same pipeline. Its outputs are not
        # consumed below; the quantizer's voice_conversion presumably
        # re-derives the timbre internally from the reference waveform.
        z_ref = self.model.encoder(reference_audio[None, ...])
        (
            z_ref,
            quantized_ref,
            commitment_loss_ref,
            codebook_loss_ref,
            timbre_ref,
        ) = self.model.quantizer(
            z_ref,
            reference_audio[None, ...],
            n_c=self.cfg.model_params.n_c_codebooks,
        )

        # Sum the first two quantized streams (the non-timbre factors) and
        # let the quantizer re-style them with the reference audio.
        z_conv = self.model.quantizer.voice_conversion(
            quantized[0] + quantized[1],
            reference_audio[None, ...],
        )
        full_pred_wave = self.model.decoder(z_conv)

        os.makedirs(output_dir, exist_ok=True)
        source_name = os.path.splitext(os.path.basename(source))[0]
        reference_name = os.path.splitext(os.path.basename(reference))[0]
        out_path = f"{output_dir}/converted_{source_name}_to_{reference_name}.wav"
        torchaudio.save(
            out_path,
            full_pred_wave[0].cpu(),
            self.cfg.preprocess_params.sr,
        )
        print(f"Voice conversion results saved as: {out_path}")