import json
import os
import re

import torch
from num2words import num2words
from pydub import AudioSegment

import utils_data as utils
from attrdict import AttrDict
from model import GradTTSWithEmo
from models import Generator as HiFiGAN
from text import convert_text

HIFIGAN_CONFIG = './configs/hifigan-config.json'
HIFIGAN_CHECKPT = './checkpts/hifigan.pt'


if __name__ == '__main__':
    hps, args = utils.get_hparams_decode()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the latest EMA checkpoint of the acoustic model.
    ckpt = utils.latest_checkpoint_path(hps.model_dir, "EMA_grad_*.pt")
    print(ckpt)
    logger = utils.get_logger(hps.model_dir, "inference.log")

    model = GradTTSWithEmo(**hps.model).to(device)
    utils.load_checkpoint(ckpt, model, None)
    _ = model.eval()

    # Load the HiFi-GAN vocoder used to turn mel-spectrograms into waveforms.
    print('Initializing HiFi-GAN...')
    with open(HIFIGAN_CONFIG) as f:
        h = AttrDict(json.load(f))
    vocoder = HiFiGAN(h).to(device)
    vocoder.load_state_dict(
        torch.load(HIFIGAN_CHECKPT, map_location=lambda storage, loc: storage)['generator'])
    _ = vocoder.eval()
    vocoder.remove_weight_norm()

    # Emotion ids index the sorted label list; speaker ids: M1 = 0, F1 = 1, M2 = 2.
    emos = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
    speakers = ['M1', 'F1', 'M2']

    # Each input line is expected to be "text|emotion_id|speaker_id".
    with open(args.file, 'r', encoding='utf-8') as f:
        texts = [line.strip() for line in f if line.strip()]

    # Split off the text part and expand digits into Kazakh number words.
    replace_nums = [line.split('|', 1) for line in texts]
    nums2word = [re.sub(r'(\d+)', lambda m: num2words(int(m.group()), lang='kz'), sentence)
                 for sentence, _ in replace_nums]
    text2speech = [f'{sentence}|{ids}' for sentence, (_, ids) in zip(nums2word, replace_nums)]

    os.makedirs(args.generated_path, exist_ok=True)

    for i, line in enumerate(text2speech):
        text, emo_i, spk_i = line.split('|')
        emo_i, spk_i = int(emo_i), int(spk_i)

        with torch.no_grad():
            # Condition the model on the requested emotion and speaker.
            emo = torch.LongTensor([emo_i]).to(device)
            sid = torch.LongTensor([spk_i]).to(device)
            text_padded, text_len = convert_text(text)

            # Decode a mel-spectrogram with the diffusion decoder.
            y_enc, y_dec, attn = model.forward(
                text_padded, text_len,
                n_timesteps=args.timesteps,
                temperature=args.noise,
                stoc=args.stoc,
                spk=sid, emo=emo,
                length_scale=1.,
                classifier_free_guidance=args.guidance)

            # Vocode the mel-spectrogram and scale to 16-bit PCM.
            res = y_dec.squeeze().cpu().numpy()
            x = torch.from_numpy(res).to(device).unsqueeze(0)
            y_g_hat = vocoder(x)
            audio = (y_g_hat.squeeze() * 32768.0).detach().cpu().numpy().astype('int16')

        # Index the file name so repeated emotion/speaker pairs do not overwrite each other.
        segment = AudioSegment(audio.tobytes(), frame_rate=22050, sample_width=2, channels=1)
        segment.export(f'{args.generated_path}/{i}_{emos[emo_i]}_{speakers[spk_i]}.wav',
                       format="wav")
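
# ---------------------------------------------------------------------------
# Usage notes (a sketch, not part of the original script).
#
# The parsing above expects each line of the input file to look like
#   <text>|<emotion_id>|<speaker_id>
# where <emotion_id> indexes the sorted `emos` list
# (angry=0, fear=1, happy=2, neutral=3, sad=4, surprise=5)
# and <speaker_id> is 0 (M1), 1 (F1) or 2 (M2), e.g.
#   Бүгін ауа райы 25 градус|3|0
#
# The exact command-line flags are defined by utils_data.get_hparams_decode(),
# so the names below are assumptions; an invocation would look roughly like:
#   python this_script.py --file sentences.txt --timesteps 50 --noise 1.5 \
#       --guidance 3.0 --generated_path ./out
# ---------------------------------------------------------------------------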