"""Single-utterance VITS text-to-speech inference script.

Loads a fine-tuned VITS checkpoint, synthesizes one Japanese utterance for a
fixed speaker id, and plays it inline via IPython audio display.

Requires project-local modules (`commons`, `utils`, `models`, `text`) and a
CUDA-capable GPU (tensors and the model are moved with `.cuda()`).
"""

import matplotlib.pyplot as plt  # NOTE(review): unused here; kept in case other cells/chunks rely on it
import IPython.display as ipd
import os    # NOTE(review): unused
import json  # NOTE(review): unused
import math  # NOTE(review): unused

import torch

import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from scipy.io.wavfile import write  # NOTE(review): unused


def get_text(text, hps):
    """Convert raw text to a LongTensor of symbol ids per the model's config.

    Args:
        text: The input string to synthesize.
        hps:  Hyperparameters object; uses ``hps.data.text_cleaners`` and
              ``hps.data.add_blank``.

    Returns:
        torch.LongTensor of symbol ids, with a blank (0) interleaved between
        symbols when ``add_blank`` is set (matches training-time preprocessing).
    """
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave blank tokens — must mirror the training data pipeline.
        text_norm = commons.intersperse(text_norm, 0)
    return torch.LongTensor(text_norm)


def main():
    """Build the model, load the checkpoint, and synthesize one utterance."""
    # Raw strings avoid backslash-escape pitfalls in Windows paths.
    config_path = r"C:\Users\zelda\Documents\GitHub\vits-finetuning\models\kayoko\config.json"
    model_path = r"C:\Users\zelda\Documents\GitHub\vits-finetuning\models\kayoko\hayoko.pth"

    hps = utils.get_hparams_from_file(config_path)

    net_g = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model,
    ).cuda()
    net_g.eval()  # inference mode: disables dropout etc.

    # Load weights in place; the return value (model/optimizer state tuple in
    # upstream VITS — TODO confirm for this fork) is not needed here.
    _ = utils.load_checkpoint(model_path, net_g, None)

    speaker_id = 10  # @param {type:"number"}
    text = "\u306F\u3041... \u843D\u3061\u7740\u3044\u3066\u304F\u308C\u306A\u3044\u304B\uFF1F"
    noise_scale = 0.6      # @param {type:"number"}  stochastic duration noise
    noise_scale_w = 0.668  # @param {type:"number"}
    length_scale = 1.0     # @param {type:"number"}  >1 slows speech down

    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.cuda().unsqueeze(0)  # add batch dimension
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
        sid = torch.LongTensor([speaker_id]).cuda()
        audio = net_g.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0].data.cpu().float().numpy()

    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))


if __name__ == "__main__":
    main()