# vits-for-ba / run.py

import IPython.display as ipd
import torch
from scipy.io.wavfile import write

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence

def get_text(text, hps):
    """Convert raw text to a LongTensor of symbol IDs for the model."""
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Intersperse blank tokens between symbols, matching training.
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

# Local paths to the fine-tuned checkpoint and its config; adjust for your machine.
config_path = "C:\\Users\\zelda\\Documents\\GitHub\\vits-finetuning\\models\\kayoko\\config.json"
model_path = "C:\\Users\\zelda\\Documents\\GitHub\\vits-finetuning\\models\\kayoko\\hayoko.pth"
hps = utils.get_hparams_from_file(config_path)

# Build the synthesizer from the config and move it to the GPU (CUDA required).
net_g = SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()
_ = utils.load_checkpoint(model_path, net_g, None)

speaker_id = 10  #@param {type:"number"}
# Input text (Japanese): "Sigh... won't you calm down?"
text = "はぁ... 落ち着いてくれないか？"
# noise_scale controls prosody variation, noise_scale_w the stochastic
# duration predictor, and length_scale the speaking rate (>1 is slower).
noise_scale = 0.6  #@param {type:"number"}
noise_scale_w = 0.668  #@param {type:"number"}
length_scale = 1.0  #@param {type:"number"}

stn_tst = get_text(text, hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([speaker_id]).cuda()
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=noise_scale,
        noise_scale_w=noise_scale_w,
        length_scale=length_scale,
    )[0][0, 0].data.cpu().float().numpy()

# Play the result inline (only visible when running inside IPython/Jupyter).
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
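
# ipd.display is a no-op outside a notebook, so when running this as a plain
# script it also helps to write the waveform to disk. A minimal sketch using
# the already-imported scipy.io.wavfile.write; the output filename here is
# arbitrary, and the float32 array is written as an IEEE-float WAV.
write("output.wav", hps.data.sampling_rate, audio)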