# Inference script for a fine-tuned VITS model.
import IPython.display as ipd
import torch

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    """Convert raw text into a LongTensor of symbol IDs for the model."""
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave blank tokens (ID 0), as expected by models trained
        # with add_blank=True.
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

# Paths to the fine-tuned model's config and checkpoint (machine-specific).
config_path = "C:\\Users\\zelda\\Documents\\GitHub\\vits-finetuning\\models\\kayoko\\config.json"
model_path = "C:\\Users\\zelda\\Documents\\GitHub\\vits-finetuning\\models\\kayoko\\hayoko.pth"
hps = utils.get_hparams_from_file(config_path)

# Rebuild the synthesizer with the architecture the checkpoint was trained
# with; the symbol table is read from the config.
net_g = SynthesizerTrn(
    len(hps.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()
_ = utils.load_checkpoint(model_path, net_g, None)

speaker_id = 10  #@param {type:"number"}
text = "はぁ... 落ち着いてくれないか？"  # "Haa... won't you calm down?"
noise_scale = 0.6  #@param {type:"number"}
noise_scale_w = 0.668  #@param {type:"number"}
length_scale = 1.0  #@param {type:"number"}

# noise_scale controls the variability of the synthesized audio,
# noise_scale_w the stochastic duration predictor, and length_scale
# the speaking rate (>1.0 is slower, <1.0 is faster).
stn_tst = get_text(text, hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)  # add a batch dimension
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([speaker_id]).cuda()
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=noise_scale,
        noise_scale_w=noise_scale_w,
        length_scale=length_scale,
    )[0][0, 0].data.cpu().float().numpy()

# Play the result inline (requires an IPython/Jupyter frontend).
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
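
# Optionally write the audio to disk via the scipy import above. A minimal
# sketch, assuming float32 output in [-1, 1]; the filename is arbitrary
# (not part of the original script).
write("output.wav", hps.data.sampling_rate, audio)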