# vits-for-ba / run_gradio.py
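"""Gradio demo for multi-speaker VITS text-to-speech.

Loads a per-character SynthesizerTrn checkpoint from models/<character>.pth
(with a shared config in models/config.json) and serves inference through a
gradio.Interface.
"""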
import os

import torch
import gradio as gr

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence


def get_text(text, hps):
    """Convert raw text to a LongTensor of symbol ids using the config's cleaners."""
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm
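
# Example: with add_blank=True, commons.intersperse turns [a, b] into
# [0, a, 0, b, 0], i.e. a blank token (id 0) around every symbol, matching
# how VITS models are trained.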


def speak(text, speaker_id, character, noise_scale, noise_scale_w, length_scale, device_name):
    """Synthesize `text` with the selected character's checkpoint; return (rate, audio)."""
    # Load the character-specific checkpoint; all characters share one config.
    # os.path.join keeps the paths portable (the original hard-coded backslashes
    # only work on Windows).
    model_path = os.path.join("models", f"{character}.pth")
    config_path = os.path.join("models", "config.json")
    device = torch.device(device_name)
    hps = utils.get_hparams_from_file(config_path)
    net_g = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model)
    net_g = net_g.eval().to(device)
    utils.load_checkpoint(model_path, net_g, None)

    # Inference
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([int(speaker_id)]).to(device)
        audio = net_g.infer(
            x_tst, x_tst_lengths, sid=sid,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )[0][0, 0].data.cpu().float().numpy()
    # Return the config's sampling rate rather than a hard-coded 22050.
    return (hps.data.sampling_rate, audio)
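
# A minimal smoke test of speak(), assuming a checkpoint such as
# models/aru.pth exists next to models/config.json (this repo's layout):
#
#   sr, wav = speak("おはようございます。", 10, "aru", 0.6, 0.668, 1.0, "cpu")
#   from scipy.io.wavfile import write
#   write("out.wav", sr, wav)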

demo = gr.Interface(
    fn=speak,
    inputs=[
        gr.Textbox(value="おはようございます。", label="Text"),  # "Good morning."
        gr.Number(value=10, precision=0, label="Speaker ID"),
        gr.Radio(["aru", "haruka", "hifumi", "kayoko", "koharu", "mutsuki"],
                 value="aru", label="Character"),
        gr.Number(value=0.6, label="noise_scale"),
        gr.Number(value=0.668, label="noise_scale_w"),
        gr.Number(value=1.0, label="length_scale"),
        # "gpu" is not a valid torch.device string, so only cpu/cuda are offered.
        gr.Radio(["cpu", "cuda"], value="cpu", label="Device"),
    ],
    outputs="audio",
)
demo.launch(share=False)
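# Tip: demo.launch(share=True) would additionally expose a temporary public
# Gradio URL for the demo.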