import os

import torch
import gradio as gr

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence


def get_text(text, hps):
    # Convert raw text to a sequence of symbol IDs using the cleaners
    # configured for this model.
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Insert a blank token between symbols, as expected by models
        # trained with add_blank=True.
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm


def speak(text, speaker_id, character, noise_scale, noise_scale_w, length_scale, device_name):
    # Load the selected character's checkpoint and the shared config.
    model_path = os.path.join("models", f"{character}.pth")
    config_path = os.path.join("models", "config.json")
    device = torch.device(device_name)

    hps = utils.get_hparams_from_file(config_path)
    net_g = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model)
    net_g.eval().to(device)
    utils.load_checkpoint(model_path, net_g, None)

    # Inference.
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([int(speaker_id)]).to(device)
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid,
                            noise_scale=noise_scale,
                            noise_scale_w=noise_scale_w,
                            length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
    # Return (sampling_rate, waveform) as expected by a gr.Audio output;
    # the rate comes from the config instead of being hardcoded.
    return hps.data.sampling_rate, audio


demo = gr.Interface(
    fn=speak,
    inputs=[
        gr.Textbox(value="おはようございます。", label="Text"),
        gr.Number(value=10, precision=0, label="Speaker ID"),
        gr.Radio(["aru", "haruka", "hifumi", "kayoko", "koharu", "mutsuki"],
                 value="aru", label="Character"),
        gr.Number(value=0.6, label="noise_scale"),
        gr.Number(value=0.668, label="noise_scale_w"),
        gr.Number(value=1.0, label="length_scale"),
        # "gpu" is not a valid torch device string, so only cpu/cuda are offered.
        gr.Radio(["cpu", "cuda"], value="cpu", label="Device"),
    ],
    outputs="audio",
)

demo.launch(share=False)
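
# A minimal sketch of calling speak() directly, without the Gradio UI.
# It assumes models/aru.pth and models/config.json exist and that speaker
# ID 10 is valid for that checkpoint:
#
#   from scipy.io.wavfile import write
#   sr, audio = speak("おはようございます。", 10, "aru", 0.6, 0.668, 1.0, "cpu")
#   write("output.wav", sr, audio)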