"""Gradio demo: emotional VITS TTS for 9-nine- characters."""

import os
import random

import gradio as gr
import numpy as np
import torch

import commons
import utils
from models import SynthesizerTrn
from text import text_to_sequence
from text.symbols import symbols


def get_text(text, hps):
    """Convert raw text to a tensor of symbol ids using the configured cleaners."""
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Interleave a blank token (id 0) between symbols, as VITS expects.
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm
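
# For illustration: commons.intersperse(seq, 0) pads the sequence with the
# blank id 0 between and around every symbol, e.g. [5, 9, 3] becomes
# [0, 5, 0, 9, 0, 3, 0], which is the layout the model was trained on when
# add_blank is enabled.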


def tts(txt, emotion, index, hps, net_g, random_emotion_root):
    """Synthesize `txt` for speaker `index`.

    `emotion` is either a path to a reference emotion embedding (.emo.npy)
    or the string "random_sample" to draw one at random from
    `random_emotion_root`.
    """
    stn_tst = get_text(txt, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([index])  # speaker id
        if os.path.exists(f"{emotion}"):
            # Use the given embedding file directly.
            emo = torch.FloatTensor(np.load(f"{emotion}")).unsqueeze(0)
            path = emotion
        elif emotion == "random_sample":
            # Pick a random embedding from the character's folder.
            rand_wav = random.choice(os.listdir(random_emotion_root))
            path = f"{random_emotion_root}/{rand_wav}"
            emo = torch.FloatTensor(np.load(path)).unsqueeze(0)
            print(path)
        else:
            raise ValueError("invalid `emotion`: expected an existing .emo.npy "
                             "path or 'random_sample'")

        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667,
                            noise_scale_w=0.8, length_scale=1,
                            emo=emo)[0][0, 0].data.float().numpy()
    return audio, path
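
# Usage sketch (hypothetical file names; assumes `hps` and `net_g` are already
# loaded as in runVits() below):
#
#     audio, ref = tts("こんにちは。", "random_sample", 1, hps, net_g, "9nineEmo/sr")
#     audio, ref = tts("こんにちは。", "9nineEmo/sr/sr0001.wav.emo.npy", 1, hps, net_g, "9nineEmo/sr")
#
# In both cases `ref` records which embedding conditioned the synthesis.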


def random_generate(txt, index, hps, net_g, random_emotion_root):
    audio, rand_wav = tts(txt, emotion='random_sample', index=index, hps=hps,
                          net_g=net_g, random_emotion_root=random_emotion_root)
    return audio, rand_wav


def characterRoot(name):
    """Map a character name to (emotion-embedding folder, speaker index)."""
    roots = {
        '九条都': ("9nineEmo/my", 0),
        '新海天': ("9nineEmo/sr", 1),
        '结城希亚': ("9nineEmo/na", 2),
        '蕾娜': ("9nineEmo/gt", 3),
        '索菲': ("9nineEmo/sf", 4),
    }
    random_emotion_root, index = roots[name]
    return random_emotion_root, index
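
# To add a character, place its .emo.npy files under 9nineEmo/<tag>/ and add a
# '<name>': ("9nineEmo/<tag>", <speaker index>) entry above; the index must
# match the speaker id the multi-speaker model was trained with.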


def configSelect(config):
    """Pick the config file and checkpoint for the multi- or single-speaker model."""
    if config == 'mul':
        config_file = "./configs/9nine_multi.json"
        checkpoint = "logs/9nineM/G_252000.pth"
    elif config == "single":
        config_file = "./configs/sora.json"
        checkpoint = "logs/sora/G_341200.pth"
    else:
        raise ValueError("config must be 'mul' or 'single'")
    return config_file, checkpoint


def runVits(name, config, txt, emotion):
    config_file, checkpoint = configSelect(config)
    random_emotion_root, index = characterRoot(name=name)
    hps = utils.get_hparams_from_file(config_file)
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model)
    _ = net_g.eval()
    _ = utils.load_checkpoint(checkpoint, net_g, None)
    audio, rand_wav = tts(txt, emotion=emotion, index=index, hps=hps, net_g=net_g,
                          random_emotion_root=random_emotion_root)
    return (hps.data.sampling_rate, audio), rand_wav
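
# Note: runVits() rebuilds the network and reloads the checkpoint on every
# request. A minimal memoization sketch (hypothetical helper, not wired into
# the UI below) that would avoid the repeated load cost:
_model_cache = {}


def load_model_cached(config):
    """Build (hps, net_g) once per config name and reuse it afterwards."""
    if config not in _model_cache:
        config_file, checkpoint = configSelect(config)
        hps = utils.get_hparams_from_file(config_file)
        net_g = SynthesizerTrn(
            len(symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            **hps.model)
        net_g.eval()
        utils.load_checkpoint(checkpoint, net_g, None)
        _model_cache[config] = (hps, net_g)
    return _model_cache[config]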


def nineMul(name, txt):
    config = 'mul'
    audio, rand_wav = runVits(name, config, txt, 'random_sample')
    return "multiple model success", audio, rand_wav


def nineSingle(name, txt):
    config = 'single'
    audio, rand_wav = runVits(name, config, txt, 'random_sample')
    return "single model success", audio, rand_wav


def nineMul_select_emo(name, txt, emo):
    config = 'mul'
    print(emo)
    audio, _ = runVits(name, config, txt, emo)
    message = "emotion reference: " + emo + " synthesis success!"
    return message, audio


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("9nine multiple model"):
            character = gr.Radio(['九条都', '新海天', '结城希亚', '蕾娜', '索菲'], label='character',
                                 info="select the character you want")
            text = gr.TextArea(label="input content, Japanese support only",
                               value="祭りに行っただよね、知らない女の子と一緒にいて。")
            submit = gr.Button("generate", variant='primary')
            message = gr.Textbox(label="Message")
            audio = gr.Audio(label="output")
            emotion = gr.Textbox(label="emotion reference:")
            submit.click(nineMul, [character, text], [message, audio, emotion])
        with gr.TabItem("9nine single model"):
            character = gr.Radio(['新海天'], label='character',
                                 info="single model for 新海天 only")
            text = gr.TextArea(label="input content, Japanese support only",
                               value="祭りに行っただよね、知らない女の子と一緒にいて。")
            submit = gr.Button("generate", variant='primary')
            message = gr.Textbox(label="Message")
            audio = gr.Audio(label="output")
            emotion = gr.Textbox(label="emotion reference:")
            submit.click(nineSingle, [character, text], [message, audio, emotion])
        with gr.TabItem("Choose Emotion Embedding"):
            character = gr.Radio(['九条都', '新海天', '结城希亚', '蕾娜', '索菲'], label='character',
                                 info="select the character you want")
            text = gr.TextArea(label="input content, Japanese support only",
                               value="祭りに行っただよね、知らない女の子と一緒にいて。")
            emotion = gr.Textbox(label='emotion reference from the multi-speaker model, e.g. '
                                       '"./9nineEmo/sf/sf0207.wav.emo.npy"; prefer the character\'s '
                                       "own embeddings, as another character's emotion can bleed "
                                       "into the voice")
            submit = gr.Button("generate", variant='primary')
            message = gr.Textbox(label="Message")
            audio = gr.Audio(label="output")
            submit.click(nineMul_select_emo, [character, text, emotion], [message, audio])

app.launch()
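
# To expose the demo beyond localhost, launch() also accepts standard
# gradio options such as share=True (temporary public link) or
# server_name="0.0.0.0"; the original script uses the defaults.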