File size: 8,108 Bytes
0bc5811 5f25dab a7adc86 a36f6e8 43ef6e1 a36f6e8 43ef6e1 a36f6e8 d1de440 a36f6e8 d1de440 d220bdb d1de440 a7adc86 c434eb9 d220bdb d1de440 d220bdb a7adc86 d1de440 d220bdb d1de440 a7adc86 4019e19 a7adc86 35b4786 4019e19 0b7b2a0 a7adc86 0b7b2a0 a7adc86 d220bdb d1de440 b1a29ae 853d005 cfdd268 853d005 cfdd268 8c892cf a7adc86 0b7b2a0 a7adc86 3d64286 a7adc86 0b7b2a0 0d34cc5 7d4fd9a a36f6e8 d1de440 a36f6e8 d1de440 a7adc86 d1de440 43ef6e1 066e743 0bc5811 066e743 3d64286 a7adc86 0bc5811 43ef6e1 5bab0bb 43ef6e1 0abe5d6 43ef6e1 0bc5811 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
import ONNXVITS_models
import utils
from text import text_to_sequence
import torch
import commons
def get_text(text, hps):
    """Encode *text* into a LongTensor of symbol IDs using the hparams' cleaners.

    When ``hps.data.add_blank`` is set, a 0 (blank) token is interleaved
    between every symbol, matching the model's training-time input format.
    """
    seq = text_to_sequence(text, hps.symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        seq = commons.intersperse(seq, 0)
    return torch.LongTensor(seq)
# --- Script body: build the ONNX-exportable VITS synthesizer and run one inference ---

# Hyperparameters (data/train/model sections) from the project config file.
hps = utils.get_hparams_from_file("lovelive/config.json")
symbols = hps.symbols

# Model dims are derived from the config: spectrogram channels from the STFT
# filter length, segment frames from segment_size / hop_length.
net_g = ONNXVITS_models.SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
_ = net_g.eval()  # inference mode (disables dropout etc.)
_ = utils.load_checkpoint("lovelive/G_525000.pth", net_g)

# Single Japanese test sentence; the [JA]...[JA] tags presumably route the text
# through the Japanese cleaner — confirm against text_to_sequence/cleaners.
text1 = get_text("[JA]ありがとうございます。[JA]", hps)
stn_tst = text1
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)                          # add batch dimension
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])   # true sequence length
    sid = torch.tensor([0])                               # speaker id 0
    # Forward pass (also traces the graph for ONNX export in this model variant —
    # NOTE(review): inferred from the ONNXVITS_models name; verify).
    o = net_g(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
# NOTE(review): everything below is the disabled legacy Gradio web-app version of
# this script, kept inside a module-level string literal. It is never executed
# (the string is evaluated and discarded). Fixed while here: selection("三色绘恋2")
# set spk = 15 but was missing `return spk`, so it would have returned None.
'''
import romajitable
import re
import numpy as np
import logging
logging.getLogger('numba').setLevel(logging.WARNING)
import IPython.display as ipd
import torch
import commons
import utils
import ONNXVITS_infer
from text.symbols import symbols
from text import text_to_sequence
import gradio as gr
import time

def get_text(text, hps):
    text_norm = text_to_sequence(text, symbols, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

def selection(speaker):
    if speaker == "高咲侑":
        spk = 0
        return spk
    elif speaker == "歩夢":
        spk = 1
        return spk
    elif speaker == "かすみ":
        spk = 2
        return spk
    elif speaker == "しずく":
        spk = 3
        return spk
    elif speaker == "果林":
        spk = 4
        return spk
    elif speaker == "愛":
        spk = 5
        return spk
    elif speaker == "彼方":
        spk = 6
        return spk
    elif speaker == "せつ菜":
        spk = 7
        return spk
    elif speaker == "エマ":
        spk = 8
        return spk
    elif speaker == "璃奈":
        spk = 9
        return spk
    elif speaker == "栞子":
        spk = 10
        return spk
    elif speaker == "ランジュ":
        spk = 11
        return spk
    elif speaker == "ミア":
        spk = 12
        return spk
    elif speaker == "三色绘恋1":
        spk = 13
        return spk
    elif speaker == "三色绘恋2":
        spk = 15
        return spk
    elif speaker == "派蒙":
        spk = 16
        return spk

def is_japanese(string):
    for ch in string:
        if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
            return True
    return False

def is_english(string):
    import re
    pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
    if pattern.fullmatch(string):
        return True
    else:
        return False

def sle(language,tts_input0):
    if language == "中文":
        tts_input1 = "[ZH]" + tts_input0.replace('\n','。').replace(' ',',') + "[ZH]"
        return tts_input1
    if language == "自动":
        tts_input1 = f"[JA]{tts_input0}[JA]" if is_japanese(tts_input0) else f"[ZH]{tts_input0}[ZH]"
        return tts_input1
    elif language == "日文":
        tts_input1 = "[JA]" + tts_input0.replace('\n','。').replace(' ',',') + "[JA]"
        return tts_input1

def extrac(text):
    text = re.sub("<[^>]*>","",text)
    result_list = re.split(r'\n', text)
    final_list = []
    for i in result_list:
        if is_english(i):
            i = romajitable.to_kana(i).katakana
        i = i.replace('\n','').replace(' ','')
        if len(i)>1:
            if len(i) > 20:
                try:
                    cur_list = re.split(r'。', i)
                    for i in cur_list:
                        if len(i)>1:
                            final_list.append(i+'。')
                except:
                    pass
            final_list.append(i)
    final_list = [x for x in final_list if x != '']
    print(final_list)
    return final_list

def infer(language,text,speaker_id, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
    speaker_id = int(selection(speaker_id))
    a = ['【','[','(','(']
    b = ['】',']',')',')']
    for i in a:
        text = text.replace(i,'<')
    for i in b:
        text = text.replace(i,'>')
    final_list = extrac(text.replace('“','').replace('”',''))
    audio_fin = []
    c = 0
    for sentence in final_list:
        c +=1
        try:
            stn_tst = get_text(sle(language,sentence), hps_ms)
            with torch.no_grad():
                x_tst = stn_tst.unsqueeze(0).to(dev)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
                sid = torch.LongTensor([speaker_id]).to(dev)
                t1 = time.time()
                audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
                t2 = time.time()
                spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
                print(spending_time)
                audio_fin.append(audio)
        except:
            print('存在非法字符')
    return (hps_ms.data.sampling_rate, np.concatenate(audio_fin))

lan = ["中文","日文","自动"]
idols = ["高咲侑","歩夢","かすみ","しずく","果林","愛","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","派蒙"]
dev = torch.device("cpu")
hps_ms = utils.get_hparams_from_file("lovelive/config.json")
net_g_ms = ONNXVITS_infer.SynthesizerTrn(
    len(symbols),
    hps_ms.data.filter_length // 2 + 1,
    hps_ms.train.segment_size // hps_ms.data.hop_length,
    n_speakers=hps_ms.data.n_speakers,
    **hps_ms.model)
_ = net_g_ms.eval()
_ = utils.load_checkpoint("lovelive/G_525000.pth", net_g_ms)

app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("虹团vits模型,现可按句分割实现长文本合成,onnx导出后存在质量损失,建议本地运行vits模型"):
            tts_input1 = gr.TextArea(label="去标贝新模型,老版本在lovelive文件夹中", value="数千怀言者已经为你集结,列队在通往主舰桥的过道上。他们歌唱着你们名字,高声呼喊,以一种原始的、咆哮般的合唱作为对你的致敬。你从他们中间走过,一边点头,一边接受他们的赞美,你沉溺其中,几乎被他们巨大的音量所震撼。\n他们之中没有一个胆敢直视你。没有一个能够承受。你对他们超人类的眼睛来说都太过光辉。从他们中间走过时,你巨大的影子从他们身上掠过,他们立时将目光挪开,眼含泪水,吟诵你的大名时甚至不敢看你一眼。他们的吟唱中含有愤怒。几乎是疯狂的绝望。那感觉就好像他们害怕停下来,害怕自己会喘息停顿,好像尖叫出你的名字是唯一能让他们活着的事情。\n或许确实如此。作为对他们崇拜的回应,你谦虚地抬抬手,随后走进主舰桥。\nI In a word, Horus is a joker.")
            language = gr.Dropdown(label="选择语言,目前勉强可以做到自动识别",choices=lan, value="自动", interactive=True)
            para_input1 = gr.Slider(minimum= 0.01,maximum=1.0,label="更改噪声比例,以控制情感", value=0.667)
            para_input2 = gr.Slider(minimum= 0.01,maximum=1.0,label="更改噪声偏差,以控制音素长短", value=0.7)
            para_input3 = gr.Slider(minimum= 0.1,maximum=10,label="更改时间比例", value=1)
            tts_submit = gr.Button("Generate", variant="primary")
            speaker1 = gr.Dropdown(label="选择说话人",choices=idols, value="歩夢", interactive=True)
            tts_output2 = gr.Audio(label="Output")
            tts_submit.click(infer, [language,tts_input1,speaker1,para_input1,para_input2,para_input3], [tts_output2])
#app.launch(share=True)
app.launch()
'''