Mahiruoshi committed
Commit 84b0af6 • 1 Parent(s): 49849fc
Update app.py

app.py CHANGED
@@ -14,170 +14,246 @@ import gradio as gr
 import time
 import datetime
 import os
-[... old lines 17-118: removed content not preserved in this view ...]
-        else:
-            final_list.append(i)
-    final_list = [x for x in final_list if x != '']
-    print(final_list)
-    return final_list
-def infer(text, language, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
-    speaker_id = int(selection(speaker_id))
-    a = ['【','[','(','(']
-    b = ['】',']',')',')']
-    for i in a:
-        text = text.replace(i,'<')
-    for i in b:
-        text = text.replace(i,'>')
-    final_list = extrac(text.replace('“','').replace('”',''))
-    audio_fin = []
-    c = 0
-    t = datetime.timedelta(seconds=0)
-    f1 = open("subtitles.srt", 'w', encoding='utf-8')
-    for sentence in final_list:
-        c += 1
-        stn_tst = get_text(sle(language, sentence), hps_ms)
-        with torch.no_grad():
-            x_tst = stn_tst.unsqueeze(0).to(dev)
-            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
-            sid = torch.LongTensor([speaker_id]).to(dev)
-            t1 = time.time()
-[... old lines 145-181: removed content not preserved in this view ...]
-            )
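+# Gradio front-end for a multi-speaker VITS synthesizer. Assumes torch, numpy (np), re,
+# romajitable and the VITS modules (utils, commons, SynthesizerTrn, symbols, text_to_sequence)
+# are imported above this hunk alongside gradio (gr); model folders are discovered under checkpoints/.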
+class VitsGradio:
+    def __init__(self):
+        self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        self.lan = ["中文","日文","自动","手动"]
+        self.idols = ["歩夢","かすみ","しずく","果林","愛","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","派蒙"]
+        self.modelPaths = []
+        for root, dirs, files in os.walk("checkpoints"):
+            for dir in dirs:
+                self.modelPaths.append(dir)
+        with gr.Blocks() as self.Vits:
+            gr.Markdown(
+                "## <center> Lovelive虹团中日双语VITS\n"
+                "### <center> 请不要生成会对个人以及企划造成侵害的内容\n"
+                "<div align='center'>目前有标贝普通话版,去标贝版,少歌模型目前还是大饼状态</div>"
+                '<div align="center"><a>参数说明:由于爱抖露们过于有感情,合成日语时建议将噪声比例调节至0.2-0.3区间,噪声偏差对应着每个字之间的间隔,对普通话影响较大,duration代表整体语速</div>'
+                '<div align="center"><a>合成前请先选择模型,否则第一次合成不一定成功。长段落/小说合成建议colab或本地运行</div>')
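+            # Gist of the notice above (Chinese): Lovelive Nijigasaki bilingual (zh/ja) VITS; do not
+            # generate content that harms the performers or the franchise. For Japanese, set noise
+            # scale to 0.2-0.3; noise scale w controls per-character spacing (strongest effect on
+            # Mandarin); duration is overall speed. Select a model first or the first run may fail.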
+            with gr.Tab("TTS合成"):
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Row():
+                            with gr.Column():
+                                input1 = gr.TextArea(label="Text", value="为什么你会那么熟练啊?你和雪菜亲过多少次了")
+                                input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
+                                input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
+                                btnVC = gr.Button("Submit")
+                            with gr.Column():
+                                input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
+                                input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
+                                input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
+                                output1 = gr.Audio(label="采样率22050")
+                btnVC.click(self.infer, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1])
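+                # Click wiring follows infer()'s signature: input4 -> noise_scale (emotion),
+                # input5 -> noise_scale_w (phoneme length), input6 -> length_scale (overall speed).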
+            with gr.Tab("选择模型"):
+                with gr.Column():
+                    modelstrs = gr.Dropdown(label="模型", choices=self.modelPaths, value=self.modelPaths[0], type="value")
+                    btnMod = gr.Button("载入模型")
+                    statusa = gr.TextArea()
+                    btnMod.click(self.loadCk, inputs=[modelstrs], outputs=[statusa])
+            with gr.Tab("小说合成(带字幕)"):
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Row():
+                            with gr.Column():
+                                input1 = gr.TextArea(label="建议colab或本地克隆后运行本仓库", value="为什么你会那么熟练啊?你和雪菜亲过多少次了")
+                                input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
+                                input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
+                                btnVC = gr.Button("Submit")
+                            with gr.Column():
+                                input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
+                                input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
+                                input6 = gr.Slider(minimum=0.1, maximum=10, label="Duration", value=1)
+                                output1 = gr.Audio(label="采样率22050")
+                                subtitle = gr.outputs.File(label="字幕文件:subtitles.srt")
+                btnVC.click(self.infer2, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1, subtitle])
+
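+    # Build the synthesizer from checkpoints/<path>/config.json and load checkpoints/<path>/model.pth.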
+    def loadCk(self, path):
+        self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
+        self.net_g = SynthesizerTrn(
+            len(symbols),
+            self.hps.data.filter_length // 2 + 1,
+            self.hps.train.segment_size // self.hps.data.hop_length,
+            n_speakers=self.hps.data.n_speakers,
+            **self.hps.model).to(self.dev)
+        _ = self.net_g.eval()
+        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.net_g)
+        return "success"
+
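+    # Convert text to a tensor of symbol IDs, interspersing blanks when the config requests them.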
+    def get_text(self, text):
+        text_norm = text_to_sequence(text, self.hps.data.text_cleaners)
+        if self.hps.data.add_blank:
+            text_norm = commons.intersperse(text_norm, 0)
+        text_norm = torch.LongTensor(text_norm)
+        return text_norm
+
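+    # Treat any code point in the kana range (U+3040-U+30FF, hiragana and katakana) as Japanese.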
+    def is_japanese(self, string):
+        for ch in string:
+            if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
+                return True
+        return False
+
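+    # "English" here means the whole string is ASCII letters, digits and basic punctuation.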
+    def is_english(self, string):
+        import re
+        pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
+        if pattern.fullmatch(string):
+            return True
+        else:
+            return False
+
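+    # Map a display name to the model's speaker ID; the mapping is checkpoint-specific
+    # (note 派蒙 maps to 16, not 13), and unknown names fall through to None.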
+    def selection(self, speaker):
+        if speaker == "高咲侑":
+            spk = 0
+            return spk
+        elif speaker == "歩夢":
+            spk = 1
+            return spk
+        elif speaker == "かすみ":
+            spk = 2
+            return spk
+        elif speaker == "しずく":
+            spk = 3
+            return spk
+        elif speaker == "果林":
+            spk = 4
+            return spk
+        elif speaker == "愛":
+            spk = 5
+            return spk
+        elif speaker == "彼方":
+            spk = 6
+            return spk
+        elif speaker == "せつ菜":
+            spk = 7
+            return spk
+        elif speaker == "エマ":
+            spk = 8
+            return spk
+        elif speaker == "璃奈":
+            spk = 9
+            return spk
+        elif speaker == "栞子":
+            spk = 10
+            return spk
+        elif speaker == "ランジュ":
+            spk = 11
+            return spk
+        elif speaker == "ミア":
+            spk = 12
+            return spk
+        elif speaker == "派蒙":
+            spk = 16
+            return spk
+
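+    # Wrap the text in [ZH]/[JA]/[EN] cleaner tags; "自动" decides via is_japanese, "手动" leaves
+    # the text untouched so tags can be written by hand.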
+    def sle(self, language, text):
+        text = text.replace('\n','。').replace(' ',',')
+        if language == "中文":
+            tts_input1 = "[ZH]" + text + "[ZH]"
+            return tts_input1
+        elif language == "自动":
+            tts_input1 = f"[JA]{text}[JA]" if self.is_japanese(text) else f"[ZH]{text}[ZH]"
+            return tts_input1
+        elif language == "日文":
+            tts_input1 = "[JA]" + text + "[JA]"
+            return tts_input1
+        elif language == "英文":
+            tts_input1 = "[EN]" + text + "[EN]"
+            return tts_input1
+        elif language == "手动":
+            return text
+
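+    # Split input into subtitle-sized sentences: strip <...> spans, kana-ize pure-ASCII lines via
+    # romajitable, then re-split anything longer than 20 characters at 。 or !.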
+    def extrac(self, text):
+        text = re.sub("<[^>]*>", "", text)
+        result_list = re.split(r'\n', text)
+        final_list = []
+        for i in result_list:
+            if self.is_english(i):
+                i = romajitable.to_kana(i).katakana
+            i = i.replace('\n','').replace(' ','')
+            # Current length of single sentence: 20
+            if len(i) > 1:
+                if len(i) > 20:
+                    try:
+                        cur_list = re.split(r'。|!', i)
+                        for i in cur_list:
+                            if len(i) > 1:
+                                final_list.append(i + '。')
+                    except:
+                        pass
+                else:
+                    final_list.append(i)
+        final_list = [x for x in final_list if x != '']
+        print(final_list)
+        return final_list
+
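+    # One-shot synthesis. On any failure (typically no model loaded yet) the except branch
+    # reloads the default biaobei checkpoint but returns no audio, so the user must click again.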
+    def infer(self, text, language, speaker_id, n_scale=0.667, n_scale_w=0.8, l_scale=1):
+        try:
+            speaker_id = int(self.selection(speaker_id))
+            t1 = time.time()
+            stn_tst = self.get_text(self.sle(language, text))
+            with torch.no_grad():
+                x_tst = stn_tst.unsqueeze(0).to(self.dev)
+                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
+                sid = torch.LongTensor([speaker_id]).to(self.dev)
+                audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+            t2 = time.time()
+            spending_time = "推理时间为:" + str(t2 - t1) + "s"
+            print(spending_time)
+            return (self.hps.data.sampling_rate, audio)
+        except:
+            self.hps = utils.get_hparams_from_file("checkpoints/biaobei/config.json")
+            self.net_g = SynthesizerTrn(
+                len(symbols),
+                self.hps.data.filter_length // 2 + 1,
+                self.hps.train.segment_size // self.hps.data.hop_length,
+                n_speakers=self.hps.data.n_speakers,
+                **self.hps.model).to(self.dev)
+            _ = self.net_g.eval()
+            _ = utils.load_checkpoint("checkpoints/biaobei/model.pth", self.net_g)
+
+
def infer2(self, text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
|
222 |
+
speaker_id = int(self.selection(speaker_id))
|
223 |
+
a = ['【','[','(','(']
|
224 |
+
b = ['】',']',')',')']
|
225 |
+
for i in a:
|
226 |
+
text = text.replace(i,'<')
|
227 |
+
for i in b:
|
228 |
+
text = text.replace(i,'>')
|
229 |
+
final_list = self.extrac(text.replace('“','').replace('”',''))
|
230 |
+
audio_fin = []
|
231 |
+
c = 0
|
232 |
+
t = datetime.timedelta(seconds=0)
|
233 |
+
f1 = open("subtitles.srt",'w',encoding='utf-8')
|
234 |
+
for sentence in final_list:
|
235 |
+
c +=1
|
236 |
+
stn_tst = self.get_text(self.sle(language,text))
|
237 |
+
with torch.no_grad():
|
238 |
+
x_tst = stn_tst.unsqueeze(0).to(self.dev)
|
239 |
+
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
|
240 |
+
sid = torch.LongTensor([speaker_id]).to(self.dev)
|
241 |
+
t1 = time.time()
|
242 |
+
audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
|
243 |
+
t2 = time.time()
|
244 |
+
spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
|
245 |
+
print(spending_time)
|
246 |
+
time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
|
247 |
+
last_time = datetime.timedelta(seconds=len(audio)/float(22050))
|
248 |
+
t+=last_time
|
249 |
+
time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
|
250 |
+
print(time_end)
|
251 |
+
f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
|
252 |
+
audio_fin.append(audio)
|
253 |
+
file_path = "subtitles.srt"
|
254 |
+
return (self.hps.data.sampling_rate, np.concatenate(audio_fin)),file_path
|
255 |
+
|
+
+grVits = VitsGradio()
+grVits.Vits.launch()
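+
+# Checkpoint layout implied by the paths above:
+#   checkpoints/<model_name>/config.json
+#   checkpoints/<model_name>/model.pth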