Mahiruoshi committed on
Commit
84b0af6
1 Parent(s): 49849fc

Update app.py

Files changed (1)
  1. app.py +235 -159
app.py CHANGED
@@ -14,170 +14,246 @@ import gradio as gr
 import time
 import datetime
 import os
-def get_text(text, hps):
-    text_norm = text_to_sequence(text, symbols, hps.data.text_cleaners)
-    if hps.data.add_blank:
-        text_norm = commons.intersperse(text_norm, 0)
-    text_norm = torch.LongTensor(text_norm)
-    return text_norm
-dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-def selection(speaker):
-    if speaker == "高咲侑":
-        spk = 0
-        return spk

-    elif speaker == "歩夢":
-        spk = 1
-        return spk

-    elif speaker == "かすみ":
-        spk = 2
-        return spk

-    elif speaker == "しずく":
-        spk = 3
-        return spk

-    elif speaker == "果林":
-        spk = 4
-        return spk

-    elif speaker == "愛":
-        spk = 5
-        return spk

-    elif speaker == "彼方":
-        spk = 6
-        return spk

-    elif speaker == "せつ菜":
-        spk = 7
-        return spk
-    elif speaker == "エマ":
-        spk = 8
-        return spk
-    elif speaker == "璃奈":
-        spk = 9
-        return spk
-    elif speaker == "栞子":
-        spk = 10
-        return spk
-    elif speaker == "ランジュ":
-        spk = 11
-        return spk
-    elif speaker == "ミア":
-        spk = 12
-        return spk
-    elif speaker == "三色绘恋1":
-        spk = 13
-        return spk
-    elif speaker == "三色绘恋2":
-        spk = 15
-    elif speaker == "派蒙":
-        spk = 16
-        return spk
-def is_japanese(string):
-    for ch in string:
-        if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
-            return True
-    return False
-def is_english(string):
-    import re
-    pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
-    if pattern.fullmatch(string):
-        return True
-    else:
-        return False
-def sle(language,tts_input0):
-    if language == "中文":
-        tts_input1 = "[ZH]" + tts_input0.replace('\n','。').replace(' ',',') + "[ZH]"
-        return tts_input1
-    if language == "自动":
-        tts_input1 = f"[JA]{tts_input0}[JA]" if is_japanese(tts_input0) else f"[ZH]{tts_input0}[ZH]"
-        return tts_input1
-    elif language == "日文":
-        tts_input1 = "[JA]" + tts_input0.replace('\n','').replace(' ',',') + "[JA]"
-        return tts_input1
-def extrac(text):
-    text = re.sub("<[^>]*>","",text)
-    result_list = re.split(r'\n', text)
-    final_list = []
-    for i in result_list:
-        if is_english(i):
-            i = romajitable.to_kana(i).katakana
-        i = i.replace('\n','').replace(' ','')
-        #Current length of single sentence: 20
-        if len(i)>1:
-            if len(i) > 20:
-                try:
-                    cur_list = re.split(r'。|!', i)
-                    for i in cur_list:
-                        if len(i)>1:
-                            final_list.append(i+'。')
-                except:
-                    pass
-            else:
-                final_list.append(i)
-    final_list = [x for x in final_list if x != '']
-    print(final_list)
-    return final_list
-def infer(text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
-    speaker_id = int(selection(speaker_id))
-    a = ['【','[','(','(']
-    b = ['】',']',')',')']
-    for i in a:
-        text = text.replace(i,'<')
-    for i in b:
-        text = text.replace(i,'>')
-    final_list = extrac(text.replace('“','').replace('”',''))
-    audio_fin = []
-    c = 0
-    t = datetime.timedelta(seconds=0)
-    f1 = open("subtitles.srt",'w',encoding='utf-8')
-    for sentence in final_list:
-        c +=1
-        stn_tst = get_text(sle(language,sentence), hps_ms)
-        with torch.no_grad():
-            x_tst = stn_tst.unsqueeze(0).to(dev)
-            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
-            sid = torch.LongTensor([speaker_id]).to(dev)
             t1 = time.time()
-            audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
-            t2 = time.time()
-            spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
-            print(spending_time)
-            time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
-            last_time = datetime.timedelta(seconds=len(audio)/float(22050))
-            t+=last_time
-            time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
-            print(time_end)
-            f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
-            audio_fin.append(audio)
-    file_path = "subtitles.srt"
-    return (hps_ms.data.sampling_rate, np.concatenate(audio_fin)),file_path
-lan = ["中文","日文","自动"]
-idols = ["高咲侑","歩夢","かすみ","しずく","果林","愛","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","派蒙"]
-hps_ms = utils.get_hparams_from_file("lovelive/config.json")
-net_g_ms = SynthesizerTrn(
-    len(symbols),
-    hps_ms.data.filter_length // 2 + 1,
-    hps_ms.train.segment_size // hps_ms.data.hop_length,
-    n_speakers=hps_ms.data.n_speakers,
-    **hps_ms.model).to(dev)
-_ = net_g_ms.eval()
-_ = utils.load_checkpoint("lovelive/G_936000.pth", net_g_ms)
-inputs = [gr.TextArea(label="如需实现快速合成,建议在colab上克隆后运行本仓库", value="为什么你会那么熟练啊?你和雪菜亲过多少次了?我想做只属于你一个人的学院偶像,所以,请只注视我一个人,好吗?【中文】\nなんでそんなに慣れてんだよっ?せつ菜と…何回キスしたんだよ?どこまであたしを置いてきぼりにすれば気が済むんだよ?[日文]\nI can't choose just one(English)"),
-        gr.Dropdown(label="选择语言,目前勉强可以做到自动识别",choices=lan, value="自动", interactive=True),
-        gr.Dropdown(label="选择说话人",choices=idols, value="歩夢", interactive=True),
-        gr.Slider(minimum= 0,maximum=1.0,label="更改噪声比例,以控制情感", value=0.267),
-        gr.Slider(minimum= 0,maximum=1.0,label="更改噪声偏差,以控制音素长短", value=0.7),
-        gr.Slider(minimum= 0.1,maximum=10,label="更改时间比例", value=1)]
-outputs=[gr.Audio(label="采样率22050"), gr.outputs.File(label="字幕文件:subtitles.srt")]
-iface = gr.Interface(
-    fn=infer,
-    inputs=inputs,
-    outputs=outputs,
-    title="Vits",
-    description="虹团11人模型",
-)
-iface.launch()
 import time
 import datetime
 import os
+class VitsGradio:
+    def __init__(self):
+        self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        self.lan = ["中文","日文","自动","手动"]
+        self.idols = ["歩夢","かすみ","しずく","果林","愛","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","派蒙"]
+        self.modelPaths = []
+        for root,dirs,files in os.walk("checkpoints"):
+            for dir in dirs:
+                self.modelPaths.append(dir)
+        with gr.Blocks() as self.Vits:
+            gr.Markdown(
+                "## <center> Lovelive虹团中日双语VITS\n"
+                "### <center> 请不要生成会对个人以及企划造成侵害的内容\n"
+                "<div align='center'>目前有标贝普通话版,去标贝版,少歌模型目前还是大饼状态</div>"
+                '<div align="center"><a>参数说明:由于爱抖露们过于有感情,合成日语时建议将噪声比例调节至0.2-0.3区间,噪声偏差对应着每个字之间的间隔,对普通话影响较大,duration代表整体语速</div>'
+                '<div align="center"><a>合成前请先选择模型,否则第一次合成不一定成功。长段落/小说合成建议colab或本地运行</div>')
+            with gr.Tab("TTS合成"):
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Row():
+                            with gr.Column():
+                                input1 = gr.TextArea(label="Text", value="为什么你会那么熟练啊?你和雪菜亲过多少次了")
+                                input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
+                                input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
+                                btnVC = gr.Button("Submit")
+                            with gr.Column():
+                                input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
+                                input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
+                                input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
+                                output1 = gr.Audio(label="采样率22050")
+                btnVC.click(self.infer, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1])
+            with gr.Tab("选择模型"):
+                with gr.Column():
+                    modelstrs = gr.Dropdown(label = "模型", choices = self.modelPaths, value = self.modelPaths[0], type = "value")
+                    btnMod = gr.Button("载入模型")
+                    statusa = gr.TextArea()
+                    btnMod.click(self.loadCk, inputs=[modelstrs], outputs = [statusa])
+            with gr.Tab("小说合成(带字幕)"):
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Row():
+                            with gr.Column():
+                                input1 = gr.TextArea(label="建议colab或本地克隆后运行本仓库", value="为什么你会那么熟练啊?你和雪菜亲过多少次了")
+                                input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
+                                input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
+                                btnVC = gr.Button("Submit")
+                            with gr.Column():
+                                input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
+                                input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
+                                input6 = gr.Slider(minimum=0.1, maximum=10, label="Duration", value=1)
+                                output1 = gr.Audio(label="采样率22050")
+                                subtitle = gr.outputs.File(label="字幕文件:subtitles.srt")
+                btnVC.click(self.infer2, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1,subtitle])
+
+    def loadCk(self,path):
+        self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
+        self.net_g = SynthesizerTrn(
+            len(symbols),
+            self.hps.data.filter_length // 2 + 1,
+            self.hps.train.segment_size // self.hps.data.hop_length,
+            n_speakers=self.hps.data.n_speakers,
+            **self.hps.model).to(self.dev)
+        _ = self.net_g.eval()
+        _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.net_g)
+        return "success"
+
+    def get_text(self,text):
+        text_norm = text_to_sequence(text, self.hps.data.text_cleaners)
+        if self.hps.data.add_blank:
+            text_norm = commons.intersperse(text_norm, 0)
+        text_norm = torch.LongTensor(text_norm)
+        return text_norm
+
+    def is_japanese(self,string):
+        for ch in string:
+            if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
+                return True
+        return False
+
+    def is_english(self,string):
+        import re
+        pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
+        if pattern.fullmatch(string):
+            return True
+        else:
+            return False
+
+    def selection(self,speaker):
+        if speaker == "高咲侑":
+            spk = 0
+            return spk

+        elif speaker == "歩夢":
+            spk = 1
+            return spk

+        elif speaker == "かすみ":
+            spk = 2
+            return spk

+        elif speaker == "しずく":
+            spk = 3
+            return spk

+        elif speaker == "果林":
+            spk = 4
+            return spk

+        elif speaker == "愛":
+            spk = 5
+            return spk

+        elif speaker == "彼方":
+            spk = 6
+            return spk

+        elif speaker == "せつ菜":
+            spk = 7
+            return spk
+        elif speaker == "エマ":
+            spk = 8
+            return spk
+        elif speaker == "璃奈":
+            spk = 9
+            return spk
+        elif speaker == "栞子":
+            spk = 10
+            return spk
+        elif speaker == "ランジュ":
+            spk = 11
+            return spk
+        elif speaker == "ミア":
+            spk = 12
+            return spk
+        elif speaker == "派蒙":
+            spk = 16
+            return spk
+
+    def sle(self,language,text):
+        text = text.replace('\n','。').replace(' ',',')
+        if language == "中文":
+            tts_input1 = "[ZH]" + text + "[ZH]"
+            return tts_input1
+        elif language == "自动":
+            tts_input1 = f"[JA]{text}[JA]" if self.is_japanese(text) else f"[ZH]{text}[ZH]"
+            return tts_input1
+        elif language == "日文":
+            tts_input1 = "[JA]" + text + "[JA]"
+            return tts_input1
+        elif language == "英文":
+            tts_input1 = "[EN]" + text + "[EN]"
+            return tts_input1
+        elif language == "手动":
+            return text
+
+    def extrac(self,text):
+        text = re.sub("<[^>]*>","",text)
+        result_list = re.split(r'\n', text)
+        final_list = []
+        for i in result_list:
+            if self.is_english(i):
+                i = romajitable.to_kana(i).katakana
+            i = i.replace('\n','').replace(' ','')
+            #Current length of single sentence: 20
+            if len(i)>1:
+                if len(i) > 20:
+                    try:
+                        cur_list = re.split(r'。|!', i)
+                        for i in cur_list:
+                            if len(i)>1:
+                                final_list.append(i+'。')
+                    except:
+                        pass
+                else:
+                    final_list.append(i)
+        final_list = [x for x in final_list if x != '']
+        print(final_list)
+        return final_list
+
+    def infer(self, text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
+        try:
+            speaker_id = int(self.selection(speaker_id))
             t1 = time.time()
+            stn_tst = self.get_text(self.sle(language,text))
+            with torch.no_grad():
+                x_tst = stn_tst.unsqueeze(0).to(self.dev)
+                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
+                sid = torch.LongTensor([speaker_id]).to(self.dev)
+                audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+            t2 = time.time()
+            spending_time = "推理时间为:"+str(t2-t1)+"s"
+            print(spending_time)
+            return (self.hps.data.sampling_rate, audio)
+        except:
+            self.hps = utils.get_hparams_from_file(f"checkpoints/biaobei/config.json")
+            self.net_g = SynthesizerTrn(
+                len(symbols),
+                self.hps.data.filter_length // 2 + 1,
+                self.hps.train.segment_size // self.hps.data.hop_length,
+                n_speakers=self.hps.data.n_speakers,
+                **self.hps.model).to(self.dev)
+            _ = self.net_g.eval()
+            _ = utils.load_checkpoint(f"checkpoints/biaobei/model.pth", self.net_g)
+
+    def infer2(self, text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
+        speaker_id = int(self.selection(speaker_id))
+        a = ['【','[','(','(']
+        b = ['】',']',')',')']
+        for i in a:
+            text = text.replace(i,'<')
+        for i in b:
+            text = text.replace(i,'>')
+        final_list = self.extrac(text.replace('“','').replace('”',''))
+        audio_fin = []
+        c = 0
+        t = datetime.timedelta(seconds=0)
+        f1 = open("subtitles.srt",'w',encoding='utf-8')
+        for sentence in final_list:
+            c +=1
+            stn_tst = self.get_text(self.sle(language,text))
+            with torch.no_grad():
+                x_tst = stn_tst.unsqueeze(0).to(self.dev)
+                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
+                sid = torch.LongTensor([speaker_id]).to(self.dev)
+                t1 = time.time()
+                audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+                t2 = time.time()
+                spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
+                print(spending_time)
+                time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
+                last_time = datetime.timedelta(seconds=len(audio)/float(22050))
+                t+=last_time
+                time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
+                print(time_end)
+                f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
+                audio_fin.append(audio)
+        file_path = "subtitles.srt"
+        return (self.hps.data.sampling_rate, np.concatenate(audio_fin)),file_path
+
+grVits = VitsGradio()
+
+grVits.Vits.launch()