Mahiruoshi committed on
Commit
3604243
1 Parent(s): c4466c2

Update app.py

Files changed (1)
  1. app.py +447 -330
app.py CHANGED
@@ -2,7 +2,7 @@ import logging
2
  logging.getLogger('numba').setLevel(logging.WARNING)
3
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
4
  logging.getLogger('urllib3').setLevel(logging.WARNING)
5
- import romajitable
6
  import re
7
  import numpy as np
8
  import IPython.display as ipd
@@ -16,251 +16,126 @@ import gradio as gr
16
  import time
17
  import datetime
18
  import os
 
 
 
19
  import librosa
20
  from mel_processing import spectrogram_torch
21
- class VitsGradio:
22
- def __init__(self):
23
- self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
24
- self.lan = ["中文","日文","自动","手动"]
25
- self.idols = ["c1","c2","高咲侑","歩夢","かすみ","しずく","果林","愛","彼方","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","華恋","まひる","なな","クロディーヌ","ひかり",'純那',"香子","真矢","双葉","ミチル","メイファン","やちよ","晶","いちえ","ゆゆ子","塁","珠緒","あるる","ララフィン","美空","静羽","あるる"]
26
- self.modelPaths = []
27
- for root,dirs,files in os.walk("checkpoints"):
28
- for dir in dirs:
29
- self.modelPaths.append(dir)
30
- with gr.Blocks() as self.Vits:
31
- gr.Markdown(
32
- "## <center> Lovelive虹团中日双语VITS\n"
33
- "### <center> 请不要生成会对个人以及企划造成侵害的内容\n"
34
- "<div align='center'>目前有标贝普通话版,去标贝版,少歌模型还是大饼状态</div>"
35
- '<div align="center"><a>参数说明:由于爱抖露们过于有感情,合成日语时建议将噪声比例调节至0.2-0.3区间,噪声偏差对应着每个字之间的间隔,对普通话影响较大,duration代表整体语速</div>'
36
- '<div align="center"><a>合成前请先选择模型,否则第一次合成不一定成功。长段落/小说合成建议colab或本地运行</div>')
37
- with gr.Tab("TTS合成"):
38
- with gr.Row():
39
- with gr.Column():
40
- with gr.Row():
41
- with gr.Column():
42
- input1 = gr.TextArea(label="Text", value="为什么你会那么熟练啊?你和雪菜亲过多少次了")
43
- input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
44
- input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
45
- btnVC = gr.Button("Submit")
46
- with gr.Column():
47
- input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
48
- input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
49
- input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
50
- output1 = gr.Audio(label="采样率22050")
51
- btnVC.click(self.infer, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1])
52
- with gr.Tab("选择模型"):
53
- with gr.Column():
54
- modelstrs = gr.Dropdown(label = "模型", choices = self.modelPaths, value = self.modelPaths[0], type = "value")
55
- btnMod = gr.Button("载入模型")
56
- statusa = gr.TextArea()
57
- btnMod.click(self.loadCk, inputs=[modelstrs], outputs = [statusa])
58
- with gr.Tab("Voice Conversion"):
59
- gr.Markdown("""
60
- 录制或上传声音,并选择要转换的音色。
61
- """)
62
- with gr.Column():
63
- record_audio = gr.Audio(label="record your voice", source="microphone")
64
- upload_audio = gr.Audio(label="or upload audio here", source="upload")
65
- source_speaker = gr.Dropdown(choices=self.idols, value="歩夢", label="source speaker")
66
- target_speaker = gr.Dropdown(choices=self.idols, value="歩夢", label="target speaker")
67
- with gr.Column():
68
- message_box = gr.Textbox(label="Message")
69
- converted_audio = gr.Audio(label='converted audio')
70
- btn = gr.Button("Convert!")
71
- btn.click(self.vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
72
- outputs=[message_box, converted_audio])
73
- with gr.Tab("小说合成(带字幕)"):
74
- with gr.Row():
75
- with gr.Column():
76
- with gr.Row():
77
- with gr.Column():
78
- input1 = gr.TextArea(label="建议colab或本地克隆后运行本仓库", value="为什么你会那么熟练啊?你和雪菜亲过多少次了")
79
- input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
80
- input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
81
- btnVC = gr.Button("Submit")
82
- with gr.Column():
83
- input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
84
- input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
85
- input6 = gr.Slider(minimum=0.1, maximum=10, label="Duration", value=1)
86
- output1 = gr.Audio(label="采样率22050")
87
- subtitle = gr.outputs.File(label="字幕文件:subtitles.srt")
88
- btnVC.click(self.infer2, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1,subtitle])
89
-
90
- def loadCk(self,path):
91
- self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
92
- self.net_g = SynthesizerTrn(
93
- len(symbols),
94
- self.hps.data.filter_length // 2 + 1,
95
- self.hps.train.segment_size // self.hps.data.hop_length,
96
- n_speakers=self.hps.data.n_speakers,
97
- **self.hps.model).to(self.dev)
98
- _ = self.net_g.eval()
99
- _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.net_g)
100
- return "success"
101
-
102
- def get_text(self,text):
103
- text_norm = text_to_sequence(text,self.hps.data.text_cleaners)
104
- if self.hps.data.add_blank:
105
- text_norm = commons.intersperse(text_norm, 0)
106
- text_norm = torch.LongTensor(text_norm)
107
- return text_norm
108
-
109
- def is_japanese(self,string):
110
  for ch in string:
111
  if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
112
  return True
113
  return False
114
-
115
- def is_english(self,string):
116
  import re
117
  pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
118
  if pattern.fullmatch(string):
119
  return True
120
  else:
121
  return False
122
-
123
- def selection(self,speaker):
124
- if speaker == "高咲侑":
125
- spk = 0
126
- return spk
127
-
128
- elif speaker == "歩夢":
129
- spk = 1
130
- return spk
131
-
132
- elif speaker == "かすみ":
133
- spk = 2
134
- return spk
135
 
136
- elif speaker == "しずく":
137
- spk = 3
138
- return spk
139
-
140
- elif speaker == "果林":
141
- spk = 4
142
- return spk
143
-
144
- elif speaker == "愛":
145
- spk = 5
146
- return spk
147
 
148
- elif speaker == "彼方":
149
- spk = 6
150
- return spk
151
 
152
- elif speaker == "せつ菜":
153
- spk = 7
154
- return spk
155
- elif speaker == "エマ":
156
- spk = 8
157
- return spk
158
- elif speaker == "璃奈":
159
- spk = 9
160
- return spk
161
- elif speaker == "栞子":
162
- spk = 10
163
- return spk
164
- elif speaker == "ランジュ":
165
- spk = 11
166
- return spk
167
- elif speaker == "ミア":
168
- spk = 12
169
- return spk
170
-
171
- elif speaker == "派蒙":
172
- spk = 16
173
- return spk
174
-
175
- elif speaker == "c1":
176
- spk = 18
177
- return spk
178
 
179
- elif speaker == "c2":
180
- spk = 19
181
- return spk
182
 
183
- elif speaker == "華恋":
184
- spk = 21
185
- return spk
 
 
186
 
187
- elif speaker == "まひる":
188
- spk = 22
189
- return spk
190
-
191
- elif speaker == "なな":
192
- spk = 23
193
- return spk
194
-
195
- elif speaker == "クロディーヌ":
196
- spk = 24
197
- return spk
198
-
199
- elif speaker == "ひかり":
200
- spk = 25
201
- return spk
202
-
203
- elif speaker == "純那":
204
- spk = 26
205
- return spk
206
-
207
- elif speaker == "香子":
208
- spk = 27
209
- return spk
210
-
211
- elif speaker == "真矢":
212
- spk = 28
213
- return spk
214
- elif speaker == "双葉":
215
- spk = 29
216
- return spk
217
- elif speaker == "ミチル":
218
- spk = 30
219
- return spk
220
- elif speaker == "メイファン":
221
- spk = 31
222
- return spk
223
- elif speaker == "やちよ":
224
- spk = 32
225
- return spk
226
- elif speaker == "晶":
227
- spk = 33
228
- return spk
229
- elif speaker == "いちえ":
230
- spk = 34
231
- return spk
232
- elif speaker == "ゆゆ子":
233
- spk = 35
234
- return spk
235
- elif speaker == "塁":
236
- spk = 36
237
- return spk
238
- elif speaker == "珠緒":
239
- spk = 37
240
- return spk
241
- elif speaker == "あるる":
242
- spk = 38
243
- return spk
244
- elif speaker == "ララフィン":
245
- spk = 39
246
- return spk
247
- elif speaker == "美空":
248
- spk = 40
249
- return spk
250
- elif speaker == "静羽":
251
- spk = 41
252
- return spk
253
- else:
254
- return 0
255
-
256
-
257
- def sle(self,language,text):
258
- text = text.replace('\n','。').replace(' ',',')
259
  if language == "中文":
260
  tts_input1 = "[ZH]" + text + "[ZH]"
261
  return tts_input1
262
  elif language == "自动":
263
- tts_input1 = f"[JA]{text}[JA]" if self.is_japanese(text) else f"[ZH]{text}[ZH]"
264
  return tts_input1
265
  elif language == "日文":
266
  tts_input1 = "[JA]" + text + "[JA]"
@@ -270,119 +145,361 @@ class VitsGradio:
270
  return tts_input1
271
  elif language == "手动":
272
  return text
273
 
274
- def extrac(self,text):
275
- text = re.sub("<[^>]*>","",text)
276
- result_list = re.split(r'\n', text)
277
- final_list = []
278
- for i in result_list:
279
- if self.is_english(i):
280
- i = romajitable.to_kana(i).katakana
281
- i = i.replace('\n','').replace(' ','')
282
- #Current length of single sentence: 20
283
- if len(i)>1:
284
- if len(i) > 20:
285
- try:
286
- cur_list = re.split(r'。|!', i)
287
- for i in cur_list:
288
- if len(i)>1:
289
- final_list.append(i+'。')
290
- except:
291
- pass
292
- else:
293
- final_list.append(i)
294
- final_list = [x for x in final_list if x != '']
295
- print(final_list)
296
- return final_list
297
 
298
- def vc_fn(self,original_speaker, target_speaker, record_audio, upload_audio):
299
- input_audio = record_audio if record_audio is not None else upload_audio
300
- if input_audio is None:
301
- return "You need to record or upload an audio", None
302
- sampling_rate, audio = input_audio
303
- original_speaker_id = self.selection(original_speaker)
304
- target_speaker_id = self.selection(target_speaker)
305
 
306
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
307
- if len(audio.shape) > 1:
308
- audio = librosa.to_mono(audio.transpose(1, 0))
309
- if sampling_rate != self.hps.data.sampling_rate:
310
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=self.hps.data.sampling_rate)
311
- with torch.no_grad():
312
- y = torch.FloatTensor(audio)
313
- y = y / max(-y.min(), y.max()) / 0.99
314
- y = y.to(self.dev)
315
- y = y.unsqueeze(0)
316
- spec = spectrogram_torch(y, self.hps.data.filter_length,
317
- self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
318
- center=False).to(self.dev)
319
- spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.dev)
320
- sid_src = torch.LongTensor([original_speaker_id]).to(self.dev)
321
- sid_tgt = torch.LongTensor([target_speaker_id]).to(self.dev)
322
- audio = self.net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
323
- 0, 0].data.cpu().float().numpy()
324
- del y, spec, spec_lengths, sid_src, sid_tgt
325
- return "Success", (self.hps.data.sampling_rate, audio)
326
 
327
- def infer(self, text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
328
- try:
329
- speaker_id = int(self.selection(speaker_id))
330
  t1 = time.time()
331
- stn_tst = self.get_text(self.sle(language,text))
332
  with torch.no_grad():
333
- x_tst = stn_tst.unsqueeze(0).to(self.dev)
334
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
335
- sid = torch.LongTensor([speaker_id]).to(self.dev)
336
- audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
337
  t2 = time.time()
338
  spending_time = "推理时间为:"+str(t2-t1)+"s"
339
  print(spending_time)
340
- return (self.hps.data.sampling_rate, audio)
341
- except:
342
- self.hps = utils.get_hparams_from_file(f"checkpoints/biaobei/config.json")
343
- self.net_g = SynthesizerTrn(
344
- len(symbols),
345
- self.hps.data.filter_length // 2 + 1,
346
- self.hps.train.segment_size // self.hps.data.hop_length,
347
- n_speakers=self.hps.data.n_speakers,
348
- **self.hps.model).to(self.dev)
349
- _ = self.net_g.eval()
350
- _ = utils.load_checkpoint(f"checkpoints/biaobei/model.pth", self.net_g)
351
 
352
- def infer2(self, text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
353
- speaker_id = int(self.selection(speaker_id))
354
- a = ['【','[','(','(']
355
- b = ['】',']',')',')']
356
- for i in a:
357
- text = text.replace(i,'<')
358
- for i in b:
359
- text = text.replace(i,'>')
360
- final_list = self.extrac(text.replace('“','').replace('”',''))
361
- audio_fin = []
362
- c = 0
363
- t = datetime.timedelta(seconds=0)
364
- f1 = open("subtitles.srt",'w',encoding='utf-8')
365
- for sentence in final_list:
366
- c +=1
367
- stn_tst = self.get_text(self.sle(language,sentence))
368
- with torch.no_grad():
369
- x_tst = stn_tst.unsqueeze(0).to(self.dev)
370
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
371
- sid = torch.LongTensor([speaker_id]).to(self.dev)
372
- t1 = time.time()
373
- audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
374
- t2 = time.time()
375
- spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
376
- print(spending_time)
377
- time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
378
- last_time = datetime.timedelta(seconds=len(audio)/float(22050))
379
- t+=last_time
380
- time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
381
- print(time_end)
382
- f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
383
- audio_fin.append(audio)
384
- file_path = "subtitles.srt"
385
- return (self.hps.data.sampling_rate, np.concatenate(audio_fin)),file_path
386
- print("开始部署")
387
- grVits = VitsGradio()
388
- grVits.Vits.launch()
2
  logging.getLogger('numba').setLevel(logging.WARNING)
3
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
4
  logging.getLogger('urllib3').setLevel(logging.WARNING)
5
+ import json
+ import romajitable  # still required: extrac() below converts pure-English lines to kana with it
6
  import re
7
  import numpy as np
8
  import IPython.display as ipd
 
16
  import time
17
  import datetime
18
  import os
19
+ import pickle
20
+ import openai
21
+ from scipy.io.wavfile import write
22
  import librosa
23
  from mel_processing import spectrogram_torch
24
+ def is_japanese(string):
25
  for ch in string:
26
  if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
27
  return True
28
  return False
29
+
30
+ def is_english(string):
31
  import re
32
  pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
33
  if pattern.fullmatch(string):
34
  return True
35
  else:
36
  return False
37
 
38
+ def to_html(chat_history):
39
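+ # Render the chat history as HTML bubbles: user turns right-aligned on green, assistant turns left-aligned on white, inside a scrollable 400px container.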
+ chat_html = ""
40
+ for item in chat_history:
41
+ if item['role'] == 'user':
42
+ chat_html += f"""
43
+ <div style="margin-bottom: 20px;">
44
+ <div style="text-align: right; margin-right: 20px;">
45
+ <span style="background-color: #4CAF50; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
46
+ {item['content']}
47
+ </span>
48
+ </div>
49
+ </div>
50
+ """
51
+ else:
52
+ chat_html += f"""
53
+ <div style="margin-bottom: 20px;">
54
+ <div style="text-align: left; margin-left: 20px;">
55
+ <span style="background-color: white; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
56
+ {item['content']}
57
+ </span>
58
+ </div>
59
+ </div>
60
+ """
61
+ output_html = f"""
62
+ <div style="height: 400px; overflow-y: scroll; padding: 10px;">
63
+ {chat_html}
64
+ </div>
65
+ """
66
+ return output_html
67
 
68
+ def extrac(text):
69
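+ # Split the input into sentences for per-sentence synthesis: strip <...> markup, kana-ise pure-English lines, and break anything longer than 20 characters at 。/!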
+ text = re.sub("<[^>]*>","",text)
70
+ result_list = re.split(r'\n', text)
71
+ final_list = []
72
+ for i in result_list:
73
+ if is_english(i):
74
+ i = romajitable.to_kana(i).katakana
75
+ i = i.replace('\n','').replace(' ','')
76
+ #Current length of single sentence: 20
77
+ if len(i)>1:
78
+ if len(i) > 20:
79
+ try:
80
+ cur_list = re.split(r'。|!', i)
81
+ for i in cur_list:
82
+ if len(i)>1:
83
+ final_list.append(i+'。')
84
+ except:
85
+ pass
86
+ else:
87
+ final_list.append(i)
88
+ final_list = [x for x in final_list if x != '']
89
+ print(final_list)
90
+ return final_list
91
 
92
+ def to_numpy(tensor: torch.Tensor):
93
+ return tensor.detach().cpu().numpy() if tensor.requires_grad \
94
+ else tensor.detach().numpy()
95
 
96
+ def chatgpt(text):
97
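+ # Send the text to gpt-3.5-turbo, restoring any previous turns from log.pickle so the reply has conversational context.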
+ messages = []
98
+ try:
99
+ with open('log.pickle', 'rb') as f:
100
+ messages = pickle.load(f)
101
+ messages.append({"role": "user", "content": text},)
102
+ chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
103
+ reply = chat.choices[0].message.content
104
+ messages.append({"role": "assistant", "content": reply})
105
+ print(messages[-1])
106
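+ # Once the history reaches 12 entries, drop one of the older user/assistant exchanges so later prompts stay short.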
+ if len(messages) == 12:
107
+ messages[6:10] = messages[8:]
108
+ del messages[-2:]
109
+ with open('log.pickle', 'wb') as f:
110
+ messages2 = []
111
+ pickle.dump(messages2, f)
112
+ return reply,messages
113
+ except:
114
+ messages.append({"role": "user", "content": text},)
115
+ chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
116
+ reply = chat.choices[0].message.content
117
+ messages.append({"role": "assistant", "content": reply})
118
+ print(messages[-1])
119
+ if len(messages) == 12:
120
+ messages[6:10] = messages[8:]
121
+ del messages[-2:]
122
+ with open('log.pickle', 'wb') as f:
123
+ pickle.dump(messages, f)
124
+ return reply,messages
125
 
126
+ def get_symbols_from_json(path):
127
+ assert os.path.isfile(path)
128
+ with open(path, 'r') as f:
129
+ data = json.load(f)
130
+ return data['symbols']
131
 
132
+ def sle(language,text):
133
+ text = text.replace('\n', ' ').replace('\r', '').replace(" ", "")
134
  if language == "中文":
135
  tts_input1 = "[ZH]" + text + "[ZH]"
136
  return tts_input1
137
  elif language == "自动":
138
+ tts_input1 = f"[JA]{text}[JA]" if is_japanese(text) else f"[ZH]{text}[ZH]"
139
  return tts_input1
140
  elif language == "日文":
141
  tts_input1 = "[JA]" + text + "[JA]"
 
145
  return tts_input1
146
  elif language == "手动":
147
  return text
148
+
149
+ def get_text(text,hps_ms):
150
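+ # Convert raw text into a LongTensor of symbol ids, interleaving blank tokens when the training config used add_blank.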
+ text_norm = text_to_sequence(text,hps_ms.data.text_cleaners)
151
+ if hps_ms.data.add_blank:
152
+ text_norm = commons.intersperse(text_norm, 0)
153
+ text_norm = torch.LongTensor(text_norm)
154
+ return text_norm
155
+
156
+ def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
157
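+ # Voice conversion: compute the source spectrogram and let the model re-synthesise it with the target speaker's id.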
+ input_audio = record_audio if record_audio is not None else upload_audio
158
+ if input_audio is None:
159
+ return "You need to record or upload an audio", None
160
+ sampling_rate, audio = input_audio
161
+ original_speaker_id = selection(original_speaker)
162
+ target_speaker_id = selection(target_speaker)
163
+
164
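+ # Normalise int PCM to float32 in [-1, 1]; the next lines downmix to mono and resample to the model's rate if needed.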
+ audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
165
+ if len(audio.shape) > 1:
166
+ audio = librosa.to_mono(audio.transpose(1, 0))
167
+ if sampling_rate != hps.data.sampling_rate:
168
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
169
+ with torch.no_grad():
170
+ y = torch.FloatTensor(audio)
171
+ y = y / max(-y.min(), y.max()) / 0.99
172
+ y = y.to(dev)
173
+ y = y.unsqueeze(0)
174
+ spec = spectrogram_torch(y, hps.data.filter_length,
175
+ hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
176
+ center=False).to(dev)
177
+ spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
178
+ sid_src = torch.LongTensor([original_speaker_id]).to(dev)
179
+ sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
180
+ audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
181
+ 0, 0].data.cpu().float().numpy()
182
+ del y, spec, spec_lengths, sid_src, sid_tgt
183
+ return "Success", (hps.data.sampling_rate, audio)
184
+
185
+ def selection(speaker):
186
+ if speaker == "高咲侑":
187
+ spk = 0
188
+ return spk
189
+
190
+ elif speaker == "歩夢":
191
+ spk = 1
192
+ return spk
193
+
194
+ elif speaker == "かすみ":
195
+ spk = 2
196
+ return spk
197
+
198
+ elif speaker == "しずく":
199
+ spk = 3
200
+ return spk
201
+
202
+ elif speaker == "果林":
203
+ spk = 4
204
+ return spk
205
+
206
+ elif speaker == "愛":
207
+ spk = 5
208
+ return spk
209
+
210
+ elif speaker == "彼方":
211
+ spk = 6
212
+ return spk
213
+
214
+ elif speaker == "せつ菜":
215
+ spk = 7
216
+ return spk
217
+
218
+ elif speaker == "エマ":
219
+ spk = 8
220
+ return spk
221
+
222
+ elif speaker == "璃奈":
223
+ spk = 9
224
+ return spk
225
+
226
+ elif speaker == "栞子":
227
+ spk = 10
228
+ return spk
229
+
230
+ elif speaker == "ランジュ":
231
+ spk = 11
232
+ return spk
233
 
234
+ elif speaker == "ミア":
235
+ spk = 12
236
+ return spk
237
 
238
+ elif speaker == "派蒙":
239
+ spk = 16
240
+ return spk
241
+
242
+ elif speaker == "c1":
243
+ spk = 18
244
+ return spk
245
+
246
+ elif speaker == "c2":
247
+ spk = 19
248
+ return spk
249
+
250
+ elif speaker == "華恋":
251
+ spk = 21
252
+ return spk
253
+
254
+ elif speaker == "まひる":
255
+ spk = 22
256
+ return spk
257
+
258
+ elif speaker == "なな":
259
+ spk = 23
260
+ return spk
261
+
262
+ elif speaker == "クロディーヌ":
263
+ spk = 24
264
+ return spk
265
 
266
+ elif speaker == "ひかり":
267
+ spk = 25
268
+ return spk
269
+
270
+ elif speaker == "純那":
271
+ spk = 26
272
+ return spk
273
+
274
+ elif speaker == "香子":
275
+ spk = 27
276
+ return spk
277
+
278
+ elif speaker == "真矢":
279
+ spk = 28
280
+ return spk
281
+
282
+ elif speaker == "双葉":
283
+ spk = 29
284
+ return spk
285
+
286
+ elif speaker == "ミチル":
287
+ spk = 30
288
+ return spk
289
+
290
+ elif speaker == "メイファン":
291
+ spk = 31
292
+ return spk
293
+
294
+ elif speaker == "やちよ":
295
+ spk = 32
296
+ return spk
297
+
298
+ elif speaker == "晶":
299
+ spk = 33
300
+ return spk
301
+
302
+ elif speaker == "いちえ":
303
+ spk = 34
304
+ return spk
305
+
306
+ elif speaker == "ゆゆ子":
307
+ spk = 35
308
+ return spk
309
+
310
+ elif speaker == "塁":
311
+ spk = 36
312
+ return spk
313
+
314
+ elif speaker == "珠緒":
315
+ spk = 37
316
+ return spk
317
 
318
+ elif speaker == "あるる":
319
+ spk = 38
320
+ return spk
321
+
322
+ elif speaker == "ララフィン":
323
+ spk = 39
324
+ return spk
325
+
326
+ elif speaker == "美空":
327
+ spk = 40
328
+ return spk
329
+
330
+ elif speaker == "静羽":
331
+ spk = 41
332
+ return spk
333
+
334
+ else:
335
+ return 0
336
+
337
+ def create_tts_fn(net_g,hps,speaker_id):
338
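+ # Factory: capture this model, its hparams and the speaker id in a closure and hand back the Gradio callback.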
+ speaker_id = int(speaker_id)
339
+ def tts_fn(is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
340
+ repeat_time = int(repeat_time)
341
+ if is_gpt:
342
+ openai.api_key = api_key
343
+ text,messages = chatgpt(text)
344
+ htm = to_html(messages)
345
+ else:
346
+ htm = ''
347
+ if not extract:
348
  t1 = time.time()
349
+ stn_tst = get_text(sle(language,text),hps)
350
  with torch.no_grad():
351
+ x_tst = stn_tst.unsqueeze(0).to(dev)
352
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
353
+ sid = torch.LongTensor([speaker_id]).to(dev)
354
+ audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
355
  t2 = time.time()
356
  spending_time = "推理时间为:"+str(t2-t1)+"s"
357
  print(spending_time)
358
+ file_path = "subtitles.srt"
359
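+ # Write the waveform to audiopath and, if requested, re-encode it at 44.1 kHz with ffmpeg once per repeat (used for the Live2D sound path); failures are ignored.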
+ try:
360
+ write(audiopath + '.wav',22050,audio)
361
+ if is_audio:
362
+ for i in range(repeat_time):
363
+ cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
364
+ os.system(cmd)
365
+ except:
366
+ pass
367
+ return (hps.data.sampling_rate, audio),file_path,htm
368
+ else:
369
+ a = ['【','[','(','(']
370
+ b = ['】',']',')',')']
371
+ for i in a:
372
+ text = text.replace(i,'<')
373
+ for i in b:
374
+ text = text.replace(i,'>')
375
+ final_list = extrac(text.replace('“','').replace('”',''))
376
+ audio_fin = []
377
+ c = 0
378
+ t = datetime.timedelta(seconds=0)
379
+ f1 = open("subtitles.srt",'w',encoding='utf-8')  # open once, before the loop, so each sentence's cue is appended rather than overwritten
380
+ for sentence in final_list:
381
+ try:
382
+ c +=1
383
+ stn_tst = get_text(sle(language,sentence),hps)
384
+ with torch.no_grad():
385
+ x_tst = stn_tst.unsqueeze(0).to(dev)
386
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
387
+ sid = torch.LongTensor([speaker_id]).to(dev)
388
+ t1 = time.time()
389
+ audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
390
+ t2 = time.time()
391
+ spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
392
+ print(spending_time)
393
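+ # SRT cue times: the start is the running total so far, the end adds this clip's duration (samples / 22050 Hz).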
+ time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
394
+ last_time = datetime.timedelta(seconds=len(audio)/float(22050))
395
+ t+=last_time
396
+ time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
397
+ print(time_end)
398
+ f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
399
+ audio_fin.append(audio)
400
+ except:
401
+ pass
402
+ try:
403
+ write(audiopath + '.wav',22050,np.concatenate(audio_fin))
404
+ if is_audio:
405
+ for i in range(repeat_time):
406
+ cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
407
+ os.system(cmd)
408
+
409
+ except:
410
+ pass
411
+
412
+ file_path = "subtitles.srt"
413
+ return (hps.data.sampling_rate, np.concatenate(audio_fin)),file_path,htm
414
+ return tts_fn
415
 
416
+ if __name__ == '__main__':
417
+ hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
418
+ dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
419
+ models = []
420
+ schools = ["Nijigasaki","ShojoKageki","ShojoKageki-Nijigasaki"]
421
+ lan = ["中文","日文","自动","手动"]
422
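+ # checkpoints/info.json lists one entry per school/model (checkpoint path plus its speakers); load each checkpoint and build a per-speaker tts_fn.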
+ with open("checkpoints/info.json", "r", encoding="utf-8") as f:
423
+ models_info = json.load(f)
424
+ for i in models_info:
425
+ school = models_info[i]
426
+ speakers = school["speakers"]
427
+ phone_dict = {
428
+ symbol: i for i, symbol in enumerate(symbols)
429
+ }
430
+ checkpoint = models_info[i]["checkpoint"]
431
+ net_g = SynthesizerTrn(
432
+ len(symbols),
433
+ hps.data.filter_length // 2 + 1,
434
+ hps.train.segment_size // hps.data.hop_length,
435
+ n_speakers=hps.data.n_speakers,
436
+ **hps.model).to(dev)
437
+ _ = net_g.eval()
438
+ _ = utils.load_checkpoint(checkpoint , net_g)
439
+ content = []
440
+ for j in speakers:
441
+ sid = int(speakers[j]['sid'])
442
+ title = school
443
+ example = speakers[j]['speech']
444
+ name = speakers[j]["name"]
445
+ content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
446
+ models.append(content)
447
+ idols = ["c1","c2","高咲侑","歩夢","かすみ","しずく","果林","愛","彼方","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","華恋","まひる","なな","クロディーヌ","ひかり",'純那',"香子","真矢","双葉","ミチル","メイファン","やちよ","晶","いちえ","ゆゆ子","塁","珠緒","あるる","ララフィン","美空","静羽","あるる"]
448
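+ # UI layout: one tab per school, one sub-tab per speaker, each wired to the tts_fn closure created for that speaker above.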
+ with gr.Blocks() as app:
449
+ with gr.Tabs():
450
+ for i in schools:
451
+ with gr.TabItem(i):
452
+ for (sid, name, title, example, tts_fn) in models[schools.index(i)]:
453
+ with gr.TabItem(name):
454
+ with gr.Column():
455
+ with gr.Row():
456
+ with gr.Row():
457
+ gr.Markdown(
458
+ '<div align="center">'
459
+ f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
460
+ '</div>'
461
+ )
462
+ output_UI = gr.outputs.HTML()
463
+ with gr.Row():
464
+ with gr.Column(scale=0.85):
465
+ input1 = gr.TextArea(label="Text", value=example,lines = 1)
466
+ with gr.Column(scale=0.15, min_width=0):
467
+ btnVC = gr.Button("Send")
468
+ output1 = gr.Audio(label="采样率22050")
469
+ with gr.Accordion(label="Setting(TTS)", open=False):
470
+ input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
471
+ input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
472
+ input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
473
+ input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
474
+ with gr.Accordion(label="Advanced Setting(GPT3.5接口+长句子合成,建议克隆本仓库后运行main.py)", open=False):
475
+ input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
476
+ output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
477
+ api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
478
+ api_input2 = gr.TextArea(label="api-key",lines=1,value = '见 https://openai.com/blog/openai-api')
479
+ audio_input1 = gr.Checkbox(value=False, label="修改音频路径(live2d)")
480
+ audio_input2 = gr.TextArea(label="音频路径",lines=1,value = '#参考 D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
481
+ audio_input3 = gr.Dropdown(label="重复生成次数", choices=list(range(101)), value='0', interactive=True)
482
+ btnVC.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
483
+ with gr.Tab("Voice Conversion(弱化版sovits)"):
484
+ gr.Markdown("""
485
+ 录制或上传声音,并选择要转换的音色。
486
+ """)
487
+ with gr.Column():
488
+ record_audio = gr.Audio(label="record your voice", source="microphone")
489
+ upload_audio = gr.Audio(label="or upload audio here", source="upload")
490
+ source_speaker = gr.Dropdown(choices=idols, value="歩夢", label="source speaker")
491
+ target_speaker = gr.Dropdown(choices=idols, value="まひる", label="target speaker")
492
+ with gr.Column():
493
+ message_box = gr.Textbox(label="Message")
494
+ converted_audio = gr.Audio(label='converted audio')
495
+ btn = gr.Button("Convert!")
496
+ btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
497
+ outputs=[message_box, converted_audio])
498
+ with gr.Tab("说明"):
499
+ gr.Markdown(
500
+ "### <center> 请不要生成会对个人以及企划造成侵害的内容,自觉遵守相关法律,禁止商业使用或让他人产生困扰\n"
501
+ "<div align='center'>从左到右分别是虹团,少歌中文特化版,以及五校混合版。这三个均为不同的模型,效果也有差异</div>\n"
502
+ "<div align='center'>因为我会时不时地更新模型,所以会碰到平台抽风问题,大部分情况下一天就能恢复了。</div>\n"
503
+ '<div align="center"><a>参数说明:这个十分玄学,我还没找到最合适的,如果效果不佳可以将噪声比例和噪声偏差调节至0。按照经验,合成日语时也可以将噪声比例调节至0.2-0.3区间,语调会正常一些。duration代表整体语速,1.0大部分情况应该就够了</div>'
504
+ '<div align="center"><a>建议只在平台上体验最基础的功能,强烈建议将该仓库克隆至本地或者于colab运行 main.py或app.py</div>')
505
+ app.launch()