Mahiruoshi committed on
Commit
b522165
1 Parent(s): be9e927

Update app.py

Files changed (1)
  1. app.py +329 -476
app.py CHANGED
@@ -2,7 +2,7 @@ import logging
2
  logging.getLogger('numba').setLevel(logging.WARNING)
3
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
4
  logging.getLogger('urllib3').setLevel(logging.WARNING)
5
- import json
6
  import re
7
  import numpy as np
8
  import IPython.display as ipd
@@ -16,129 +16,251 @@ import gradio as gr
16
  import time
17
  import datetime
18
  import os
19
- import pickle
20
- import openai
21
- from scipy.io.wavfile import write
22
  import librosa
23
  from mel_processing import spectrogram_torch
24
- def is_japanese(string):
25
  for ch in string:
26
  if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
27
  return True
28
  return False
29
-
30
- def is_english(string):
31
  import re
32
  pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
33
  if pattern.fullmatch(string):
34
  return True
35
  else:
36
  return False
37
 
38
- def to_html(chat_history):
39
- chat_html = ""
40
- for item in chat_history:
41
- if item['role'] == 'user':
42
- chat_html += f"""
43
- <div style="margin-bottom: 20px;">
44
- <div style="text-align: right; margin-right: 20px;">
45
- <span style="background-color: #4CAF50; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
46
- {item['content']}
47
- </span>
48
- </div>
49
- </div>
50
- """
51
- else:
52
- chat_html += f"""
53
- <div style="margin-bottom: 20px;">
54
- <div style="text-align: left; margin-left: 20px;">
55
- <span style="background-color: white; color: black; padding: 10px; border-radius: 10px; display: inline-block; max-width: 80%; word-wrap: break-word;">
56
- {item['content']}
57
- </span>
58
- </div>
59
- </div>
60
- """
61
- output_html = f"""
62
- <div style="height: 400px; overflow-y: scroll; padding: 10px;">
63
- {chat_html}
64
- </div>
65
- """
66
- return output_html
67
 
68
- def extrac(text):
69
- text = re.sub("<[^>]*>","",text)
70
- result_list = re.split(r'\n', text)
71
- final_list = []
72
- if not torch.cuda.is_available():
73
- if len(final_list) > 10:
74
- return ['对不起,做不到']
75
- for i in result_list:
76
- if is_english(i):
77
- i = romajitable.to_kana(i).katakana
78
- i = i.replace('\n','').replace(' ','')
79
- #Current length of single sentence: 20
80
- if len(i)>1:
81
- if len(i) > 20:
82
- try:
83
- cur_list = re.split(r'。|!', i)
84
- for i in cur_list:
85
- if len(i)>1:
86
- final_list.append(i+'。')
87
- except:
88
- pass
89
- else:
90
- final_list.append(i)
91
- final_list = [x for x in final_list if x != '']
92
- print(final_list)
93
- return final_list
94
 
95
- def to_numpy(tensor: torch.Tensor):
96
- return tensor.detach().cpu().numpy() if tensor.requires_grad \
97
- else tensor.detach().numpy()
98
 
99
- def chatgpt(text):
100
- messages = []
101
- try:
102
- with open('log.pickle', 'rb') as f:
103
- messages = pickle.load(f)
104
- messages.append({"role": "user", "content": text},)
105
- chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
106
- reply = chat.choices[0].message.content
107
- messages.append({"role": "assistant", "content": reply})
108
- print(messages[-1])
109
- if len(messages) == 12:
110
- messages[6:10] = messages[8:]
111
- del messages[-2:]
112
- with open('log.pickle', 'wb') as f:
113
- messages2 = []
114
- pickle.dump(messages2, f)
115
- return reply,messages
116
- except:
117
- messages.append({"role": "user", "content": text},)
118
- chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
119
- reply = chat.choices[0].message.content
120
- messages.append({"role": "assistant", "content": reply})
121
- print(messages[-1])
122
- if len(messages) == 12:
123
- messages[6:10] = messages[8:]
124
- del messages[-2:]
125
- with open('log.pickle', 'wb') as f:
126
- pickle.dump(messages, f)
127
- return reply,messages
128
 
129
- def get_symbols_from_json(path):
130
- assert os.path.isfile(path)
131
- with open(path, 'r') as f:
132
- data = json.load(f)
133
- return data['symbols']
134
 
135
- def sle(language,text):
136
- text = text.replace('\n', ' ').replace('\r', '').replace(" ", "")
137
  if language == "中文":
138
  tts_input1 = "[ZH]" + text + "[ZH]"
139
  return tts_input1
140
  elif language == "自动":
141
- tts_input1 = f"[JA]{text}[JA]" if is_japanese(text) else f"[ZH]{text}[ZH]"
142
  return tts_input1
143
  elif language == "日文":
144
  tts_input1 = "[JA]" + text + "[JA]"
@@ -148,394 +270,125 @@ def sle(language,text):
148
  return tts_input1
149
  elif language == "手动":
150
  return text
151
-
152
- def get_text(text,hps_ms):
153
- text_norm = text_to_sequence(text,hps_ms.data.text_cleaners)
154
- if hps_ms.data.add_blank:
155
- text_norm = commons.intersperse(text_norm, 0)
156
- text_norm = torch.LongTensor(text_norm)
157
- return text_norm
158
-
159
- def create_vc_fn(net_g,hps):
160
- def vc_fn(text,language,n_scale,n_scale_w,l_scale,original_speaker, target_speaker, record_audio, upload_audio):
161
  input_audio = record_audio if record_audio is not None else upload_audio
162
- original_speaker_id = selection(original_speaker)
163
- target_speaker_id = selection(target_speaker)
164
  if input_audio is None:
165
- stn_tst = get_text(sle(language,text),hps)
166
- with torch.no_grad():
167
- x_tst = stn_tst.unsqueeze(0).to(dev)
168
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
169
- sid = torch.LongTensor([original_speaker_id]).to(dev)
170
- audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
171
- sampling_rate = hps.data.sampling_rate
172
- else:
173
- sampling_rate, audio = input_audio
174
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
175
  if len(audio.shape) > 1:
176
  audio = librosa.to_mono(audio.transpose(1, 0))
177
- if sampling_rate != hps.data.sampling_rate:
178
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
179
  with torch.no_grad():
180
  y = torch.FloatTensor(audio)
181
  y = y / max(-y.min(), y.max()) / 0.99
182
- y = y.to(dev)
183
  y = y.unsqueeze(0)
184
- spec = spectrogram_torch(y, hps.data.filter_length,
185
- hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
186
- center=False).to(dev)
187
- spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
188
- sid_src = torch.LongTensor([original_speaker_id]).to(dev)
189
- sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
190
- audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
191
  0, 0].data.cpu().float().numpy()
192
  del y, spec, spec_lengths, sid_src, sid_tgt
193
- return "Success", (hps.data.sampling_rate, audio)
194
- return vc_fn
195
-
196
- def selection(speaker):
197
- if speaker == "高咲侑":
198
- spk = 0
199
- return spk
200
-
201
- elif speaker == "歩夢":
202
- spk = 1
203
- return spk
204
-
205
- elif speaker == "かすみ":
206
- spk = 2
207
- return spk
208
-
209
- elif speaker == "しずく":
210
- spk = 3
211
- return spk
212
-
213
- elif speaker == "果林":
214
- spk = 4
215
- return spk
216
-
217
- elif speaker == "愛":
218
- spk = 5
219
- return spk
220
-
221
- elif speaker == "彼方":
222
- spk = 6
223
- return spk
224
-
225
- elif speaker == "せつ菜":
226
- spk = 7
227
- return spk
228
-
229
- elif speaker == "エマ":
230
- spk = 8
231
- return spk
232
-
233
- elif speaker == "璃奈":
234
- spk = 9
235
- return spk
236
-
237
- elif speaker == "栞子":
238
- spk = 10
239
- return spk
240
-
241
- elif speaker == "ランジュ":
242
- spk = 11
243
- return spk
244
-
245
- elif speaker == "ミア":
246
- spk = 12
247
- return spk
248
-
249
- elif speaker == "派蒙":
250
- spk = 16
251
- return spk
252
-
253
- elif speaker == "c1":
254
- spk = 18
255
- return spk
256
-
257
- elif speaker == "c2":
258
- spk = 19
259
- return spk
260
-
261
- elif speaker == "華恋":
262
- spk = 21
263
- return spk
264
-
265
- elif speaker == "まひる":
266
- spk = 22
267
- return spk
268
-
269
- elif speaker == "なな":
270
- spk = 23
271
- return spk
272
-
273
- elif speaker == "クロディーヌ":
274
- spk = 24
275
- return spk
276
-
277
- elif speaker == "ひかり":
278
- spk = 25
279
- return spk
280
-
281
- elif speaker == "純那":
282
- spk = 26
283
- return spk
284
-
285
- elif speaker == "香子":
286
- spk = 27
287
- return spk
288
-
289
- elif speaker == "真矢":
290
- spk = 28
291
- return spk
292
-
293
- elif speaker == "双葉":
294
- spk = 29
295
- return spk
296
-
297
- elif speaker == "ミチル":
298
- spk = 30
299
- return spk
300
-
301
- elif speaker == "メイファン":
302
- spk = 31
303
- return spk
304
 
305
- elif speaker == "やちよ":
306
- spk = 32
307
- return spk
308
-
309
- elif speaker == "晶":
310
- spk = 33
311
- return spk
312
-
313
- elif speaker == "いちえ":
314
- spk = 34
315
- return spk
316
-
317
- elif speaker == "ゆゆ子":
318
- spk = 35
319
- return spk
320
-
321
- elif speaker == "塁":
322
- spk = 36
323
- return spk
324
-
325
- elif speaker == "珠緒":
326
- spk = 37
327
- return spk
328
-
329
- elif speaker == "あるる":
330
- spk = 38
331
- return spk
332
-
333
- elif speaker == "ララフィン":
334
- spk = 39
335
- return spk
336
-
337
- elif speaker == "美空":
338
- spk = 40
339
- return spk
340
-
341
- elif speaker == "静羽":
342
- spk = 41
343
- return spk
344
-
345
- else:
346
- return 0
347
-
348
- def check_text(input):
349
- if isinstance(input, str):
350
- return input
351
- else:
352
- with open(input.name, "r", encoding="utf-8") as f:
353
- return f.read()
354
-
355
- def create_tts_fn(net_g,hps,speaker_id):
356
- speaker_id = int(speaker_id)
357
- def tts_fn(is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
358
- text = check_text(text)
359
- repeat_ime = int(repeat_time)
360
- if is_gpt:
361
- openai.api_key = api_key
362
- text,messages = chatgpt(text)
363
- htm = to_html(messages)
364
- else:
365
- messages = []
366
- messages.append({"role": "assistant", "content": text})
367
- htm = to_html(messages)
368
- if language == '自动':
369
- l_scale = 1.1 if is_japanese(text) else l_scale
370
- if not extract:
371
  t1 = time.time()
372
- stn_tst = get_text(sle(language,text),hps)
373
  with torch.no_grad():
374
- x_tst = stn_tst.unsqueeze(0).to(dev)
375
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
376
- sid = torch.LongTensor([speaker_id]).to(dev)
377
- audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
378
  t2 = time.time()
379
  spending_time = "推理时间为:"+str(t2-t1)+"s"
380
  print(spending_time)
381
- file_path = "subtitles.srt"
382
- try:
383
- write(audiopath + '.wav',22050,audio)
384
- if is_audio:
385
- for i in range(repeat_time):
386
- cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
387
- os.system(cmd)
388
- except:
389
- pass
390
- return (hps.data.sampling_rate, audio),file_path,htm
391
- else:
392
- a = ['【','[','(','(']
393
- b = ['】',']',')',')']
394
- for i in a:
395
- text = text.replace(i,'<')
396
- for i in b:
397
- text = text.replace(i,'>')
398
- final_list = extrac(text.replace('“','').replace('”',''))
399
- audio_fin = []
400
- c = 0
401
- t = datetime.timedelta(seconds=0)
402
- for sentence in final_list:
403
- try:
404
- f1 = open("subtitles.srt",'w',encoding='utf-8')
405
- c +=1
406
- stn_tst = get_text(sle(language,sentence),hps)
407
- with torch.no_grad():
408
- x_tst = stn_tst.unsqueeze(0).to(dev)
409
- x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
410
- sid = torch.LongTensor([speaker_id]).to(dev)
411
- t1 = time.time()
412
- audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
413
- t2 = time.time()
414
- spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
415
- print(spending_time)
416
- time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
417
- last_time = datetime.timedelta(seconds=len(audio)/float(22050))
418
- t+=last_time
419
- time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
420
- print(time_end)
421
- f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
422
- audio_fin.append(audio)
423
- except:
424
- pass
425
- try:
426
- write(audiopath + '.wav',22050,np.concatenate(audio_fin))
427
- if is_audio:
428
- for i in range(repeat_time):
429
- cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
430
- os.system(cmd)
431
-
432
- except:
433
- pass
434
-
435
- file_path = "subtitles.srt"
436
- return (hps.data.sampling_rate, np.concatenate(audio_fin)),file_path,htm
437
- return tts_fn
438
 
439
- if __name__ == '__main__':
440
- hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
441
- dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
442
- models = []
443
- schools_list = ["ShojoKageki-Nijigasaki","ShojoKageki","Nijigasaki"]
444
- schools = []
445
- lan = ["中文","日文","自动","手动"]
446
- with open("checkpoints/info.json", "r", encoding="utf-8") as f:
447
- models_info = json.load(f)
448
- for i in models_info:
449
- school = models_info[i]
450
- speakers = school["speakers"]
451
- phone_dict = {
452
- symbol: i for i, symbol in enumerate(symbols)
453
- }
454
- checkpoint = models_info[i]["checkpoint"]
455
- net_g = SynthesizerTrn(
456
- len(symbols),
457
- hps.data.filter_length // 2 + 1,
458
- hps.train.segment_size // hps.data.hop_length,
459
- n_speakers=hps.data.n_speakers,
460
- **hps.model).to(dev)
461
- _ = net_g.eval()
462
- _ = utils.load_checkpoint(checkpoint , net_g)
463
- content = []
464
- for j in speakers:
465
- sid = int(speakers[j]['sid'])
466
- title = school
467
- example = speakers[j]['speech']
468
- name = speakers[j]["name"]
469
- content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
470
- models.append(content)
471
- schools.append((i,create_vc_fn(net_g,hps)))
472
- with gr.Blocks() as app:
473
- with gr.Tabs():
474
- for (i,vc_fn) in schools:
475
- with gr.TabItem(i):
476
- idols = ["派蒙"]
477
- for (sid, name, title, example, tts_fn) in models[schools_list.index(i)]:
478
- idols.append(name)
479
- with gr.TabItem(name):
480
- with gr.Column():
481
- with gr.Row():
482
- with gr.Row():
483
- gr.Markdown(
484
- '<div align="center">'
485
- f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
486
- '</div>'
487
- )
488
- output_UI = gr.outputs.HTML()
489
- with gr.Row():
490
- with gr.Column(scale=0.85):
491
- input1 = gr.TextArea(label="Text", value=example,lines = 1)
492
- with gr.Column(scale=0.15, min_width=0):
493
- btnVC = gr.Button("Send")
494
- output1 = gr.Audio(label="采样率22050")
495
- with gr.Accordion(label="Setting(TTS)", open=False):
496
- input2 = gr.Dropdown(label="参数及语言选择方式", choices=lan, value="自动", interactive=True)
497
- input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
498
- input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
499
- input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
500
- with gr.Accordion(label="Advanced Setting(GPT3.5接口+小说合成,仅展示用,大部分功能用不了。需克隆本仓库后本地运行main.py)", open=False):
501
- input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
502
- inputxt = gr.File(label="Text")
503
- btnbook = gr.Button("小说合成")
504
- output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
505
- api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
506
- api_input2 = gr.TextArea(label="api-key",lines=1,value = '见 https://openai.com/blog/openai-api')
507
- audio_input1 = gr.Checkbox(value=False, label="修改音频路径(live2d)")
508
- audio_input2 = gr.TextArea(label="音频路径",lines=1,value = '#参考 D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
509
- audio_input3 = gr.Dropdown(label="重复生成次数", choices=list(range(101)), value='0', interactive=True)
510
- btnbook.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
511
- btnVC.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
512
- with gr.Tab("Voice Conversion(类似sovits)"):
513
- gr.Markdown("""
514
- 声线转化,使用模型中的说话人作为音源时效果更佳
515
- """)
516
- with gr.Column():
517
- with gr.Accordion(label="方法1:录制或上传声音,可进行歌声合成", open=False):
518
- record_audio = gr.Audio(label="record your voice", source="microphone")
519
- upload_audio = gr.Audio(label="or upload audio here", source="upload")
520
- with gr.Accordion(label="方法2:由原说话人先进行tts后套娃,适用于合成中文等特殊场景", open=True):
521
- text = gr.TextArea(label="Text", value='输入文本',lines = 1)
522
- language = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
523
- n_scale = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
524
- n_scale_w = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
525
- l_scale = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1.1)
526
- source_speaker = gr.Dropdown(choices=idols, value=idols[-2], label="source speaker")
527
- target_speaker = gr.Dropdown(choices=idols, value=idols[-3], label="target speaker")
528
- with gr.Column():
529
- message_box = gr.Textbox(label="Message")
530
- converted_audio = gr.Audio(label='converted audio')
531
- btn = gr.Button("Convert!")
532
- btn.click(vc_fn, inputs=[text,language,n_scale,n_scale_w,l_scale,source_speaker, target_speaker, record_audio, upload_audio],
533
- outputs=[message_box, converted_audio])
534
- with gr.Tab("说明"):
535
- gr.Markdown(
536
- "### <center> 请不要生成会对个人以及企划造成侵害的内容,自觉遵守相关法律,静止商业使用或让他人产生困扰\n"
537
- "<div align='center'>从左到右分别是虹团,少歌中文特化版,以及五校混合版。这三个均为不同的模型,效果也有差异</div>\n"
538
- "<div align='center'>因为我会时不时地更新模型,所以会碰到平台抽风问题,大部分情况下一天就能恢复了。</div>\n"
539
- '<div align="center"><a>参数说明:这个十分玄学,如果效果不佳可以将噪声比例和噪声偏差调节至0,这会完全随机化音频源。按照经验,合成日语时也可以将噪声比例调节至0.2-0.3区间,语调会正常一些。duration代表整体语速,可视情况调至1.1或1.2,目前已自动匹配,如需调整将language项调为日文或中文。</div>'
540
- '<div align="center"><a>建议只在平台上体验最基础的功能,强烈建议将该仓库克隆至本地或者于colab运行,启动程序为main.py或app.py</div>')
541
- app.launch()
 
2
  logging.getLogger('numba').setLevel(logging.WARNING)
3
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
4
  logging.getLogger('urllib3').setLevel(logging.WARNING)
5
+ import romajitable
6
  import re
7
  import numpy as np
8
  import IPython.display as ipd
 
16
  import time
17
  import datetime
18
  import os
19
  import librosa
20
  from mel_processing import spectrogram_torch
21
+ class VitsGradio:
22
+ def __init__(self):
23
+ self.dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
24
+ self.lan = ["中文","日文","自动","手动"]
25
+ self.idols = ["c1","c2","高咲侑","歩夢","かすみ","しずく","果林","愛","彼方","せつ菜","璃奈","栞子","エマ","ランジュ","ミア","華恋","まひる","なな","クロディーヌ","ひかり",'純那',"香子","真矢","双葉","ミチル","メイファン","やちよ","晶","いちえ","ゆゆ子","塁","珠緒","あるる","ララフィン","美空","静羽","あるる"]
26
+ self.modelPaths = []
27
+ for root,dirs,files in os.walk("checkpoints"):
28
+ for dir in dirs:
29
+ self.modelPaths.append(dir)
30
+ with gr.Blocks() as self.Vits:
31
+ gr.Markdown(
32
+ "## <center> Lovelive虹团中日双语VITS\n"
33
+ "### <center> 请不要生成会对个人以及企划造成侵害的内容\n"
34
+ "<div align='center'>目前有标贝普通话版,去标贝版,少歌模型还是大饼状态</div>"
35
+ '<div align="center"><a>参数说明:由于爱抖露们过于有感情,合成日语时建议将噪声比例调节至0.2-0.3区间,噪声偏差对应着每个字之间的间隔,对普通话影响较大,duration代表整体语速</div>'
36
+ '<div align="center"><a>合成前请先选择模型,否则第一次合成不一定成功。长段落/小说合成建议colab或本地运行</div>')
37
+ with gr.Tab("TTS合成"):
38
+ with gr.Row():
39
+ with gr.Column():
40
+ with gr.Row():
41
+ with gr.Column():
42
+ input1 = gr.TextArea(label="Text", value="为什么你会那么熟练啊?你和雪菜亲过多少次了")
43
+ input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
44
+ input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
45
+ btnVC = gr.Button("Submit")
46
+ with gr.Column():
47
+ input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
48
+ input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
49
+ input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
50
+ output1 = gr.Audio(label="采样率22050")
51
+ btnVC.click(self.infer, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1])
52
+ with gr.Tab("选择模型"):
53
+ with gr.Column():
54
+ modelstrs = gr.Dropdown(label = "模型", choices = self.modelPaths, value = self.modelPaths[0], type = "value")
55
+ btnMod = gr.Button("载入模型")
56
+ statusa = gr.TextArea()
57
+ btnMod.click(self.loadCk, inputs=[modelstrs], outputs = [statusa])
58
+ with gr.Tab("Voice Conversion"):
59
+ gr.Markdown("""
60
+ 录制或上传声音,并选择要转换的音色。
61
+ """)
62
+ with gr.Column():
63
+ record_audio = gr.Audio(label="record your voice", source="microphone")
64
+ upload_audio = gr.Audio(label="or upload audio here", source="upload")
65
+ source_speaker = gr.Dropdown(choices=self.idols, value="歩夢", label="source speaker")
66
+ target_speaker = gr.Dropdown(choices=self.idols, value="歩夢", label="target speaker")
67
+ with gr.Column():
68
+ message_box = gr.Textbox(label="Message")
69
+ converted_audio = gr.Audio(label='converted audio')
70
+ btn = gr.Button("Convert!")
71
+ btn.click(self.vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
72
+ outputs=[message_box, converted_audio])
73
+ with gr.Tab("小说合成(带字幕)"):
74
+ with gr.Row():
75
+ with gr.Column():
76
+ with gr.Row():
77
+ with gr.Column():
78
+ input1 = gr.TextArea(label="建议colab或本地克隆后运行本仓库", value="为什么你会那么熟练啊?你和雪菜亲过多少次了")
79
+ input2 = gr.Dropdown(label="Language", choices=self.lan, value="自动", interactive=True)
80
+ input3 = gr.Dropdown(label="Speaker", choices=self.idols, value="歩夢", interactive=True)
81
+ btnVC = gr.Button("Submit")
82
+ with gr.Column():
83
+ input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.267)
84
+ input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
85
+ input6 = gr.Slider(minimum=0.1, maximum=10, label="Duration", value=1)
86
+ output1 = gr.Audio(label="采样率22050")
87
+ subtitle = gr.outputs.File(label="字幕文件:subtitles.srt")
88
+ btnVC.click(self.infer2, inputs=[input1, input2, input3, input4, input5, input6], outputs=[output1,subtitle])
89
+
90
+ def loadCk(self,path):
91
+ self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
92
+ self.net_g = SynthesizerTrn(
93
+ len(symbols),
94
+ self.hps.data.filter_length // 2 + 1,
95
+ self.hps.train.segment_size // self.hps.data.hop_length,
96
+ n_speakers=self.hps.data.n_speakers,
97
+ **self.hps.model).to(self.dev)
98
+ _ = self.net_g.eval()
99
+ _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.net_g)
100
+ return "success"
101
+
102
+ def get_text(self,text):
103
+ text_norm = text_to_sequence(text,self.hps.data.text_cleaners)
104
+ if self.hps.data.add_blank:
105
+ text_norm = commons.intersperse(text_norm, 0)
106
+ text_norm = torch.LongTensor(text_norm)
107
+ return text_norm
108
+
109
+ def is_japanese(self,string):
110
  for ch in string:
111
  if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
112
  return True
113
  return False
114
+
115
+ def is_english(self,string):
116
  import re
117
  pattern = re.compile('^[A-Za-z0-9.,:;!?()_*"\' ]+$')
118
  if pattern.fullmatch(string):
119
  return True
120
  else:
121
  return False
122
+
123
+ def selection(self,speaker):
124
+ if speaker == "高咲侑":
125
+ spk = 0
126
+ return spk
127
 
128
+ elif speaker == "歩夢":
129
+ spk = 1
130
+ return spk
131
 
132
+ elif speaker == "かすみ":
133
+ spk = 2
134
+ return spk
135
+
136
+ elif speaker == "しずく":
137
+ spk = 3
138
+ return spk
139
 
140
+ elif speaker == "果林":
141
+ spk = 4
142
+ return spk
143
+
144
+ elif speaker == "愛":
145
+ spk = 5
146
+ return spk
147
 
148
+ elif speaker == "彼方":
149
+ spk = 6
150
+ return spk
151
 
152
+ elif speaker == "せつ菜":
153
+ spk = 7
154
+ return spk
155
+ elif speaker == "エマ":
156
+ spk = 8
157
+ return spk
158
+ elif speaker == "璃奈":
159
+ spk = 9
160
+ return spk
161
+ elif speaker == "栞子":
162
+ spk = 10
163
+ return spk
164
+ elif speaker == "ランジュ":
165
+ spk = 11
166
+ return spk
167
+ elif speaker == "ミア":
168
+ spk = 12
169
+ return spk
170
+
171
+ elif speaker == "派蒙":
172
+ spk = 16
173
+ return spk
174
+
175
+ elif speaker == "c1":
176
+ spk = 18
177
+ return spk
178
 
179
+ elif speaker == "c2":
180
+ spk = 19
181
+ return spk
182
+
183
+ elif speaker == "華恋":
184
+ spk = 21
185
+ return spk
186
+
187
+ elif speaker == "まひる":
188
+ spk = 22
189
+ return spk
190
+
191
+ elif speaker == "なな":
192
+ spk = 23
193
+ return spk
194
+
195
+ elif speaker == "クロディーヌ":
196
+ spk = 24
197
+ return spk
198
+
199
+ elif speaker == "ひかり":
200
+ spk = 25
201
+ return spk
202
+
203
+ elif speaker == "純那":
204
+ spk = 26
205
+ return spk
206
+
207
+ elif speaker == "香子":
208
+ spk = 27
209
+ return spk
210
+
211
+ elif speaker == "真矢":
212
+ spk = 28
213
+ return spk
214
+ elif speaker == "双葉":
215
+ spk = 29
216
+ return spk
217
+ elif speaker == "ミチル":
218
+ spk = 30
219
+ return spk
220
+ elif speaker == "メイファン":
221
+ spk = 31
222
+ return spk
223
+ elif speaker == "やちよ":
224
+ spk = 32
225
+ return spk
226
+ elif speaker == "晶":
227
+ spk = 33
228
+ return spk
229
+ elif speaker == "いちえ":
230
+ spk = 34
231
+ return spk
232
+ elif speaker == "ゆゆ子":
233
+ spk = 35
234
+ return spk
235
+ elif speaker == "塁":
236
+ spk = 36
237
+ return spk
238
+ elif speaker == "珠緒":
239
+ spk = 37
240
+ return spk
241
+ elif speaker == "あるる":
242
+ spk = 38
243
+ return spk
244
+ elif speaker == "ララフィン":
245
+ spk = 39
246
+ return spk
247
+ elif speaker == "美空":
248
+ spk = 40
249
+ return spk
250
+ elif speaker == "静羽":
251
+ spk = 41
252
+ return spk
253
+ else:
254
+ return 0
255
+
256
+
257
+ def sle(self,language,text):
258
+ text = text.replace('\n','。').replace(' ',',')
259
  if language == "中文":
260
  tts_input1 = "[ZH]" + text + "[ZH]"
261
  return tts_input1
262
  elif language == "自动":
263
+ tts_input1 = f"[JA]{text}[JA]" if self.is_japanese(text) else f"[ZH]{text}[ZH]"
264
  return tts_input1
265
  elif language == "日文":
266
  tts_input1 = "[JA]" + text + "[JA]"
 
270
  return tts_input1
271
  elif language == "手动":
272
  return text
273
+
274
+ def extrac(self,text):
275
+ text = re.sub("<[^>]*>","",text)
276
+ result_list = re.split(r'\n', text)
277
+ final_list = []
278
+ for i in result_list:
279
+ if self.is_english(i):
280
+ i = romajitable.to_kana(i).katakana
281
+ i = i.replace('\n','').replace(' ','')
282
+ #Current length of single sentence: 20
283
+ '''
284
+ if len(i)>1:
285
+ if len(i) > 20:
286
+ try:
287
+ cur_list = re.split(r'。|!', i)
288
+ for i in cur_list:
289
+ if len(i)>1:
290
+ final_list.append(i+'。')
291
+ except:
292
+ pass
293
+ else:
294
+ final_list.append(i)
295
+ '''
296
+ try:
297
+ final_list.append(i)
298
+ except:
299
+ pass
300
+ final_list = [x for x in final_list if x != '']
301
+ print(final_list)
302
+ return final_list
303
+
304
+ def vc_fn(self,original_speaker, target_speaker, record_audio, upload_audio):
305
  input_audio = record_audio if record_audio is not None else upload_audio
306
  if input_audio is None:
307
+ return "You need to record or upload an audio", None
308
+ sampling_rate, audio = input_audio
309
+ original_speaker_id = self.selection(original_speaker)
310
+ target_speaker_id = self.selection(target_speaker)
311
+
312
+ audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
313
  if len(audio.shape) > 1:
314
  audio = librosa.to_mono(audio.transpose(1, 0))
315
+ if sampling_rate != self.hps.data.sampling_rate:
316
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=self.hps.data.sampling_rate)
317
  with torch.no_grad():
318
  y = torch.FloatTensor(audio)
319
  y = y / max(-y.min(), y.max()) / 0.99
320
+ y = y.to(self.dev)
321
  y = y.unsqueeze(0)
322
+ spec = spectrogram_torch(y, self.hps.data.filter_length,
323
+ self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
324
+ center=False).to(self.dev)
325
+ spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.dev)
326
+ sid_src = torch.LongTensor([original_speaker_id]).to(self.dev)
327
+ sid_tgt = torch.LongTensor([target_speaker_id]).to(self.dev)
328
+ audio = self.net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
329
  0, 0].data.cpu().float().numpy()
330
  del y, spec, spec_lengths, sid_src, sid_tgt
331
+ return "Success", (self.hps.data.sampling_rate, audio)
332
 
333
+ def infer(self, text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
334
+ try:
335
+ speaker_id = int(self.selection(speaker_id))
336
  t1 = time.time()
337
+ stn_tst = self.get_text(self.sle(language,text))
338
  with torch.no_grad():
339
+ x_tst = stn_tst.unsqueeze(0).to(self.dev)
340
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
341
+ sid = torch.LongTensor([speaker_id]).to(self.dev)
342
+ audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
343
  t2 = time.time()
344
  spending_time = "推理时间为:"+str(t2-t1)+"s"
345
  print(spending_time)
346
+ return (self.hps.data.sampling_rate, audio)
347
+ except:
348
+ self.hps = utils.get_hparams_from_file(f"checkpoints/biaobei/config.json")
349
+ self.net_g = SynthesizerTrn(
350
+ len(symbols),
351
+ self.hps.data.filter_length // 2 + 1,
352
+ self.hps.train.segment_size // self.hps.data.hop_length,
353
+ n_speakers=self.hps.data.n_speakers,
354
+ **self.hps.model).to(self.dev)
355
+ _ = self.net_g.eval()
356
+ _ = utils.load_checkpoint(f"checkpoints/biaobei/model.pth", self.net_g)
357
 
358
+ def infer2(self, text ,language, speaker_id,n_scale= 0.667,n_scale_w = 0.8, l_scale = 1):
359
+ speaker_id = int(self.selection(speaker_id))
360
+ a = ['【','[','(','(']
361
+ b = ['】',']',')',')']
362
+ for i in a:
363
+ text = text.replace(i,'<')
364
+ for i in b:
365
+ text = text.replace(i,'>')
366
+ final_list = self.extrac(text.replace('“','').replace('”',''))
367
+ audio_fin = []
368
+ c = 0
369
+ t = datetime.timedelta(seconds=0)
370
+ f1 = open("subtitles.srt",'w',encoding='utf-8')
371
+ for sentence in final_list:
372
+ c +=1
373
+ stn_tst = self.get_text(self.sle(language,sentence))
374
+ with torch.no_grad():
375
+ x_tst = stn_tst.unsqueeze(0).to(self.dev)
376
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(self.dev)
377
+ sid = torch.LongTensor([speaker_id]).to(self.dev)
378
+ t1 = time.time()
379
+ audio = self.net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
380
+ t2 = time.time()
381
+ spending_time = "第"+str(c)+"句的推理时间为:"+str(t2-t1)+"s"
382
+ print(spending_time)
383
+ time_start = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
384
+ last_time = datetime.timedelta(seconds=len(audio)/float(22050))
385
+ t+=last_time
386
+ time_end = str(t).split(".")[0] + "," + str(t.microseconds)[:3]
387
+ print(time_end)
388
+ f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
389
+ audio_fin.append(audio)
390
+ file_path = "subtitles.srt"
391
+ return (self.hps.data.sampling_rate, np.concatenate(audio_fin)),file_path
392
+ print("开始部署")
393
+ grVits = VitsGradio()
394
+ grVits.Vits.launch()
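
For reference, a minimal local-usage sketch of the VitsGradio class introduced in this commit follows. It is not part of the commit itself: it assumes the repository's model code (utils, SynthesizerTrn, the text frontend) is importable, that a checkpoints/biaobei/ folder with config.json and model.pth exists as the fallback path in infer() expects, and that the module-level launch lines at the bottom of app.py are disabled so the class can be imported without immediately starting the server.

# Sketch only, under the assumptions above; names and values are illustrative.
from app import VitsGradio

demo = VitsGradio()               # builds the Gradio Blocks UI and scans checkpoints/ for model folders
print(demo.loadCk("biaobei"))     # load a model before synthesizing; returns "success"
rate, wav = demo.infer(
    "为什么你会那么熟练啊?",        # text to synthesize
    "自动",                        # language mode: auto-detects Japanese vs Chinese via is_japanese()
    "歩夢",                        # speaker name, mapped to a speaker id by selection()
    n_scale=0.267,                 # noise scale (controls emotion/variation)
    n_scale_w=0.7,                 # noise scale w (phoneme-length variation)
    l_scale=1.0,                   # duration / length scale (overall speaking rate)
)
demo.Vits.launch()                # serve the UI locally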