Mahiruoshi committed on
Commit d4ed48a
1 Parent(s): c25a639

Update app.py

Files changed (1)
  1. app.py +41 -37
app.py CHANGED
@@ -156,40 +156,42 @@ def get_text(text,hps_ms):
     text_norm = torch.LongTensor(text_norm)
     return text_norm
 
-def vc_fn(text,language,n_scale,n_scale_w,l_scale,original_speaker, target_speaker, record_audio, upload_audio):
-    input_audio = record_audio if record_audio is not None else upload_audio
-    original_speaker_id = selection(original_speaker)
-    target_speaker_id = selection(target_speaker)
-    if input_audio is None:
-        stn_tst = get_text(sle(language,text),hps)
-        with torch.no_grad():
-            x_tst = stn_tst.unsqueeze(0).to(dev)
-            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
-            sid = torch.LongTensor([original_speaker_id]).to(dev)
-            audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
-        sampling_rate = hps.data.sampling_rate
-    else:
-        sampling_rate, audio = input_audio
-        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
-        if len(audio.shape) > 1:
-            audio = librosa.to_mono(audio.transpose(1, 0))
-        if sampling_rate != hps.data.sampling_rate:
-            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
-        with torch.no_grad():
-            y = torch.FloatTensor(audio)
-            y = y / max(-y.min(), y.max()) / 0.99
-            y = y.to(dev)
-            y = y.unsqueeze(0)
-            spec = spectrogram_torch(y, hps.data.filter_length,
-                hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                center=False).to(dev)
-            spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
-            sid_src = torch.LongTensor([original_speaker_id]).to(dev)
-            sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
-            audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
-                0, 0].data.cpu().float().numpy()
-        del y, spec, spec_lengths, sid_src, sid_tgt
-    return "Success", (hps.data.sampling_rate, audio)
+def create_vc_fn(net_g,hps):
+    def vc_fn(text,language,n_scale,n_scale_w,l_scale,original_speaker, target_speaker, record_audio, upload_audio):
+        input_audio = record_audio if record_audio is not None else upload_audio
+        original_speaker_id = selection(original_speaker)
+        target_speaker_id = selection(target_speaker)
+        if input_audio is None:
+            stn_tst = get_text(sle(language,text),hps)
+            with torch.no_grad():
+                x_tst = stn_tst.unsqueeze(0).to(dev)
+                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+                sid = torch.LongTensor([original_speaker_id]).to(dev)
+                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+            sampling_rate = hps.data.sampling_rate
+        else:
+            sampling_rate, audio = input_audio
+            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+            if len(audio.shape) > 1:
+                audio = librosa.to_mono(audio.transpose(1, 0))
+            if sampling_rate != hps.data.sampling_rate:
+                audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
+            with torch.no_grad():
+                y = torch.FloatTensor(audio)
+                y = y / max(-y.min(), y.max()) / 0.99
+                y = y.to(dev)
+                y = y.unsqueeze(0)
+                spec = spectrogram_torch(y, hps.data.filter_length,
+                    hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                    center=False).to(dev)
+                spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
+                sid_src = torch.LongTensor([original_speaker_id]).to(dev)
+                sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
+                audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
+                    0, 0].data.cpu().float().numpy()
+            del y, spec, spec_lengths, sid_src, sid_tgt
+        return "Success", (hps.data.sampling_rate, audio)
+    return vc_fn
 
 def selection(speaker):
     if speaker == "高咲侑":
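The hunk above mirrors the create_tts_fn factory already in the file: vc_fn is wrapped in create_vc_fn so each school's UI gets a conversion function bound to its own net_g and hps, rather than relying on whichever globals hold the last loaded model. Binding through factory arguments also sidesteps Python's late-binding behavior for closures created in a loop. A minimal sketch of the pattern; make_handler and its toy models are hypothetical stand-ins, not code from app.py:

```python
# Sketch of the closure-factory pattern behind create_vc_fn/create_tts_fn.
# make_handler and the string "models" below are hypothetical stand-ins.

def make_handler(model, config):
    # model and config are captured as arguments, so each handler keeps
    # a reference to the objects passed in at creation time.
    def handler(text):
        return (model, config, text)  # stand-in for real inference
    return handler

models = ["model_A", "model_B"]

# Closures made in a loop without a factory all see the loop variable's
# final value (late binding):
late_bound = [lambda text: (m, text) for m in models]
assert late_bound[0]("hi")[0] == "model_B"  # both handlers use model_B!

# The factory binds each handler to its own model:
handlers = [make_handler(m, {"sr": 22050}) for m in models]
assert handlers[0]("hi")[0] == "model_A"
assert handlers[1]("hi")[0] == "model_B"
```

The `return vc_fn` at the end of the new function is what hands the bound closure back to the caller in the later `schools.append((i,create_vc_fn(net_g,hps)))` hunk.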
@@ -436,7 +438,8 @@ if __name__ == '__main__':
     hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
     dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     models = []
-    schools = ["ShojoKageki-Nijigasaki","ShojoKageki","Nijigasaki"]
+    schools_list = ["ShojoKageki-Nijigasaki","ShojoKageki","Nijigasaki"]
+    schools = []
     lan = ["中文","日文","自动","手动"]
     with open("checkpoints/info.json", "r", encoding="utf-8") as f:
         models_info = json.load(f)
@@ -463,12 +466,13 @@ if __name__ == '__main__':
             name = speakers[j]["name"]
             content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
         models.append(content)
+        schools.append((i,create_vc_fn(net_g,hps)))
     with gr.Blocks() as app:
         with gr.Tabs():
-            for i in schools:
+            for (i,vc_fn) in schools:
                 with gr.TabItem(i):
                     idols = ["派蒙"]
-                    for (sid, name, title, example, tts_fn) in models[schools.index(i)]:
+                    for (sid, name, title, example, tts_fn) in models[schools_list.index(i)]:
                         idols.append(name)
                         with gr.TabItem(name):
                             with gr.Column():
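This hunk works hand in hand with the previous one: schools_list keeps the plain school names for positional lookups into models, while schools now accumulates (name, vc_fn) pairs so the tab loop can unpack a per-school conversion function alongside its label. A small sketch of the same pairing idiom, with a hypothetical build_handler standing in for create_vc_fn(net_g,hps):

```python
# Pairing each tab name with its own handler; build_handler is a
# hypothetical stand-in for create_vc_fn(net_g, hps).
def build_handler(name):
    def handler(text):
        return f"{name}: {text}"
    return handler

schools_list = ["ShojoKageki-Nijigasaki", "ShojoKageki", "Nijigasaki"]

# Built while the models load, one pair per school:
schools = [(name, build_handler(name)) for name in schools_list]

# Unpacked in the UI loop; schools_list.index(name) still serves the
# positional lookup into the parallel models list.
for name, handler in schools:
    idx = schools_list.index(name)
    print(idx, handler("hello"))
```

Iterating with enumerate(schools), or keying models by school name in a dict, would avoid the linear schools_list.index(i) lookup, but the parallel-list form matches the existing structure of models.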
@@ -503,7 +507,7 @@ if __name__ == '__main__':
                             audio_input3 = gr.Dropdown(label="重复生成次数", choices=list(range(101)), value='0', interactive=True)
                             btnbook.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
                             btnVC.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
-                    with gr.Tab("Voice Conversion(就是sovits的原理)"):
+                    with gr.Tab("Voice Conversion(类似sovits)"):
                         gr.Markdown("""
                         声线转化,使用模型中的说话人作为音源时效果更佳
                         """)
 