Mahiruoshi committed
Commit: d4ed48a
Parent(s): c25a639

Update app.py

app.py CHANGED
@@ -156,40 +156,42 @@ def get_text(text,hps_ms):
     text_norm = torch.LongTensor(text_norm)
     return text_norm
 
-def …
-…
+def create_vc_fn(net_g,hps):
+    def vc_fn(text,language,n_scale,n_scale_w,l_scale,original_speaker, target_speaker, record_audio, upload_audio):
+        input_audio = record_audio if record_audio is not None else upload_audio
+        original_speaker_id = selection(original_speaker)
+        target_speaker_id = selection(target_speaker)
+        if input_audio is None:
+            stn_tst = get_text(sle(language,text),hps)
+            with torch.no_grad():
+                x_tst = stn_tst.unsqueeze(0).to(dev)
+                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+                sid = torch.LongTensor([original_speaker_id]).to(dev)
+                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+            sampling_rate = hps.data.sampling_rate
+        else:
+            sampling_rate, audio = input_audio
+            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+            if len(audio.shape) > 1:
+                audio = librosa.to_mono(audio.transpose(1, 0))
+            if sampling_rate != hps.data.sampling_rate:
+                audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
         with torch.no_grad():
-…
-        y = y.to(dev)
-        y = y.unsqueeze(0)
-        spec = spectrogram_torch(y, hps.data.filter_length,
-            hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-            center=False).to(dev)
-        spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
-        sid_src = torch.LongTensor([original_speaker_id]).to(dev)
-        sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
-        audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
-            0, 0].data.cpu().float().numpy()
-        del y, spec, spec_lengths, sid_src, sid_tgt
-        return "Success", (hps.data.sampling_rate, audio)
+            y = torch.FloatTensor(audio)
+            y = y / max(-y.min(), y.max()) / 0.99
+            y = y.to(dev)
+            y = y.unsqueeze(0)
+            spec = spectrogram_torch(y, hps.data.filter_length,
+                hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                center=False).to(dev)
+            spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
+            sid_src = torch.LongTensor([original_speaker_id]).to(dev)
+            sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
+            audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
+                0, 0].data.cpu().float().numpy()
+            del y, spec, spec_lengths, sid_src, sid_tgt
+        return "Success", (hps.data.sampling_rate, audio)
+    return vc_fn
 
 def selection(speaker):
     if speaker == "高咲侑":
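The new create_vc_fn closes over net_g and hps and returns a vc_fn that accepts either a microphone recording or an uploaded clip; when neither is given, it first synthesizes source audio from text with net_g.infer for the original speaker, then converts it to the target speaker. Before conversion, an uploaded clip is scaled from integer PCM to float32, downmixed to mono, and resampled to the model rate. A minimal standalone sketch of that preprocessing path, where SR_MODEL and fake_clip are illustrative stand-ins rather than names from app.py:

import numpy as np
import librosa

SR_MODEL = 22050  # stand-in for hps.data.sampling_rate

def preprocess(sampling_rate, audio):
    # Gradio delivers integer PCM; scale to float32 in [-1, 1].
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    # Downmix (n_samples, channels) to mono.
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    # Match the model's sampling rate.
    if sampling_rate != SR_MODEL:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=SR_MODEL)
    return audio

# Two seconds of 16-bit stereo noise at 44.1 kHz, standing in for an upload.
fake_clip = (np.random.randn(2 * 44100, 2) * 1000).astype(np.int16)
print(preprocess(44100, fake_clip).shape)  # (44100,) after resampling

One detail worth noting: the committed normalization y = y / max(-y.min(), y.max()) / 0.99 divides by 0.99, which leaves the peak slightly above 1.0; multiplying by 0.99 would keep it just under full scale.

The conversion itself runs on a linear magnitude spectrogram. spectrogram_torch comes from the VITS codebase; in plain torch.stft terms it computes roughly the following (a sketch with typical 22.05 kHz VITS parameters, ignoring the reflection padding the real helper applies):

import torch

def linear_spectrogram(y, n_fft=1024, hop=256, win=1024):
    # Magnitude STFT, shape (batch, n_fft // 2 + 1, frames).
    window = torch.hann_window(win)
    stft = torch.stft(y, n_fft, hop_length=hop, win_length=win,
                      window=window, center=False, return_complex=True)
    return stft.abs()

y = torch.randn(1, 22050)  # one second of fake audio at 22.05 kHz
spec = linear_spectrogram(y)
spec_lengths = torch.LongTensor([spec.size(-1)])  # frame count per clip
print(spec.shape, spec_lengths)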
@@ -436,7 +438,8 @@ if __name__ == '__main__':
     hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
     dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     models = []
-…
+    schools_list = ["ShojoKageki-Nijigasaki","ShojoKageki","Nijigasaki"]
+    schools = []
     lan = ["中文","日文","自动","手动"]
     with open("checkpoints/info.json", "r", encoding="utf-8") as f:
         models_info = json.load(f)
@@ -463,12 +466,13 @@ if __name__ == '__main__':
             name = speakers[j]["name"]
             content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
         models.append(content)
+        schools.append((i,create_vc_fn(net_g,hps)))
     with gr.Blocks() as app:
         with gr.Tabs():
-            for i in schools:
+            for (i,vc_fn) in schools:
                 with gr.TabItem(i):
                     idols = ["派蒙"]
-                    for (sid, name, title, example, tts_fn) in models[…
+                    for (sid, name, title, example, tts_fn) in models[schools_list.index(i)]:
                         idols.append(name)
                         with gr.TabItem(name):
                             with gr.Column():
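Because schools is appended in the same loop that fills models, the two lists stay index-aligned with schools_list, so models[schools_list.index(i)] recovers the speaker list for the school tab currently being rendered. A toy illustration of the pairing, with stand-in tuples in place of the real (sid, name, title, example, tts_fn) entries:

schools_list = ["ShojoKageki-Nijigasaki", "ShojoKageki", "Nijigasaki"]
models = [[(0, "speaker_a")], [(1, "speaker_b")], [(2, "speaker_c")]]
# Mirrors schools.append((i, create_vc_fn(net_g, hps))) with a dummy vc_fn.
schools = [(i, "vc_fn_for_" + i) for i in schools_list]

for (i, vc_fn) in schools:
    for (sid, name) in models[schools_list.index(i)]:
        print(i, sid, name, vc_fn)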
@@ -503,7 +507,7 @@ if __name__ == '__main__':
                         audio_input3 = gr.Dropdown(label="重复生成次数", choices=list(range(101)), value='0', interactive=True)
                         btnbook.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
                         btnVC.click(tts_fn, inputs=[api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[output1,output2,output_UI])
-            with gr.Tab("Voice Conversion(…
+            with gr.Tab("Voice Conversion(类似sovits)"):
                 gr.Markdown("""
                 声线转化,使用模型中的说话人作为音源时效果更佳
                 """)
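The last hunk only renames the tab label, but for context this is roughly how a voice-conversion tab of this shape is assembled in Gradio Blocks. A pared-down sketch: echo_fn stands in for the real vc_fn closure, and the widget names are illustrative, not from app.py:

import gradio as gr

def echo_fn(audio):
    # Placeholder: the real vc_fn returns ("Success", (sampling_rate, audio)).
    return "Success", audio

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("Voice Conversion(类似sovits)"):
            gr.Markdown("声线转化,使用模型中的说话人作为音源时效果更佳")
            src = gr.Audio(label="source audio")
            msg = gr.Textbox(label="status")
            out = gr.Audio(label="converted audio")
            gr.Button("convert").click(echo_fn, inputs=[src], outputs=[msg, out])

# demo.launch()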
|