Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update app.py
Browse files
- app.py +22 -22
- requirements.txt +0 -1
app.py
CHANGED
@@ -1,8 +1,5 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
-
import psutil
|
4 |
-
import time
|
5 |
-
from threading import Timer
|
6 |
import librosa
|
7 |
import numpy as np
|
8 |
import torch
|
@@ -28,7 +25,7 @@ def get_text(text, hps, is_phoneme):
|
|
28 |
def create_tts_fn(model, hps, speaker_ids):
|
29 |
def tts_fn(text, speaker, speed, is_phoneme):
|
30 |
if limitation and ((len(text) > 60 and not is_phoneme) or (len(text) > 120 and is_phoneme)):
|
31 |
-
|
32 |
speaker_id = speaker_ids[speaker]
|
33 |
stn_tst = get_text(text, hps, is_phoneme)
|
34 |
with no_grad():
|
@@ -38,7 +35,7 @@ def create_tts_fn(model, hps, speaker_ids):
|
|
38 |
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
|
39 |
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
|
40 |
del stn_tst, x_tst, x_tst_lengths, sid
|
41 |
-
return
|
42 |
|
43 |
return tts_fn
|
44 |
|
@@ -46,11 +43,11 @@ def create_tts_fn(model, hps, speaker_ids):
|
|
46 |
def create_vc_fn(model, hps, speaker_ids):
|
47 |
def vc_fn(original_speaker, target_speaker, input_audio):
|
48 |
if input_audio is None:
|
49 |
-
|
50 |
sampling_rate, audio = input_audio
|
51 |
duration = audio.shape[0] / sampling_rate
|
52 |
if limitation and duration > 15:
|
53 |
-
|
54 |
original_speaker_id = speaker_ids[original_speaker]
|
55 |
target_speaker_id = speaker_ids[target_speaker]
|
56 |
|
@@ -71,7 +68,7 @@ def create_vc_fn(model, hps, speaker_ids):
|
|
71 |
audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
|
72 |
0, 0].data.cpu().float().numpy()
|
73 |
del y, spec, spec_lengths, sid_src, sid_tgt
|
74 |
-
return
|
75 |
|
76 |
return vc_fn
|
77 |
|
@@ -144,21 +141,25 @@ if __name__ == '__main__':
|
|
144 |
with advanced_options:
|
145 |
phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
|
146 |
to_phoneme_btn = gr.Button("Covert text to phoneme")
|
147 |
-
phoneme_list = gr.
|
148 |
-
|
|
|
149 |
tts_submit = gr.Button("Generate", variant="primary")
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
158 |
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
|
159 |
-
[
|
160 |
to_phoneme_btn.click(lambda x: _clean_text(x, hps.data.text_cleaners) if x != "" else x,
|
161 |
[tts_input1], [tts_input1])
|
|
|
|
|
162 |
|
163 |
with gr.TabItem("Voice Conversion"):
|
164 |
with gr.Tabs():
|
@@ -172,7 +173,6 @@ if __name__ == '__main__':
|
|
172 |
value=speakers[1])
|
173 |
vc_input3 = gr.Audio(label="Input Audio (15s limitation)")
|
174 |
vc_submit = gr.Button("Convert", variant="primary")
|
175 |
-
|
176 |
-
|
177 |
-
vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
|
178 |
app.launch()
|
|
|
1 |
import json
|
2 |
import os
|
|
|
|
|
|
|
3 |
import librosa
|
4 |
import numpy as np
|
5 |
import torch
|
|
|
25 |
def create_tts_fn(model, hps, speaker_ids):
|
26 |
def tts_fn(text, speaker, speed, is_phoneme):
|
27 |
if limitation and ((len(text) > 60 and not is_phoneme) or (len(text) > 120 and is_phoneme)):
|
28 |
+
raise gr.Error("Text is too long")
|
29 |
speaker_id = speaker_ids[speaker]
|
30 |
stn_tst = get_text(text, hps, is_phoneme)
|
31 |
with no_grad():
|
|
|
35 |
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
|
36 |
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
|
37 |
del stn_tst, x_tst, x_tst_lengths, sid
|
38 |
+
return hps.data.sampling_rate, audio
|
39 |
|
40 |
return tts_fn
|
41 |
|
|
|
43 |
def create_vc_fn(model, hps, speaker_ids):
|
44 |
def vc_fn(original_speaker, target_speaker, input_audio):
|
45 |
if input_audio is None:
|
46 |
+
raise gr.Error("You need to upload an audio")
|
47 |
sampling_rate, audio = input_audio
|
48 |
duration = audio.shape[0] / sampling_rate
|
49 |
if limitation and duration > 15:
|
50 |
+
raise gr.Error("Audio is too long")
|
51 |
original_speaker_id = speaker_ids[original_speaker]
|
52 |
target_speaker_id = speaker_ids[target_speaker]
|
53 |
|
|
|
68 |
audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
|
69 |
0, 0].data.cpu().float().numpy()
|
70 |
del y, spec, spec_lengths, sid_src, sid_tgt
|
71 |
+
return hps.data.sampling_rate, audio
|
72 |
|
73 |
return vc_fn
|
74 |
|
|
|
141 |
with advanced_options:
|
142 |
phoneme_input = gr.Checkbox(value=False, label="Phoneme input")
|
143 |
to_phoneme_btn = gr.Button("Covert text to phoneme")
|
144 |
+
phoneme_list = gr.Dataset(label="Phoneme list", components=[tts_input1],
|
145 |
+
samples=[[x] for x in symbols])
|
146 |
+
phoneme_list_json = gr.Json(value=symbols, visible=False)
|
147 |
tts_submit = gr.Button("Generate", variant="primary")
|
148 |
+
tts_output = gr.Audio(label="Output Audio")
|
149 |
+
advanced_button.click(None, [], [], _js="""
|
150 |
+
() => {
|
151 |
+
let options = document.querySelector("body > gradio-app");
|
152 |
+
if (options.shadowRoot != null)
|
153 |
+
options = options.shadowRoot;
|
154 |
+
options = options.querySelector("#advanced-options");
|
155 |
+
options.style.display = ["none", ""].includes(options.style.display) ? "flex" : "none";
|
156 |
+
}""")
|
157 |
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
|
158 |
+
[tts_output])
|
159 |
to_phoneme_btn.click(lambda x: _clean_text(x, hps.data.text_cleaners) if x != "" else x,
|
160 |
[tts_input1], [tts_input1])
|
161 |
+
phoneme_list.click(None, [phoneme_list, phoneme_list_json, tts_input1], [tts_input1],
|
162 |
+
_js="(i,phonemes, text) => text + phonemes[i]")
|
163 |
|
164 |
with gr.TabItem("Voice Conversion"):
|
165 |
with gr.Tabs():
|
|
|
173 |
value=speakers[1])
|
174 |
vc_input3 = gr.Audio(label="Input Audio (15s limitation)")
|
175 |
vc_submit = gr.Button("Convert", variant="primary")
|
176 |
+
vc_output = gr.Audio(label="Output Audio")
|
177 |
+
vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output])
|
|
|
178 |
app.launch()
|
requirements.txt
CHANGED
@@ -9,5 +9,4 @@ torch
|
|
9 |
torchvision
|
10 |
Unidecode
|
11 |
pyopenjtalk
|
12 |
-
psutil
|
13 |
gradio
|
|
|
9 |
torchvision
|
10 |
Unidecode
|
11 |
pyopenjtalk
|
|
|
12 |
gradio
|