Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update app.py
Browse files
app.py
CHANGED
@@ -25,7 +25,7 @@ def get_text(text, hps, is_phoneme):
|
|
25 |
def create_tts_fn(model, hps, speaker_ids):
|
26 |
def tts_fn(text, speaker, speed, is_phoneme):
|
27 |
if limitation and ((len(text) > 60 and not is_phoneme) or (len(text) > 120 and is_phoneme)):
|
28 |
-
|
29 |
speaker_id = speaker_ids[speaker]
|
30 |
stn_tst = get_text(text, hps, is_phoneme)
|
31 |
with no_grad():
|
@@ -35,7 +35,7 @@ def create_tts_fn(model, hps, speaker_ids):
|
|
35 |
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
|
36 |
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
|
37 |
del stn_tst, x_tst, x_tst_lengths, sid
|
38 |
-
return hps.data.sampling_rate, audio
|
39 |
|
40 |
return tts_fn
|
41 |
|
@@ -43,11 +43,11 @@ def create_tts_fn(model, hps, speaker_ids):
|
|
43 |
def create_vc_fn(model, hps, speaker_ids):
|
44 |
def vc_fn(original_speaker, target_speaker, input_audio):
|
45 |
if input_audio is None:
|
46 |
-
|
47 |
sampling_rate, audio = input_audio
|
48 |
duration = audio.shape[0] / sampling_rate
|
49 |
if limitation and duration > 15:
|
50 |
-
|
51 |
original_speaker_id = speaker_ids[original_speaker]
|
52 |
target_speaker_id = speaker_ids[target_speaker]
|
53 |
|
@@ -68,7 +68,7 @@ def create_vc_fn(model, hps, speaker_ids):
|
|
68 |
audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
|
69 |
0, 0].data.cpu().float().numpy()
|
70 |
del y, spec, spec_lengths, sid_src, sid_tgt
|
71 |
-
return hps.data.sampling_rate, audio
|
72 |
|
73 |
return vc_fn
|
74 |
|
@@ -145,7 +145,8 @@ if __name__ == '__main__':
|
|
145 |
samples=[[x] for x in symbols])
|
146 |
phoneme_list_json = gr.Json(value=symbols, visible=False)
|
147 |
tts_submit = gr.Button("Generate", variant="primary")
|
148 |
-
|
|
|
149 |
advanced_button.click(None, [], [], _js="""
|
150 |
() => {
|
151 |
let options = document.querySelector("body > gradio-app");
|
@@ -155,7 +156,7 @@ if __name__ == '__main__':
|
|
155 |
options.style.display = ["none", ""].includes(options.style.display) ? "flex" : "none";
|
156 |
}""")
|
157 |
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
|
158 |
-
[
|
159 |
to_phoneme_btn.click(lambda x: _clean_text(x, hps.data.text_cleaners) if x != "" else x,
|
160 |
[tts_input1], [tts_input1])
|
161 |
phoneme_list.click(None, [phoneme_list, phoneme_list_json, tts_input1], [tts_input1],
|
@@ -173,6 +174,7 @@ if __name__ == '__main__':
|
|
173 |
value=speakers[1])
|
174 |
vc_input3 = gr.Audio(label="Input Audio (15s limitation)")
|
175 |
vc_submit = gr.Button("Convert", variant="primary")
|
176 |
-
|
177 |
-
|
|
|
178 |
app.launch()
|
|
|
25 |
def create_tts_fn(model, hps, speaker_ids):
|
26 |
def tts_fn(text, speaker, speed, is_phoneme):
|
27 |
if limitation and ((len(text) > 60 and not is_phoneme) or (len(text) > 120 and is_phoneme)):
|
28 |
+
return "Error: Text is too long", None
|
29 |
speaker_id = speaker_ids[speaker]
|
30 |
stn_tst = get_text(text, hps, is_phoneme)
|
31 |
with no_grad():
|
|
|
35 |
audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
|
36 |
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
|
37 |
del stn_tst, x_tst, x_tst_lengths, sid
|
38 |
+
return "Success", (hps.data.sampling_rate, audio)
|
39 |
|
40 |
return tts_fn
|
41 |
|
|
|
43 |
def create_vc_fn(model, hps, speaker_ids):
|
44 |
def vc_fn(original_speaker, target_speaker, input_audio):
|
45 |
if input_audio is None:
|
46 |
+
return "You need to upload an audio", None
|
47 |
sampling_rate, audio = input_audio
|
48 |
duration = audio.shape[0] / sampling_rate
|
49 |
if limitation and duration > 15:
|
50 |
+
return "Error: Audio is too long", None
|
51 |
original_speaker_id = speaker_ids[original_speaker]
|
52 |
target_speaker_id = speaker_ids[target_speaker]
|
53 |
|
|
|
68 |
audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
|
69 |
0, 0].data.cpu().float().numpy()
|
70 |
del y, spec, spec_lengths, sid_src, sid_tgt
|
71 |
+
return "Success", (hps.data.sampling_rate, audio)
|
72 |
|
73 |
return vc_fn
|
74 |
|
|
|
145 |
samples=[[x] for x in symbols])
|
146 |
phoneme_list_json = gr.Json(value=symbols, visible=False)
|
147 |
tts_submit = gr.Button("Generate", variant="primary")
|
148 |
+
tts_output1 = gr.Textbox(label="Output Message")
|
149 |
+
tts_output2 = gr.Audio(label="Output Audio")
|
150 |
advanced_button.click(None, [], [], _js="""
|
151 |
() => {
|
152 |
let options = document.querySelector("body > gradio-app");
|
|
|
156 |
options.style.display = ["none", ""].includes(options.style.display) ? "flex" : "none";
|
157 |
}""")
|
158 |
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, phoneme_input],
|
159 |
+
[tts_output1, tts_output2])
|
160 |
to_phoneme_btn.click(lambda x: _clean_text(x, hps.data.text_cleaners) if x != "" else x,
|
161 |
[tts_input1], [tts_input1])
|
162 |
phoneme_list.click(None, [phoneme_list, phoneme_list_json, tts_input1], [tts_input1],
|
|
|
174 |
value=speakers[1])
|
175 |
vc_input3 = gr.Audio(label="Input Audio (15s limitation)")
|
176 |
vc_submit = gr.Button("Convert", variant="primary")
|
177 |
+
vc_output1 = gr.Textbox(label="Output Message")
|
178 |
+
vc_output2 = gr.Audio(label="Output Audio")
|
179 |
+
vc_submit.click(vc_fn, [vc_input1, vc_input2, vc_input3], [vc_output1, vc_output2])
|
180 |
app.launch()
|