Improve message
app.py CHANGED
@@ -145,7 +145,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
     print(f"text_language: {text_language}")
 
     if len(prompt_text) > 100 or len(text) > 100:
-
+        print("Input text is limited to 100 characters.")
+        return "Input text is limited to 100 characters.", None
     t0 = ttime()
     prompt_text = prompt_text.strip("\n")
     prompt_language, text = prompt_language, text.strip("\n")
@@ -153,7 +154,8 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
     wav16k, _ = librosa.load(ref_wav_path, sr=16000)  # 派…
     # length of wav16k in sec should be in 60s
     if len(wav16k) > 16000 * 60:
-
+        print("Input audio is limited to 60 seconds.")
+        return "Input audio is limited to 60 seconds.", None
     wav16k = wav16k[: int(hps.data.sampling_rate * max_sec)]
     wav16k = torch.from_numpy(wav16k)
     if is_half == True:
@@ -233,9 +235,12 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language)
     audio_opt.append(zero_wav)
     t4 = ttime()
     print("%.3f\t%.3f\t%.3f\t%.3f" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
-
-
-
+    return "Success! time: %.3f\t%.3f\t%.3f\t%.3f" % (
+        t1 - t0,
+        t2 - t1,
+        t3 - t2,
+        t4 - t3,
+    ), (hps.data.sampling_rate, (np.concatenate(audio_opt, 0) * 32768).astype(np.int16))
 
 
 initial_md = """
@@ -258,30 +263,32 @@ If you do not agree with these terms and conditions, you may not use or referenc
 
 with gr.Blocks(title="GPT-SoVITS Zero-shot TTS Demo") as app:
     gr.Markdown(initial_md)
-
-
-
-
-
-
-
-
-            value="Japanese",
-        )
-    gr.Markdown(value="*Text to synthesize")
-    with gr.Row():
-        text = gr.Textbox(label="Text to synthesize")
-        text_language = gr.Dropdown(
-            label="Language of text",
-            choices=["Chinese", "English", "Japanese"],
-            value="Japanese",
-        )
-    inference_button = gr.Button("Synthesize", variant="primary")
-    output = gr.Audio(label="Result")
-    inference_button.click(
-        get_tts_wav,
-        [inp_ref, prompt_text, prompt_language, text, text_language],
-        [output],
+    gr.Markdown("## Upload reference audio")
+    with gr.Row():
+        inp_ref = gr.Audio(label="Reference audio", type="filepath")
+        prompt_text = gr.Textbox(label="Transcription of reference audio")
+        prompt_language = gr.Dropdown(
+            label="Language of reference audio",
+            choices=["Chinese", "English", "Japanese"],
+            value="Japanese",
     )
+    gr.Markdown("## Text to synthesize")
+    with gr.Row():
+        text = gr.Textbox(label="Text to synthesize")
+        text_language = gr.Dropdown(
+            label="Language of text",
+            choices=["Chinese", "English", "Japanese"],
+            value="Japanese",
+        )
+    inference_button = gr.Button("Synthesize", variant="primary")
+    with gr.Column():
+        info = gr.Textbox(label="Info")
+        output = gr.Audio(label="Result")
+    inference_button.click(
+        get_tts_wav,
+        [inp_ref, prompt_text, prompt_language, text, text_language],
+        [info, output],
+    )
 
+    app.queue(max_size=10)
     app.launch(inbrowser=True)
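
For context on the new wiring: get_tts_wav now returns two values, a status string and a (sample_rate, int16 array) tuple, and inference_button.click maps them positionally onto [info, output]. Below is a minimal runnable sketch of the same contract, with a hypothetical get_tts_wav_like standing in for the real model call:

import numpy as np
import gradio as gr

def get_tts_wav_like(text):
    # Guard clause in the style of the diff: a message for the Info
    # textbox, and None so the Audio component stays empty.
    if len(text) > 100:
        return "Input text is limited to 100 characters.", None
    # Stand-in for real synthesis: one second of silence at 32 kHz.
    sampling_rate = 32000
    audio_float = np.zeros(sampling_rate, dtype=np.float32)
    # gr.Audio accepts a (sample_rate, ndarray) tuple; float audio in
    # [-1, 1] is scaled to int16 the same way the diff does.
    return "Success!", (sampling_rate, (audio_float * 32768).astype(np.int16))

with gr.Blocks() as demo:
    text = gr.Textbox(label="Text to synthesize")
    inference_button = gr.Button("Synthesize", variant="primary")
    info = gr.Textbox(label="Info")
    output = gr.Audio(label="Result")
    # Two return values map onto the two output components in order.
    inference_button.click(get_tts_wav_like, [text], [info, output])

demo.queue(max_size=10)  # cap pending requests, as the diff does for app
demo.launch(inbrowser=True)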
|