Text-to-Speech
coqui
maks10263 committed on
Commit
4a5d42f
1 Parent(s): 6c2b0d7

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +703 -0
app.py ADDED
@@ -0,0 +1,703 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import io, os, stat
3
+ import subprocess
4
+ import random
5
+ from zipfile import ZipFile
6
+ import uuid
7
+ import time
8
+ import torch
9
+ import torchaudio
10
+
11
+
12
# Download the unidic dictionary (needed by mecab).
# Run the module with the interpreter that is running this app instead of
# whatever "python" resolves to on PATH. The step stays best-effort: a
# non-zero exit status is ignored, exactly as with the previous os.system call.
subprocess.run([sys.executable, "-m", "unidic", "download"], check=False)

# By using XTTS you agree to CPML license https://coqui.ai/cpml
os.environ["COQUI_TOS_AGREED"] = "1"
17
+
18
+ # langid is used to detect the language of longer text
19
+ # Most users expect the text to be in their own language; there is a checkbox to disable this check
20
+ import langid
21
+ import base64
22
+ import csv
23
+ from io import StringIO
24
+ import datetime
25
+ import re
26
+
27
+ import gradio as gr
28
+ from scipy.io.wavfile import write
29
+ from pydub import AudioSegment
30
+
31
+ from TTS.api import TTS
32
+ from TTS.tts.configs.xtts_config import XttsConfig
33
+ from TTS.tts.models.xtts import Xtts
34
+ from TTS.utils.generic_utils import get_user_data_dir
35
+
36
+ HF_TOKEN = os.environ.get("HF_TOKEN")
37
+
38
+ from huggingface_hub import HfApi
39
+
40
+ # Will use the API to restart the space on an unrecoverable error
41
+ api = HfApi(token=HF_TOKEN)
42
+ repo_id = "coqui/xtts"
43
+
44
# Use a newer ffmpeg binary than the Ubuntu 20 one so the denoising filter
# is available for microphone input.
print("Export newer ffmpeg binary for denoise filter")
# The context manager closes the archive handle once extraction is done.
with ZipFile("ffmpeg.zip") as ffmpeg_archive:
    ffmpeg_archive.extractall()
print("Make ffmpeg binary executable")
st = os.stat("ffmpeg")
os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)

# This will trigger downloading model
print("Downloading if not downloaded Coqui XTTS V2")
from TTS.utils.manage import ModelManager

model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
# The model directory is the model name with "/" replaced by "--",
# located under the TTS user data dir.
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
print("XTTS downloaded")

config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))

model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,
)
model.cuda()

# This is for debugging purposes only: predict() records here the language
# and prompt that triggered a CUDA device-side assert.
DEVICE_ASSERT_DETECTED = 0
DEVICE_ASSERT_PROMPT = None
DEVICE_ASSERT_LANG = None

# Language codes accepted by predict(), taken from the model config.
supported_languages = config.languages
79
+
80
# Value handed back to Gradio whenever no audio is produced; one slot per
# output component (waveform video, audio, metrics text, reference audio).
_NO_OUTPUT = (None, None, None, None)

# Longest prompt, in characters, that predict() accepts.
_MAX_PROMPT_CHARS = 1000

# Sample rate passed to torchaudio.save and used for the real-time factor.
_OUTPUT_SAMPLE_RATE = 24000


def _restart_space():
    """Restart the Hugging Face Space unless it is currently building."""
    space = api.get_space_runtime(repo_id=repo_id)
    if space.stage != "BUILDING":
        api.restart_space(repo_id=repo_id)
    else:
        print("TRIED TO RESTART but space is building")


def _clean_reference_audio(speaker_wav, lowpassfilter=True, trim=True):
    """Filter the reference audio with the bundled ./ffmpeg binary.

    Returns the path of the filtered file, or ``speaker_wav`` unchanged when
    ffmpeg exits with a non-zero status or no filter is enabled.
    """
    filters = []
    if lowpassfilter:
        filters += ["lowpass=8000", "highpass=75"]
    if trim:
        # better to remove silence in beginning and end for microphone:
        # reverse + trim start, then reverse back + trim start again
        silence_trim = (
            "silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
        )
        filters += ["areverse", silence_trim, "areverse", silence_trim]
    if not filters:
        return speaker_wav

    # the ".wav" suffix lets ffmpeg know the output format
    out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"

    # Pass the arguments as a list so that paths containing spaces survive;
    # joining the filters avoids a dangling comma at the end of the graph.
    command = [
        "./ffmpeg",
        "-y",
        "-i",
        speaker_wav,
        "-af",
        ",".join(filters),
        out_filename,
    ]
    try:
        subprocess.run(command, capture_output=False, text=True, check=True)
    except subprocess.CalledProcessError:
        # There was an error - command exited with non-zero code
        print("Error: failed filtering, use original microphone input")
        return speaker_wav
    print("Filtered microphone input")
    return out_filename


def _upload_error_report(request_fields, speaker_wav):
    """Upload the failing request as CSV plus its reference audio.

    ``request_fields`` are the raw predict() arguments; both files go to the
    "coqui/xtts-flagged-dataset" dataset repo under a timestamped name.
    """
    error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
    error_data = [
        field if isinstance(field, str) else str(field)
        for field in [error_time, *request_fields]
    ]
    print(error_data)
    print(speaker_wav)
    write_io = StringIO()
    csv.writer(write_io).writerows([error_data])
    csv_upload = write_io.getvalue().encode()

    filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
    print("Writing error csv")
    error_api = HfApi()
    error_api.upload_file(
        path_or_fileobj=csv_upload,
        path_in_repo=filename,
        repo_id="coqui/xtts-flagged-dataset",
        repo_type="dataset",
    )

    print("Writing error reference audio")
    speaker_filename = error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
    error_api.upload_file(
        path_or_fileobj=speaker_wav,
        path_in_repo=speaker_filename,
        repo_id="coqui/xtts-flagged-dataset",
        repo_type="dataset",
    )


def predict(
    prompt,
    language,
    audio_file_pth,
    mic_file_path,
    use_mic,
    voice_cleanup,
    no_lang_auto_detect,
    agree,
):
    """Synthesise ``prompt`` in the voice of the reference audio.

    The request is validated first (terms accepted, supported language,
    detected language matches the chosen one, reference audio present, prompt
    length); every rejection shows a ``gr.Warning`` and returns four ``None``
    values. On success the audio is written to "output.wav".

    Args:
        prompt: Text to speak.
        language: Language code; must be in ``supported_languages``.
        audio_file_pth: Path of the reference audio (used unless ``use_mic``).
        mic_file_path: Path of the microphone recording (used when ``use_mic``).
        use_mic: Take the reference voice from the microphone recording.
        voice_cleanup: Filter the reference audio with ffmpeg first.
        no_lang_auto_detect: Skip the detected-vs-chosen language check.
        agree: Whether the user accepted the terms.

    Returns:
        A 4-tuple ``(waveform_video, "output.wav", metrics_text, speaker_wav)``,
        or ``(None, None, None, None)`` when nothing was generated.
    """
    global DEVICE_ASSERT_DETECTED
    global DEVICE_ASSERT_PROMPT
    global DEVICE_ASSERT_LANG

    if not agree:
        gr.Warning("Please accept the Terms & Condition!")
        return _NO_OUTPUT

    if language not in supported_languages:
        gr.Warning(
            f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
        )
        return _NO_OUTPUT

    # strip need as there is space at end!
    language_predicted = langid.classify(prompt)[0].strip()

    # tts expects chinese as zh-cn
    if language_predicted == "zh":
        language_predicted = "zh-cn"

    print(f"Detected language:{language_predicted}, Chosen language:{language}")

    # Language detection is only enforced after 15 characters: any language is
    # allowed for short text as some may be common, and the detector can fail
    # on pretty short or mixed text. Unchecking auto-detection disables it.
    if (
        len(prompt) > 15
        and language_predicted != language
        and not no_lang_auto_detect
    ):
        gr.Warning(
            "It looks like your text isn’t the language you chose , if you’re sure the text is the same language you chose, please check disable language auto-detection checkbox"
        )
        return _NO_OUTPUT

    if use_mic:
        if mic_file_path is None:
            gr.Warning(
                "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
            )
            return _NO_OUTPUT
        speaker_wav = mic_file_path
    else:
        speaker_wav = audio_file_pth

    if speaker_wav is None:
        # Without this guard the cleanup step fails with a TypeError when it
        # concatenates the missing path.
        gr.Warning(
            "Please provide a reference audio, or check Use Microphone to record one"
        )
        return _NO_OUTPUT

    # Cheap length checks run before the ffmpeg filtering below.
    if len(prompt) < 2:
        gr.Warning("Please give a longer prompt text")
        return _NO_OUTPUT
    if len(prompt) > _MAX_PROMPT_CHARS:
        gr.Warning(
            f"Text length limited to {_MAX_PROMPT_CHARS} characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
        )
        return _NO_OUTPUT

    # Filtering for microphone input, as it has BG noise, maybe silence in
    # beginning and end. This is fast filtering, not perfect.
    if voice_cleanup:
        speaker_wav = _clean_reference_audio(speaker_wav)

    if DEVICE_ASSERT_DETECTED:
        # It will likely never come here as we restart space on first unrecoverable error now
        print(
            f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}"
        )
        # HF Space specific.. This error is unrecoverable need to restart space
        _restart_space()

    try:
        metrics_text = ""

        # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
        try:
            gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
                audio_path=speaker_wav,
                gpt_cond_len=30,
                gpt_cond_chunk_len=4,
                max_ref_length=60,
            )
        except Exception as e:
            print("Speaker encoding error", str(e))
            gr.Warning(
                "It appears something wrong with reference, did you unmute your microphone?"
            )
            return _NO_OUTPUT

        # temporary comma fix: after a word or non-ASCII character, put a
        # space before ".", "。" or "?" and double that punctuation mark
        prompt = re.sub(r"([^\x00-\x7F]|\w)([.。?])", r"\1 \2\2", prompt)

        print("I: Generating new audio...")
        t0 = time.time()
        out = model.inference(
            prompt,
            language,
            gpt_cond_latent,
            speaker_embedding,
            repetition_penalty=5.0,
            temperature=0.75,
        )
        inference_time = time.time() - t0
        print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
        metrics_text += (
            f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
        )
        # generation time divided by the duration of the generated audio
        real_time_factor = (
            inference_time / out["wav"].shape[-1] * _OUTPUT_SAMPLE_RATE
        )
        print(f"Real-time factor (RTF): {real_time_factor}")
        metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
        # NOTE(review): every request writes the same "output.wav"; concurrent
        # requests would overwrite each other - confirm the queue runs one
        # request at a time.
        torchaudio.save(
            "output.wav",
            torch.tensor(out["wav"]).unsqueeze(0),
            _OUTPUT_SAMPLE_RATE,
        )

    except RuntimeError as e:
        if "device-side assert" in str(e):
            # cannot do anything on cuda device side error, need to restart
            print(
                f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
                flush=True,
            )
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
            print("Cuda device-assert Runtime encountered need restart")
            if not DEVICE_ASSERT_DETECTED:
                DEVICE_ASSERT_DETECTED = 1
                DEVICE_ASSERT_PROMPT = prompt
                DEVICE_ASSERT_LANG = language

            # just before restarting save what caused the issue so we can handle it in future
            # Uploading error data only happens for unrecoverable error
            try:
                _upload_error_report(
                    [
                        prompt,
                        language,
                        audio_file_pth,
                        mic_file_path,
                        use_mic,
                        voice_cleanup,
                        no_lang_auto_detect,
                        agree,
                    ],
                    speaker_wav,
                )
            except Exception as upload_error:
                # a failed upload must not prevent the restart below
                print("Failed to upload error report:", str(upload_error))

            # HF Space specific.. This error is unrecoverable need to restart space
            _restart_space()
        elif "Failed to decode" in str(e):
            print("Speaker encoding error", str(e))
            gr.Warning(
                "It appears something wrong with reference, did you unmute your microphone?"
            )
        else:
            print("RuntimeError: non device-side assert error:", str(e))
            gr.Warning("Something unexpected happened please retry again.")
        return _NO_OUTPUT

    return (
        gr.make_waveform(
            audio="output.wav",
        ),
        "output.wav",
        metrics_text,
        speaker_wav,
    )
409
+
410
+
411
title = "Coqui🐸 XTTS"

# Markdown rendered in the left column under the logo.
description = """

<br/>

This demo is currently running **XTTS v2.0.3** <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model. This demo features zero-shot voice cloning, however, you can fine-tune XTTS for better results. Leave a star 🌟 on Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.

<br/>

Supported languages: Arabic: ar, Brazilian Portuguese: pt , Mandarin Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu, Hindi: hi

<br/>
"""

# Markdown rendered in the right column: a two-column table of project links.
# Every row is closed with "|" (the Documentation row was missing it).
links = """
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />

| | |
| ------------------------------- | --------------------------------------- |
| 🐸💬 **CoquiTTS** | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) |
| 👩‍💻 **Questions** | [GitHub Discussions](https://github.com/coqui-ai/TTS/discussions) |
| 🗯 **Community** | [![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) |


"""

# License and data-collection notice.
article = """
<div style='margin:20px auto;'>
<p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
<p>We collect data only for error cases for improvement.</p>
</div>
"""
445
# One spec per example: (prompt text, language code, reference audio path,
# whether "Cleanup Reference Voice" is ticked).
_EXAMPLE_SPECS = (
    ("Once when I was six years old I saw a magnificent picture", "en", "examples/female.wav", False),
    ("Lorsque j'avais six ans j'ai vu, une fois, une magnifique image", "fr", "examples/male.wav", False),
    ("Als ich sechs war, sah ich einmal ein wunderbares Bild", "de", "examples/female.wav", False),
    ("Cuando tenía seis años, vi una vez una imagen magnífica", "es", "examples/male.wav", False),
    ("Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica", "pt", "examples/female.wav", False),
    ("Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek", "pl", "examples/male.wav", False),
    ("Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno", "it", "examples/female.wav", False),
    ("Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm", "tr", "examples/female.wav", False),
    ("Когда мне было шесть лет, я увидел однажды удивительную картинку", "ru", "examples/female.wav", False),
    ("Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat", "nl", "examples/male.wav", False),
    ("Když mi bylo šest let, viděl jsem jednou nádherný obrázek", "cs", "examples/female.wav", False),
    ("当我还只有六岁的时候, 看到了一副精彩的插画", "zh-cn", "examples/female.wav", False),
    ("かつて 六歳のとき、素晴らしい絵を見ました", "ja", "examples/female.wav", True),
    ("한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.", "ko", "examples/female.wav", True),
    ("Egyszer hat éves koromban láttam egy csodálatos képet", "hu", "examples/male.wav", True),
)

# Rows for gr.Examples. Each row is laid out as
# [text, language, reference audio, mic recording, use mic, cleanup,
#  disable language auto-detect, agree]; no mic recording, microphone off,
# auto-detect left on and terms accepted are the same for every example.
examples = [
    [text, lang, reference_wav, None, False, cleanup, False, True]
    for text, lang, reference_wav, cleanup in _EXAMPLE_SPECS
]
597
+
598
+
599
+
600
# Build the Gradio UI: header, description/links, the input form on the left,
# the outputs on the right, and the example rows underneath.
with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
                """
            )
        with gr.Column():
            # placeholder to align the image
            pass

    with gr.Row():
        with gr.Column():
            gr.Markdown(description)
        with gr.Column():
            gr.Markdown(links)

    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                # predict() rejects prompts longer than 1000 characters, so
                # the hint states that limit (it used to say 200).
                info="One or two sentences at a time is better. Up to 1000 text characters.",
                value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
            )
            language_gr = gr.Dropdown(
                label="Language",
                info="Select an output language for the synthesised speech",
                choices=[
                    "en",
                    "es",
                    "fr",
                    "de",
                    "it",
                    "pt",
                    "pl",
                    "tr",
                    "ru",
                    "nl",
                    "cs",
                    "ar",
                    "zh-cn",
                    "ja",
                    "ko",
                    "hu",
                    "hi",
                ],
                max_choices=1,
                value="en",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                info="Click on the ✎ button to upload your own target speaker audio",
                type="filepath",
                value="examples/female.wav",
            )
            mic_gr = gr.Audio(
                source="microphone",
                type="filepath",
                info="Use your microphone to record audio",
                label="Use Microphone for Reference",
            )
            use_mic_gr = gr.Checkbox(
                label="Use Microphone",
                value=False,
                info="Notice: Microphone input may not work properly under traffic",
            )
            clean_ref_gr = gr.Checkbox(
                label="Cleanup Reference Voice",
                value=False,
                info="This check can improve output if your microphone or reference voice is noisy",
            )
            auto_det_lang_gr = gr.Checkbox(
                label="Do not use language auto-detect",
                value=False,
                info="Check to disable language auto-detection",
            )
            tos_gr = gr.Checkbox(
                label="Agree",
                value=False,
                info="I agree to the terms of the CPML: https://coqui.ai/cpml",
            )

            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

        with gr.Column():
            video_gr = gr.Video(label="Waveform Visual")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            out_text_gr = gr.Text(label="Metrics")
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

    # Components in the order of predict()'s parameters and of its returned
    # tuple; shared by the examples and the Send button so they cannot drift.
    predict_inputs = [
        input_text_gr,
        language_gr,
        ref_gr,
        mic_gr,
        use_mic_gr,
        clean_ref_gr,
        auto_det_lang_gr,
        tos_gr,
    ]
    predict_outputs = [video_gr, audio_gr, out_text_gr, ref_audio_gr]

    with gr.Row():
        gr.Examples(
            examples,
            label="Examples",
            inputs=predict_inputs,
            outputs=predict_outputs,
            fn=predict,
            cache_examples=False,
        )

    tts_button.click(predict, predict_inputs, outputs=predict_outputs)

demo.queue()
demo.launch(debug=True, show_api=True)