barghavani committed
Commit 2ccb399
Parent: 57ff557

Update app.py

Files changed (1): app.py (+146, -69)
app.py CHANGED
@@ -1,73 +1,150 @@
-import os
-import tempfile
 import gradio as gr
 from TTS.api import TTS
-from TTS.utils.synthesizer import Synthesizer
-from huggingface_hub import hf_hub_download
-import json
-
-# Define constants
-MODEL_INFO = [
-    ["Xtts Persian", "best_model_110880.pth", "config.json", "saillab/xtts_v2_fa"],
-]
-
-# Extract model names from MODEL_INFO
-MODEL_NAMES = [info[0] for info in MODEL_INFO]
-
-MAX_TXT_LEN = 400
-TOKEN = os.getenv('HUGGING_FACE_HUB_TOKEN')
-
-model_files = {}
-config_files = {}
-
-# Create a dictionary to store synthesizer objects for each model
-synthesizers = {}
-
-# Download models and initialize synthesizers
-for info in MODEL_INFO:
-    model_name, model_file, config_file, repo_name = info[:4]
-
-    print(f"|> Downloading: {model_name}")
-
-    # Download model and config files
-    model_files[model_name] = hf_hub_download(repo_id=repo_name, filename=model_file, use_auth_token=TOKEN)
-    config_files[model_name] = hf_hub_download(repo_id=repo_name, filename=config_file, use_auth_token=TOKEN)
-
-    # Initialize synthesizer for the model
-    synthesizer = Synthesizer(
-        tts_checkpoint=model_files[model_name],
-        tts_config_path=config_files[model_name],
-        use_cuda=False
-    )
-
-    synthesizers[model_name] = synthesizer
-
-
-def synthesize(text: str, model_name: str) -> str:
-    if len(text) > MAX_TXT_LEN:
-        text = text[:MAX_TXT_LEN]
-        print(f"Input text was cut off as it exceeded the {MAX_TXT_LEN} character limit.")
-    synthesizer = synthesizers[model_name]
-    if synthesizer is None:
-        raise NameError("Model not found")
-    wavs = synthesizer.tts(text)
-
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
-        synthesizer.save_wav(wavs, fp)
-    return fp.name
-
-iface = gr.Interface(
-    fn=synthesize,
-    inputs=[
-        gr.Textbox(label="Enter Text to Synthesize:", value="زین همرهان سست عناصر، دلم گرفت."),
-        gr.Radio(label="Pick a Model", choices=MODEL_NAMES, value=MODEL_NAMES[0], type="value"),
-    ],
-    outputs=gr.Audio(label="Output", type='filepath'),
-    examples=[["زین همرهان سست عناصر، دلم گرفت.", MODEL_NAMES[0]]],  # Example should include a speaker name for multispeaker models
-    title='Persian TTS Playground',
-    description="",
-    article="",
-    live=False
 )

-iface.launch()
+import sys
+import io, os, stat
+import subprocess
+import random
+from zipfile import ZipFile
+import uuid
+
+import time
+import torch
+import torchaudio
+os.environ["COQUI_TOS_AGREED"] = "1"
+
+import langid
+
+import base64
+import csv
+from io import StringIO
+import datetime
+
 import gradio as gr
+from scipy.io.wavfile import write
+from pydub import AudioSegment
+
 from TTS.api import TTS
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+from TTS.utils.generic_utils import get_user_data_dir
+
+HF_TOKEN = os.environ.get("HUGGING_FACE_HUB_TOKEN")
+
+from huggingface_hub import HfApi
+
+api = HfApi(token=HF_TOKEN)
+repo_id = "saillab/xtts-streaming"
+print("Extracting a newer ffmpeg binary for the denoise filter")
+ZipFile("ffmpeg.zip").extractall()
+print("Making the ffmpeg binary executable")
+st = os.stat('ffmpeg')
+os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
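The added lines never actually invoke the extracted binary, so the exact filter chain is unknown. A hedged sketch of how the "Cleanup Reference Voice" option might call it (the filter chain and file naming are assumptions; `anlmdn` is ffmpeg's non-local-means denoiser):

    def clean_reference(speaker_wav: str) -> str:
        # Hypothetical denoise pass over the reference clip using the
        # bundled ffmpeg binary extracted above.
        out_wav = f"cleaned_{uuid.uuid4()}.wav"
        subprocess.run(
            ["./ffmpeg", "-y", "-i", speaker_wav,
             "-af", "highpass=f=50,lowpass=f=8000,anlmdn=s=0.0001",
             out_wav],
            check=True,
        )
        return out_wav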
+
+print("Downloading XTTS model (if not already cached)")
+from TTS.utils.manage import ModelManager
+model_name = "saillab/xtts_v2_fa"
+ModelManager().download_model(model_name)
+model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
+print("XTTS downloaded")
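`ModelManager.download_model` is aimed at names from the Coqui model catalog (e.g. `tts_models/...`); whether it accepts a bare Hugging Face repo id like `saillab/xtts_v2_fa` depends on the TTS version. If it refuses, one alternative sketch is to fetch the repo directly (keeping the `model_path` the rest of the script expects is an assumption):

    from huggingface_hub import snapshot_download

    # Hypothetical fallback: download the whole HF repo and point the
    # loader at wherever it landed. Older huggingface_hub versions use
    # use_auth_token= instead of token=.
    model_path = snapshot_download(repo_id="saillab/xtts_v2_fa", token=HF_TOKEN)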
+
+config = XttsConfig()
+config.load_json(os.path.join(model_path, "config.json"))
+model = Xtts.init_from_config(config)
+model.load_checkpoint(
+    config,
+    checkpoint_path=os.path.join(model_path, "model.pth"),
+    vocab_path=os.path.join(model_path, "vocab.json"),
+    eval=True,
+    use_deepspeed=True,
 )
+model.cuda()
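`model.cuda()` will raise on a CPU-only Space, and `use_deepspeed=True` likewise assumes a GPU. A defensive variant, assuming nothing about the host beyond what `torch` reports:

    # Fall back to CPU when no GPU is visible; use_deepspeed would also
    # need to be disabled in load_checkpoint on CPU-only hosts.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)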
+
+supported_languages = ["fa"]
+
+title = "XTTS Persian"
+
+description = """
+<div>
+<a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
+<a style='display:inline-block' href='https://discord.gg/5eXr5seRrv'><img src='https://discord.com/api/guilds/1037326658807533628/widget.png?style=shield' /></a>
+<a href="https://huggingface.co/spaces/coqui/xtts-streaming?duplicate=true">
+<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+</div>
+<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
+<a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a voice generation model that lets you clone voices into different languages from just a quick six-second audio clip.
+<br/>
+XTTS builds on earlier work such as Tortoise, with architectural changes and additional training that make cross-language voice cloning and multilingual speech generation possible.
+<br/>
+This is the same model that powers the creator application <a href="https://coqui.ai">Coqui Studio</a> and the <a href="https://docs.coqui.ai">Coqui API</a>; in production, modifications are applied to enable low-latency streaming.
+<br/>
+Leave a star on <a href="https://github.com/UNHSAILLab/Persian-TTS">GitHub 🐸TTS</a>, where the open-source inference and training code lives.
+<br/>
+<p>For faster inference without waiting in the queue, duplicate this Space and upgrade to a GPU via the settings.
+<br/>
+</p>
+<p>Language selectors:
+Persian: fa
+</p>
+<p>Notice: autoplay may not work on mobile. If you see a black waveform image, tap it; your audio is there.</p>
+<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8946ef36-c454-4a8e-a9c9-8a8dd735fabd" />
+"""
+
+article = """
+<div style='margin:20px auto;'>
+<p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
+<p>We collect data only for error cases, to help improve the model.</p>
+</div>
+"""
+
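The interface below passes `fn=predict`, but no `predict` is defined anywhere in the 150 lines this commit adds, so the Space would fail with a NameError as committed. A minimal hypothetical sketch, assuming the XTTS v2 inference API and the input/output signature the interface declares (everything in it is an assumption, not the author's code):

    def predict(prompt, language, audio_file_pth, mic_file_path, use_mic,
                voice_cleanup, no_lang_auto_detect, agree):
        # Hypothetical implementation; `predict` is missing from the diff.
        if not agree:
            raise gr.Error("Please accept the terms of the Coqui Public Model License first.")
        speaker_wav = mic_file_path if (use_mic and mic_file_path) else audio_file_pth
        if voice_cleanup:
            speaker_wav = clean_reference(speaker_wav)  # hypothetical helper sketched above

        t0 = time.time()
        # XTTS v2 API: derive speaker conditioning from the reference clip...
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
        # ...then synthesise; out["wav"] is a 24 kHz waveform array.
        out = model.inference(prompt, language, gpt_cond_latent, speaker_embedding)
        torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)

        metrics = f"Generated in {time.time() - t0:.2f} s"
        return gr.make_waveform(audio="output.wav"), "output.wav", metrics, speaker_wav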
+gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Textbox(
+            label="Text Prompt",
+            info="One or two sentences at a time works best. Up to 200 characters.",
+            value="زین همرهان سست عناصر، دلم گرفت.",
+        ),
+        gr.Dropdown(
+            label="Language",
+            info="Select an output language for the synthesised speech",
+            choices=supported_languages,
+            max_choices=1,
+            value=supported_languages[0],
+        ),
+        gr.Audio(
+            label="Reference Audio",
+            info="Click on the ✎ button to upload your own target speaker audio",
+            type="filepath",
+            value="examples/female.wav",
+        ),
+        gr.Audio(
+            source="microphone",
+            type="filepath",
+            info="Use your microphone to record audio",
+            label="Use Microphone for Reference",
+        ),
+        gr.Checkbox(
+            label="Use Microphone",
+            value=False,
+            info="Notice: Microphone input may not work properly under heavy traffic",
+        ),
+        gr.Checkbox(
+            label="Cleanup Reference Voice",
+            value=False,
+            info="This can improve output if your microphone or reference voice is noisy",
+        ),
+        gr.Checkbox(
+            label="Do not use language auto-detect",
+            value=False,
+            info="Check to disable language auto-detection",
+        ),
+        gr.Checkbox(
+            label="Agree",
+            value=False,
+            info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
+        ),
+    ],
+    outputs=[
+        gr.Video(label="Waveform Visual"),
+        gr.Audio(label="Synthesised Audio", streaming=True, autoplay=True),
+        gr.Text(label="Metrics"),
+        gr.Audio(label="Reference Audio Used"),
+    ],
+    title=title,
+    description=description,
+    article=article,
+    examples=[],
+    cache_examples=False,
+).queue().launch(debug=True, show_api=True)
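One last loose end: `langid` is imported at the top of the new file but never used in the added lines, and the "Do not use language auto-detect" checkbox is likewise unwired. A hypothetical guard that would put both to work from inside `predict` (names follow the sketch above; `langid.classify` returns a `(language_code, score)` pair):

    def detect_language(prompt: str, no_auto_detect: bool = False) -> str:
        # Keep only the predicted language code.
        language_predicted = langid.classify(prompt)[0].strip()
        if language_predicted == "zh":
            language_predicted = "zh-cn"  # XTTS expects "zh-cn" rather than "zh"
        if not no_auto_detect and language_predicted not in supported_languages:
            raise gr.Error(
                f"Detected language '{language_predicted}' is not in {supported_languages}."
            )
        return language_predicted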