Pendrokar committed on
Commit
74e078c
•
1 Parent(s): 9230f47

import gr_client

Browse files
Files changed (2) hide show
  1. app.py +105 -513
  2. gr_client.py +257 -250
app.py CHANGED
@@ -1,16 +1,14 @@
1
  import os
2
  import sys
3
- import time
4
  import requests
5
  import json
6
- from subprocess import Popen, PIPE
7
- import threading
8
  from huggingface_hub import HfApi
9
- import gradio as gr
10
 
11
  # start xVASynth service (no HTTP)
12
  import resources.app.no_server as xvaserver
13
 
 
 
14
  # model
15
  hf_model_name = "Pendrokar/xvapitch_nvidia"
16
  model_repo = HfApi()
@@ -19,117 +17,9 @@ latest_commit_sha = commits[0].commit_id
19
  hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
20
  models_path = hf_cache_models_path
21
 
22
- # ordered from most emotional and respects pauses to ones that do the least
23
- voice_models = [
24
- ("👨‍🦳 #6671", "ccby_nvidia_hifi_6671_M"),
25
- ("👱‍♀️ 🇬🇧 #92", "ccby_nvidia_hifi_92_F"),
26
- ("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
27
- ("Male #9017", "ccby_nvidia_hifi_9017_M"),
28
- ("Male #6097", "ccby_nvidia_hifi_6097_M"),
29
- ("👩‍🦱 #12787", "ccby_nvidia_hifi_12787_F"),
30
- ("👵 #11614", "ccby_nv_hifi_11614_F"),
31
- ("Female #8051", "ccby_nvidia_hifi_8051_F"),
32
- ("👩‍🦳 #11697", "ccby_nvidia_hifi_11697_F"),
33
- ("Female #9136", "ccby_nvidia_hifi_9136_F"),
34
- ]
35
-
36
  current_voice_model = None
37
  base_speaker_emb = ''
38
 
39
- # order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
40
- languages = [
41
- ("🇺🇸 EN", "en"),
42
- ("🇩🇪 DE", "de"),
43
- ("🇪🇸 ES", "es"),
44
- ("🇮🇹 IT", "it"),
45
- ("🇳🇱 NL", "nl"),
46
- ("🇧🇷 PT", "pt"),
47
- ("🇵🇱 PL", "pl"),
48
- ("🇷🇴 RO", "ro"),
49
- ("🇸🇪 SV", "sv"),
50
- ("🇩🇰 DA", "da"),
51
- ("🇫🇮 FI", "fi"),
52
- ("🇭🇺 HU", "hu"),
53
- ("🇬🇷 EL", "el"),
54
- ("🇫🇷 FR", "fr"),
55
- ("🇷🇺 RU", "ru"),
56
- ("🇺🇦 UA", "uk"),
57
- ("🇹🇷 TR", "tr"),
58
- ("🇸🇦 AR", "ar"),
59
- ("🇮🇳 HI", "hi"),
60
- ("🇯🇵 JP", "jp"),
61
- ("🇰🇷 KO", "ko"),
62
- ("🇨🇳 ZH", "zh"),
63
- ("🇻🇳 VI", "vi"),
64
- ("🇻🇦 LA", "la"),
65
- ("🇳🇬 YO", "yo"),
66
- ("Swahili", "sw"),
67
- ("Hausa", "ha"),
68
- ("Wolof", "wo"),
69
- ]
70
-
71
- # Translated from English by DeepMind's Gemini Pro
72
- default_text = {
73
- "ar": "هذا هو صوتي.",
74
- "da": "Sådan lyder min stemme.",
75
- "de": "So klingt meine Stimme.",
76
- "el": "Έτσι ακούγεται η φωνή μου.",
77
- "en": "This is what my voice sounds like.",
78
- "es": "Así suena mi voz.",
79
- "fi": "Näin ääneni kuulostaa.",
80
- "fr": "Voici à quoi ressemble ma voix.",
81
- "ha": "Wannan ne muryata ke.",
82
- "hi": "यह मेरी आवाज़ कैसी लगती है।",
83
- "hu": "Így hangzik a hangom.",
84
- "it": "Così suona la mia voce.",
85
- "jp": "これが私の声です。",
86
- "ko": "여기 제 목소리가 어떤지 들어보세요.",
87
- "la": "Haec est vox mea sonans.",
88
- "nl": "Dit is hoe mijn stem klinkt.",
89
- "pl": "Tak brzmi mój głos.",
90
- "pt": "É assim que minha voz soa.",
91
- "ro": "Așa sună vocea mea.",
92
- "ru": "Вот как звучит мой голос.",
93
- "sv": "Såhär låter min röst.",
94
- "sw": "Baba, yetu, yetu, uliye. Mbinguni, yetu, yetu. Amiiinaa!!", #civ4
95
- "tr": "Benim sesimin sesi böyle.",
96
- "uk": "Ось як звучить мій голос.",
97
- "vi": "Đây là giọng nói của tôi.",
98
- "wo": "Ndox li neen xewnaal ma.",
99
- "yo": "Ìyí ni ohùn mi ńlá.",
100
- "zh": "这是我的声音。",
101
- }
102
-
103
- def run_xvaserver():
104
- # start the process without waiting for a response
105
- print('Running xVAServer subprocess...\n')
106
- xvaserver = Popen(['python', f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/server.py'], stdout=PIPE, stderr=PIPE, cwd=f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/')
107
-
108
- # Wait for a moment to ensure the server starts up
109
- time.sleep(10)
110
-
111
- # Check if the server is running
112
- if xvaserver.poll() is not None:
113
- print("Web server failed to start.")
114
- sys.exit(0)
115
-
116
- # contact local xVASynth server
117
- print('Attempting to connect to xVASynth...')
118
- try:
119
- response = requests.get('http://0.0.0.0:8008')
120
- response.raise_for_status() # If the response contains an HTTP error status code, raise an exception
121
- except requests.exceptions.RequestException as err:
122
- print('Failed to connect!')
123
- return
124
-
125
- print('xVAServer running on port 8008')
126
-
127
- # load default model
128
- load_model("ccby_nvidia_hifi_6671_M")
129
-
130
- # Wait for the process to exit
131
- xvaserver.wait()
132
-
133
  def load_model(voice_model_name):
134
  model_path = models_path + voice_model_name
135
 
@@ -160,413 +50,115 @@ def load_model(voice_model_name):
160
 
161
  return embs
162
 
163
- def predict(
164
- input_text,
165
- voice,
166
- lang,
167
- pacing,
168
- pitch,
169
- energy,
170
- anger,
171
- happy,
172
- sad,
173
- surprise,
174
- use_deepmoji
175
- ):
176
- # grab only the first 1000 characters
177
- input_text = input_text[:1000]
178
-
179
- # load voice model if not the current model
180
- if (current_voice_model != voice):
181
- base_speaker_emb = load_model(voice)
182
 
183
- model_type = 'xVAPitch'
184
- pace = pacing if pacing else 1.0
185
- save_path = '/tmp/xvapitch_audio_sample.wav'
186
- language = lang
187
- use_sr = 0
188
- use_cleanup = 0
189
-
190
- pluginsContext = {}
191
- pluginsContext["mantella_settings"] = {
192
- "emAngry": (anger if anger > 0 else 0),
193
- "emHappy": (happy if happy > 0 else 0),
194
- "emSad": (sad if sad > 0 else 0),
195
- "emSurprise": (surprise if surprise > 0 else 0),
196
- "run_model": use_deepmoji
197
- }
198
-
199
-
200
- data = {
201
- 'pluginsContext': json.dumps(pluginsContext),
202
- 'modelType': model_type,
203
- # pad with whitespaces as a workaround to avoid cutoffs
204
- 'sequence': input_text.center(len(input_text) + 2, ' '),
205
- 'pace': pace,
206
- 'outfile': save_path,
207
- 'vocoder': 'n/a',
208
- 'base_lang': language,
209
- 'base_emb': base_speaker_emb,
210
- 'useSR': use_sr,
211
- 'useCleanup': use_cleanup,
212
- }
213
-
214
- print('Synthesizing...')
215
- try:
216
- json_data = xvaserver.synthesize(data)
217
- # response = requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
218
- # response.raise_for_status() # If the response contains an HTTP error status code, raise an exception
219
- # json_data = json.loads(response.text)
220
- except requests.exceptions.RequestException as err:
221
- print('FAILED to synthesize: {err}')
222
- save_path = ''
223
- response = {'text': '{"message": "Failed"}'}
224
- json_data = {
225
- 'arpabet': ['Failed'],
226
- 'durations': [0],
227
- 'em_anger': anger,
228
- 'em_happy': happy,
229
- 'em_sad': sad,
230
- 'em_surprise': surprise,
231
  }
232
 
233
- # print('server.log contents:')
234
- # with open('resources/app/server.log', 'r') as f:
235
- # print(f.read())
236
-
237
- arpabet_html = '<h6>ARPAbet & Phoneme lengths</h6>'
238
- arpabet_symbols = json_data['arpabet'].split('|')
239
- utter_time = 0
240
- for symb_i in range(len(json_data['durations'])):
241
- # skip PAD symbol
242
- if (arpabet_symbols[symb_i] == '<PAD>'):
243
- continue
244
-
245
- length = float(json_data['durations'][symb_i])
246
- arpa_length = str(round(length/2, 1))
247
- arpabet_html += '<strong\
248
- class="arpabet"\
249
- style="padding: 0 '\
250
- + str(arpa_length)\
251
- +'em"'\
252
- +f" title=\"{utter_time} + {length}\""\
253
- +'>'\
254
- + arpabet_symbols[symb_i]\
255
- + '</strong> '
256
- utter_time += round(length, 1)
257
 
258
- return [
259
- save_path,
260
- arpabet_html,
261
- round(json_data['em_angry'][0], 2),
262
- round(json_data['em_happy'][0], 2),
263
- round(json_data['em_sad'][0], 2),
264
- round(json_data['em_surprise'][0], 2),
265
- json_data
266
- ]
267
-
268
- input_textbox = gr.Textbox(
269
- label="Input Text",
270
- value="This is what my voice sounds like.",
271
- info="Also accepts ARPAbet symbols placed within {} brackets.",
272
- lines=1,
273
- max_lines=5,
274
- autofocus=True
275
- )
276
- pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
277
- pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
278
- energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
279
- anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
280
- happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
281
- sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
282
- surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness with either being beyond 0.3")
283
- voice_radio = gr.Radio(
284
- voice_models,
285
- value="ccby_nvidia_hifi_6671_M",
286
- label="Voice",
287
- info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
288
- )
289
-
290
- def set_default_text(lang, deepmoji_checked):
291
- # DeepMoji only works on English Text
292
- # checkbox_enabled = True
293
- # if lang != 'en':
294
- # checkbox_enabled = False
295
-
296
- if lang == 'en':
297
- checkbox_enabled = gr.Checkbox(
298
- label="Use DeepMoji",
299
- info="Auto adjust emotional values",
300
- value=deepmoji_checked,
301
- interactive=True
302
- )
303
- else:
304
- checkbox_enabled = gr.Checkbox(
305
- label="Use DeepMoji",
306
- info="Works only with English!",
307
- value=False,
308
- interactive=False
309
- )
310
-
311
- return default_text[lang], checkbox_enabled # Return the modified textbox (important for Blocks)
312
-
313
- en_examples = [
314
- "This is what my voice sounds like.",
315
- "If there is anything else you need, feel free to ask.",
316
- "Amazing! Could you do that again?",
317
- "Why, I would be more than happy to help you!",
318
- "That was unexpected.",
319
- "How dare you! . You have no right.",
320
- "Ahh, well, you see. There is more to it.",
321
- "I can't believe she is gone.",
322
- "Stay out of my way!!!",
323
- # ARPAbet example
324
- "This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
325
- ]
326
-
327
- def set_example_as_input(example_text):
328
- return example_text
329
-
330
- def reset_em_sliders(
331
- deepmoji_enabled,
332
- anger,
333
- happy,
334
- sad,
335
- surprise
336
- ):
337
- if (deepmoji_enabled):
338
- return (0, 0, 0, 0)
339
- else:
340
- return (
341
- anger,
342
- happy,
343
- sad,
344
- surprise
345
- )
346
-
347
- def set_default_audio(voice_id):
348
- return models_path + voice_id + '.wav'
349
-
350
- def toggle_deepmoji(
351
- checked,
352
- anger,
353
- happy,
354
- sad,
355
- surprise
356
- ):
357
- if checked:
358
- return (0, 0, 0, 0)
359
- else:
360
- return (
361
- anger,
362
- happy,
363
- sad,
364
- surprise
365
- )
366
-
367
- language_radio = gr.Radio(
368
- languages,
369
- value="en",
370
- label="Language",
371
- info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
372
- )
373
-
374
- _DESCRIPTION = '''
375
- <div>
376
- <a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
377
- <a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.3k-blue?logo=nexusmods'/></a>
378
- <a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
379
- <span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
380
- </div>
381
- '''
382
-
383
- with gr.Blocks(css=".arpabet {display: inline-block; background-color: gray; border-radius: 5px; font-size: 120%; margin: 0.1em 0}") as demo:
384
- gr.Markdown("# xVASynth TTS")
385
-
386
- gr.HTML(label="description", value=_DESCRIPTION)
387
-
388
- with gr.Row(): # Main row for inputs and language selection
389
- with gr.Column(): # Input column
390
- input_textbox = gr.Textbox(
391
- label="Input Text",
392
- value="This is what my voice sounds like.",
393
- info="Also accepts ARPAbet symbols placed within {} brackets.",
394
- lines=1,
395
- max_lines=5,
396
- autofocus=True
397
- )
398
- language_radio = gr.Radio(
399
- languages,
400
- value="en",
401
- label="Language",
402
- info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
403
- )
404
- with gr.Row():
405
- with gr.Column():
406
- en_examples_dropdown = gr.Dropdown(
407
- en_examples,
408
- value=en_examples[0],
409
- label="Example dropdown",
410
- show_label=False,
411
- info="English Examples",
412
- visible=(language_radio.value == 'en')
413
- )
414
- with gr.Column():
415
- pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
416
- with gr.Column(): # Control column
417
- voice_radio = gr.Radio(
418
- voice_models,
419
- value="ccby_nvidia_hifi_6671_M",
420
- label="Voice",
421
- info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
422
- )
423
- pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
424
- energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
425
- with gr.Row(): # Main row for inputs and language selection
426
- with gr.Column(): # Input column
427
- anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
428
- sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
429
- with gr.Column(): # Input column
430
- happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
431
- surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Can oversaturate Happiness")
432
- deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values", value=True)
433
-
434
- # Event handling using click
435
- btn = gr.Button("Generate", variant="primary")
436
-
437
- with gr.Row(): # Main row for inputs and language selection
438
- with gr.Column(): # Input column
439
- output_wav = gr.Audio(
440
- label="22kHz audio output (autoplay enabled)",
441
- type="filepath",
442
- editable=False,
443
- autoplay=True
444
- )
445
- with gr.Column(): # Input column
446
- output_arpabet = gr.HTML(label="ARPAbet")
447
-
448
- btn.click(
449
- fn=predict,
450
- inputs=[
451
- input_textbox,
452
- voice_radio,
453
- language_radio,
454
- pacing_slider,
455
- pitch_slider,
456
- energy_slider,
457
- anger_slider,
458
- happy_slider,
459
- sad_slider,
460
- surprise_slider,
461
- deepmoji_checkbox
462
- ],
463
- outputs=[
464
- output_wav,
465
- output_arpabet,
466
- anger_slider,
467
- happy_slider,
468
- sad_slider,
469
- surprise_slider,
470
- # xVAServer response
471
- gr.Textbox(visible=False)
472
- ]
473
- )
474
- input_textbox.submit(
475
- fn=predict,
476
- inputs=[
477
- input_textbox,
478
- voice_radio,
479
- language_radio,
480
- pacing_slider,
481
- pitch_slider,
482
- energy_slider,
483
- anger_slider,
484
- happy_slider,
485
- sad_slider,
486
- surprise_slider,
487
- deepmoji_checkbox
488
- ],
489
- outputs=[
490
- output_wav,
491
- output_arpabet,
492
- anger_slider,
493
- happy_slider,
494
- sad_slider,
495
- surprise_slider,
496
- # xVAServer response
497
- gr.Textbox(visible=False)
498
- ]
499
- )
500
-
501
- language_radio.change(
502
- set_default_text,
503
- inputs=[language_radio, deepmoji_checkbox],
504
- outputs=[input_textbox, deepmoji_checkbox]
505
- )
506
-
507
- en_examples_dropdown.change(
508
- set_example_as_input,
509
- inputs=[en_examples_dropdown],
510
- outputs=[input_textbox]
511
- )
512
-
513
- deepmoji_checkbox.change(
514
- toggle_deepmoji,
515
- inputs=[
516
- deepmoji_checkbox,
517
- anger_slider,
518
- happy_slider,
519
- sad_slider,
520
- surprise_slider
521
- ],
522
- outputs=[
523
- anger_slider,
524
- happy_slider,
525
- sad_slider,
526
- surprise_slider
527
- ]
528
- )
529
-
530
- input_textbox.change(
531
- reset_em_sliders,
532
- inputs=[
533
- deepmoji_checkbox,
534
- anger_slider,
535
- happy_slider,
536
- sad_slider,
537
- surprise_slider
538
- ],
539
- outputs=[
540
- anger_slider,
541
- happy_slider,
542
- sad_slider,
543
- surprise_slider
544
- ]
545
- )
546
 
547
- voice_radio.change(
548
- reset_em_sliders,
549
- inputs=[
550
- deepmoji_checkbox,
551
- anger_slider,
552
- happy_slider,
553
- sad_slider,
554
- surprise_slider
555
- ],
556
- outputs=[
557
- anger_slider,
558
- happy_slider,
559
- sad_slider,
560
- surprise_slider
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
561
  ]
562
- )
563
-
564
- voice_radio.change(
565
- set_default_audio,
566
- inputs=voice_radio,
567
- outputs=output_wav
568
- )
569
 
570
  if __name__ == "__main__":
571
  print('running custom Gradio interface')
572
- demo.launch()
 
 
1
  import os
2
  import sys
 
3
  import requests
4
  import json
 
 
5
  from huggingface_hub import HfApi
 
6
 
7
  # start xVASynth service (no HTTP)
8
  import resources.app.no_server as xvaserver
9
 
10
+ from gr_client import BlocksDemo
11
+
12
  # model
13
  hf_model_name = "Pendrokar/xvapitch_nvidia"
14
  model_repo = HfApi()
 
17
  hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
18
  models_path = hf_cache_models_path
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  current_voice_model = None
21
  base_speaker_emb = ''
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def load_model(voice_model_name):
24
  model_path = models_path + voice_model_name
25
 
 
50
 
51
  return embs
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ class LocalBlocksDemo(BlocksDemo):
55
+ def predict(
56
+ self,
57
+ input_text,
58
+ voice,
59
+ lang,
60
+ pacing,
61
+ pitch,
62
+ energy,
63
+ anger,
64
+ happy,
65
+ sad,
66
+ surprise,
67
+ use_deepmoji
68
+ ):
69
+ # grab only the first 1000 characters
70
+ input_text = input_text[:1000]
71
+
72
+ # load voice model if not the current model
73
+ if (current_voice_model != voice):
74
+ base_speaker_emb = load_model(voice)
75
+
76
+ model_type = 'xVAPitch'
77
+ pace = pacing if pacing else 1.0
78
+ save_path = '/tmp/xvapitch_audio_sample.wav'
79
+ language = lang
80
+ use_sr = 0
81
+ use_cleanup = 0
82
+
83
+ pluginsContext = {}
84
+ pluginsContext["mantella_settings"] = {
85
+ "emAngry": (anger if anger > 0 else 0),
86
+ "emHappy": (happy if happy > 0 else 0),
87
+ "emSad": (sad if sad > 0 else 0),
88
+ "emSurprise": (surprise if surprise > 0 else 0),
89
+ "run_model": use_deepmoji
 
 
 
 
 
 
 
 
 
 
 
 
90
  }
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
+ data = {
94
+ 'pluginsContext': json.dumps(pluginsContext),
95
+ 'modelType': model_type,
96
+ # pad with whitespaces as a workaround to avoid cutoffs
97
+ 'sequence': input_text.center(len(input_text) + 2, ' '),
98
+ 'pace': pace,
99
+ 'outfile': save_path,
100
+ 'vocoder': 'n/a',
101
+ 'base_lang': language,
102
+ 'base_emb': base_speaker_emb,
103
+ 'useSR': use_sr,
104
+ 'useCleanup': use_cleanup,
105
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
+ print('Synthesizing...')
108
+ try:
109
+ json_data = xvaserver.synthesize(data)
110
+ # response = requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
111
+ # response.raise_for_status() # If the response contains an HTTP error status code, raise an exception
112
+ # json_data = json.loads(response.text)
113
+ except requests.exceptions.RequestException as err:
114
+ print('FAILED to synthesize: {err}')
115
+ save_path = ''
116
+ response = {'text': '{"message": "Failed"}'}
117
+ json_data = {
118
+ 'arpabet': ['Failed'],
119
+ 'durations': [0],
120
+ 'em_anger': anger,
121
+ 'em_happy': happy,
122
+ 'em_sad': sad,
123
+ 'em_surprise': surprise,
124
+ }
125
+
126
+ # print('server.log contents:')
127
+ # with open('resources/app/server.log', 'r') as f:
128
+ # print(f.read())
129
+
130
+ arpabet_html = '<h6>ARPAbet & Phoneme lengths</h6>'
131
+ arpabet_symbols = json_data['arpabet'].split('|')
132
+ utter_time = 0
133
+ for symb_i in range(len(json_data['durations'])):
134
+ # skip PAD symbol
135
+ if (arpabet_symbols[symb_i] == '<PAD>'):
136
+ continue
137
+
138
+ length = float(json_data['durations'][symb_i])
139
+ arpa_length = str(round(length/2, 1))
140
+ arpabet_html += '<strong\
141
+ class="arpabet"\
142
+ style="padding: 0 '\
143
+ + str(arpa_length)\
144
+ +'em"'\
145
+ +f" title=\"{utter_time} + {length}\""\
146
+ +'>'\
147
+ + arpabet_symbols[symb_i]\
148
+ + '</strong> '
149
+ utter_time += round(length, 1)
150
+
151
+ return [
152
+ save_path,
153
+ arpabet_html,
154
+ round(json_data['em_angry'][0], 2),
155
+ round(json_data['em_happy'][0], 2),
156
+ round(json_data['em_sad'][0], 2),
157
+ round(json_data['em_surprise'][0], 2),
158
+ json_data
159
  ]
 
 
 
 
 
 
 
160
 
161
  if __name__ == "__main__":
162
  print('running custom Gradio interface')
163
+ demo = LocalBlocksDemo()
164
+ demo.block.launch()
gr_client.py CHANGED
@@ -1,9 +1,5 @@
1
  import os
2
- import sys
3
- import time
4
- import requests
5
  import json
6
- from huggingface_hub import hf_hub_download
7
  import gradio as gr
8
  from gradio_client import Client
9
 
@@ -21,7 +17,6 @@ voice_models_more = [
21
  ("👩‍🦳 #11697", "ccby_nvidia_hifi_11697_F"),
22
  ("Female #9136", "ccby_nvidia_hifi_9136_F"),
23
  ]
24
- current_voice_model = None
25
 
26
  # order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
27
  languages = [
@@ -89,71 +84,6 @@ default_text = {
89
  "zh": "这是我的声音。",
90
  }
91
 
92
-
93
- def predict(
94
- input_text,
95
- voice,
96
- lang,
97
- pacing,
98
- pitch,
99
- energy,
100
- anger,
101
- happy,
102
- sad,
103
- surprise,
104
- deepmoji_checked
105
- ):
106
- wav_path, arpabet_html, angry, happy, sad, surprise, response = client.predict(
107
- input_text, # str in 'Input Text' Textbox component
108
- voice, # Literal['ccby_nvidia_hifi_6670_M', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_12787_F', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_6671_M', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_9136_F', 'ccby_nvidia_hifi_92_F'] in 'Voice' Radio component
109
- lang, # Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp'] in 'Language' Radio component
110
- pacing, # float (numeric value between 0.5 and 2.0) in 'Duration' Slider component
111
- pitch, # float (numeric value between 0 and 1.0) in 'Pitch' Slider component
112
- energy, # float (numeric value between 0.1 and 1.0) in 'Energy' Slider component
113
- anger, # float (numeric value between 0 and 1.0) in '😠 Anger' Slider component
114
- happy, # float (numeric value between 0 and 1.0) in '😃 Happiness' Slider component
115
- sad, # float (numeric value between 0 and 1.0) in '😭 Sadness' Slider component
116
- surprise, # float (numeric value between 0 and 1.0) in '😮 Surprise' Slider component
117
- deepmoji_checked, # bool
118
- api_name="/predict"
119
- )
120
-
121
- json_data = json.loads(response.replace("'", '"'))
122
-
123
- arpabet_html = '<h6>ARPAbet & Durations</h6>'
124
- arpabet_html += '<table style="margin: 0 var(--size-2)"><tbody><tr>'
125
- arpabet_nopad = json_data['arpabet'].split('|PAD|')
126
- arpabet_symbols = json_data['arpabet'].split('|')
127
- wpad_len = len(arpabet_symbols)
128
- nopad_len = len(arpabet_nopad)
129
- total_dur_length = 0
130
- for symb_i in range(wpad_len):
131
- if (arpabet_symbols[symb_i] == '<PAD>'):
132
- continue
133
- total_dur_length += float(json_data['durations'][symb_i])
134
-
135
- for symb_i in range(wpad_len):
136
- if (arpabet_symbols[symb_i] == '<PAD>'):
137
- continue
138
-
139
- arpabet_length = float(json_data['durations'][symb_i])
140
- cell_width = round(arpabet_length / total_dur_length * 100, 2)
141
- arpabet_html += '<td class="arpabet" style="width: '\
142
- + str(cell_width)\
143
- +'%">'\
144
- + arpabet_symbols[symb_i]\
145
- + '</td> '
146
- arpabet_html += '<tr></tbody></table>'
147
-
148
- return [
149
- wav_path,
150
- arpabet_html,
151
- round(json_data['em_angry'][0], 2),
152
- round(json_data['em_happy'][0], 2),
153
- round(json_data['em_sad'][0], 2),
154
- round(json_data['em_surprise'][0], 2)
155
- ]
156
-
157
  # Component defaults
158
  input_textbox_init = {
159
  'label': "Input Text",
@@ -232,7 +162,7 @@ deepmoji_checkbox_init = {
232
  }
233
 
234
  def more_lang_options(lang):
235
- print('more_lang_options')
236
  if lang != 'more':
237
  return lang
238
 
@@ -241,10 +171,14 @@ def more_lang_options(lang):
241
  return gr.Radio(**radio_init)
242
 
243
  def set_default_text(lang, deepmoji_checked):
244
- print('set_default_text')
 
245
  if lang == 'more':
246
- return default_text['en'], deepmoji_checked
247
- # return gr.Textbox(**input_textbox_init), deepmoji_checked
 
 
 
248
 
249
  # DeepMoji only works on English Text
250
  checkbox_init = {**deepmoji_checkbox_init}
@@ -257,9 +191,9 @@ def set_default_text(lang, deepmoji_checked):
257
  # checkbox_init['info'] = "Works only with English!",
258
  # checkbox_init['value'] = False,
259
  # checkbox_init['interactive'] = False
 
260
 
261
- # checkbox_enabled = gr.Checkbox(**checkbox_init)
262
- return default_text[lang], deepmoji_checked
263
 
264
  # examples component
265
  en_examples = [
@@ -285,11 +219,11 @@ en_examples_dropdown_init = {
285
  }
286
 
287
  def set_example_as_input(example_text):
288
- print('set_example_as_input')
289
  return example_text
290
 
291
  def toggle_example_dropdown(lang):
292
- print('toggle_example_dropdown')
293
  dropdown_init = {**en_examples_dropdown_init}
294
  if lang == 'en':
295
  dropdown_init['visible'] = True
@@ -299,7 +233,7 @@ def toggle_example_dropdown(lang):
299
  return gr.Dropdown(**dropdown_init)
300
 
301
  def more_voice_options(voice):
302
- print('more_voice_options')
303
  if voice != 'more':
304
  return voice
305
 
@@ -314,7 +248,7 @@ def reset_em_sliders(
314
  sad,
315
  surprise
316
  ):
317
- print('reset_em_sliders')
318
  if (deepmoji_enabled):
319
  return (0, 0, 0, 0)
320
  else:
@@ -332,7 +266,7 @@ def toggle_deepmoji(
332
  sad,
333
  surprise
334
  ):
335
- print('toggle_deepmoji')
336
  if checked:
337
  return (0, 0, 0, 0)
338
  else:
@@ -348,190 +282,263 @@ language_radio_init = {
348
  'choices': [*languages, *[(f'+{len(languages_more)}', 'more')]],
349
  'value': "en",
350
  'label': "Language",
351
- 'info': "Will be more monotone and have an English accent. Tested mostly by a native Briton."
352
  }
353
 
354
  _DESCRIPTION = '''
355
  <div>
356
  <a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
357
- <a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.3k-blue?logo=nexusmods'/></a>
358
  <a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
359
  <span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
360
  </div>
361
  '''
362
 
363
- with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
364
- gr.Markdown("# xVASynth TTS")
365
-
366
- gr.HTML(label="description", value=_DESCRIPTION)
367
-
368
- with gr.Row(): # Main row for inputs and language selection
369
- with gr.Column(): # Input column
370
- input_textbox = gr.Textbox(**input_textbox_init)
371
- language_radio = gr.Radio(**language_radio_init)
372
-
373
- with gr.Row():
374
- with gr.Column():
375
- en_examples_dropdown = gr.Dropdown(**en_examples_dropdown_init)
376
- with gr.Column():
377
- pacing_slider = gr.Slider(**pacing_slider_init)
378
- with gr.Column(): # Control column
379
- voice_radio = gr.Radio(**voice_radio_init)
380
- pitch_slider = gr.Slider(**pitch_slider_init)
381
- energy_slider = gr.Slider(**energy_slider_init)
382
  with gr.Row(): # Main row for inputs and language selection
383
  with gr.Column(): # Input column
384
- anger_slider = gr.Slider(**anger_slider_init)
385
- sad_slider = gr.Slider(**sad_slider_init)
386
- with gr.Column(): # Input column
387
- happy_slider = gr.Slider(**happy_slider_init)
388
- surprise_slider = gr.Slider(**surprise_slider_init)
389
- deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values", value=True)
390
-
391
- # Event handling using click
392
- btn = gr.Button("Generate", variant="primary")
393
-
394
- # with gr.Row(): # Main row for inputs and language selection
395
- # with gr.Column(): # Input column
396
- output_wav = gr.Audio(
397
- label="22kHz audio output",
398
- type="filepath",
399
- editable=False,
400
- autoplay=True
401
- )
402
- # with gr.Column(): # Input column
403
- output_arpabet = gr.HTML(label="ARPAbet")
404
-
405
- btn.click(
406
- fn=predict,
407
- inputs=[
408
- input_textbox,
409
- voice_radio,
410
- language_radio,
411
- pacing_slider,
412
- pitch_slider,
413
- energy_slider,
414
- anger_slider,
415
- happy_slider,
416
- sad_slider,
417
- surprise_slider,
418
- deepmoji_checkbox
419
- ],
420
- outputs=[
421
- output_wav,
422
- output_arpabet,
423
- anger_slider,
424
- happy_slider,
425
- sad_slider,
426
- surprise_slider
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  ]
428
- )
429
-
430
- # more languages option
431
- language_radio.change(
432
- more_lang_options,
433
- inputs=language_radio,
434
- outputs=language_radio,
435
- trigger_mode='once',
436
- show_progress='hidden',
437
- )
438
-
439
- # more voices option
440
- voice_radio.change(
441
- more_voice_options,
442
- inputs=voice_radio,
443
- outputs=voice_radio,
444
- trigger_mode='once',
445
- show_progress='hidden',
446
- queue=False,
447
- )
448
-
449
- # set default text
450
- language_radio.change(
451
- set_default_text,
452
- inputs=[language_radio, deepmoji_checkbox],
453
- outputs=[input_textbox, deepmoji_checkbox],
454
- show_progress='hidden',
455
- queue=False,
456
- )
457
-
458
- # toggle en examples
459
- language_radio.change(
460
- toggle_example_dropdown,
461
- inputs=language_radio,
462
- outputs=en_examples_dropdown,
463
- show_progress='hidden',
464
- queue=False,
465
- )
466
-
467
- en_examples_dropdown.change(
468
- set_example_as_input,
469
- inputs=[en_examples_dropdown],
470
- outputs=[input_textbox],
471
- show_progress='hidden',
472
- queue=False,
473
- )
474
-
475
- deepmoji_checkbox.change(
476
- toggle_deepmoji,
477
- inputs=[
478
- deepmoji_checkbox,
479
- anger_slider,
480
- happy_slider,
481
- sad_slider,
482
- surprise_slider
483
- ],
484
- outputs=[
485
- anger_slider,
486
- happy_slider,
487
- sad_slider,
488
- surprise_slider
489
- ],
490
- show_progress='hidden',
491
- queue=False,
492
- )
493
-
494
- input_textbox.change(
495
- reset_em_sliders,
496
- inputs=[
497
- deepmoji_checkbox,
498
- anger_slider,
499
- happy_slider,
500
- sad_slider,
501
- surprise_slider
502
- ],
503
- outputs=[
504
- anger_slider,
505
- happy_slider,
506
- sad_slider,
507
- surprise_slider
508
- ],
509
- show_progress='hidden',
510
- queue=False,
511
- )
512
-
513
- voice_radio.change(
514
- reset_em_sliders,
515
- inputs=[
516
- deepmoji_checkbox,
517
- anger_slider,
518
- happy_slider,
519
- sad_slider,
520
- surprise_slider
521
- ],
522
- outputs=[
523
- anger_slider,
524
- happy_slider,
525
- sad_slider,
526
- surprise_slider
527
- ],
528
- show_progress='hidden',
529
- queue=False,
530
- )
531
 
532
  if __name__ == "__main__":
533
  print('running Gradio interface')
534
- # gradio_app.launch()
535
  client = Client("Pendrokar/xVASynth")
536
 
537
- demo.launch()
 
 
1
  import os
 
 
 
2
  import json
 
3
  import gradio as gr
4
  from gradio_client import Client
5
 
 
17
  ("👩‍🦳 #11697", "ccby_nvidia_hifi_11697_F"),
18
  ("Female #9136", "ccby_nvidia_hifi_9136_F"),
19
  ]
 
20
 
21
  # order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
22
  languages = [
 
84
  "zh": "这是我的声音。",
85
  }
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  # Component defaults
88
  input_textbox_init = {
89
  'label': "Input Text",
 
162
  }
163
 
164
  def more_lang_options(lang):
165
+ # print('more_lang_options')
166
  if lang != 'more':
167
  return lang
168
 
 
171
  return gr.Radio(**radio_init)
172
 
173
  def set_default_text(lang, deepmoji_checked):
174
+ # print('set_default_text')
175
+ textbox_init = {**input_textbox_init}
176
  if lang == 'more':
177
+ textbox_init['value'] = default_text['en']
178
+ # return default_text['en'], deepmoji_checked
179
+ return gr.Textbox(**textbox_init), deepmoji_checked
180
+
181
+ textbox_init['value'] = default_text[lang]
182
 
183
  # DeepMoji only works on English Text
184
  checkbox_init = {**deepmoji_checkbox_init}
 
191
  # checkbox_init['info'] = "Works only with English!",
192
  # checkbox_init['value'] = False,
193
  # checkbox_init['interactive'] = False
194
+ # gr.Checkbox(**checkbox_init)
195
 
196
+ return gr.Textbox(**textbox_init), deepmoji_checked
 
197
 
198
  # examples component
199
  en_examples = [
 
219
  }
220
 
221
  def set_example_as_input(example_text):
222
+ # print('set_example_as_input')
223
  return example_text
224
 
225
  def toggle_example_dropdown(lang):
226
+ # print('toggle_example_dropdown')
227
  dropdown_init = {**en_examples_dropdown_init}
228
  if lang == 'en':
229
  dropdown_init['visible'] = True
 
233
  return gr.Dropdown(**dropdown_init)
234
 
235
  def more_voice_options(voice):
236
+ # print('more_voice_options')
237
  if voice != 'more':
238
  return voice
239
 
 
248
  sad,
249
  surprise
250
  ):
251
+ # print('reset_em_sliders')
252
  if (deepmoji_enabled):
253
  return (0, 0, 0, 0)
254
  else:
 
266
  sad,
267
  surprise
268
  ):
269
+ # print('toggle_deepmoji')
270
  if checked:
271
  return (0, 0, 0, 0)
272
  else:
 
282
  'choices': [*languages, *[(f'+{len(languages_more)}', 'more')]],
283
  'value': "en",
284
  'label': "Language",
285
+ 'info': "Will be more monotone and have an English accent."
286
  }
287
 
288
  _DESCRIPTION = '''
289
  <div>
290
  <a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
291
+ <a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.4k-blue?logo=nexusmods'/></a>
292
  <a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
293
  <span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
294
  </div>
295
  '''
296
 
297
+
298
+ class BlocksDemo:
299
+ def __init__(self):
300
+ with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
301
+ gr.Markdown("# xVASynth TTS")
302
+
303
+ gr.HTML(label="description", value=_DESCRIPTION)
304
+
 
 
 
 
 
 
 
 
 
 
 
305
  with gr.Row(): # Main row for inputs and language selection
306
  with gr.Column(): # Input column
307
+ input_textbox = gr.Textbox(**input_textbox_init)
308
+ language_radio = gr.Radio(**language_radio_init)
309
+
310
+ # remove autofocus
311
+ input_textbox_init['autofocus'] = False
312
+
313
+ with gr.Row():
314
+ with gr.Column():
315
+ en_examples_dropdown = gr.Dropdown(**en_examples_dropdown_init)
316
+ with gr.Column():
317
+ pacing_slider = gr.Slider(**pacing_slider_init)
318
+ with gr.Column(): # Control column
319
+ voice_radio = gr.Radio(**voice_radio_init)
320
+ pitch_slider = gr.Slider(**pitch_slider_init)
321
+ energy_slider = gr.Slider(**energy_slider_init)
322
+ with gr.Row(): # Main row for inputs and language selection
323
+ with gr.Column(): # Input column
324
+ anger_slider = gr.Slider(**anger_slider_init)
325
+ sad_slider = gr.Slider(**sad_slider_init)
326
+ with gr.Column(): # Input column
327
+ happy_slider = gr.Slider(**happy_slider_init)
328
+ surprise_slider = gr.Slider(**surprise_slider_init)
329
+ deepmoji_checkbox = gr.Checkbox(**deepmoji_checkbox_init)
330
+
331
+ # Event handling using click
332
+ btn = gr.Button("Generate", variant="primary")
333
+
334
+ # with gr.Row(): # Main row for inputs and language selection
335
+ # with gr.Column(): # Input column
336
+ output_wav = gr.Audio(
337
+ label="22kHz audio output",
338
+ type="filepath",
339
+ editable=False,
340
+ autoplay=True
341
+ )
342
+ # with gr.Column(): # Input column
343
+ output_arpabet = gr.HTML(label="ARPAbet")
344
+
345
+ btn.click(
346
+ fn=self.predict,
347
+ inputs=[
348
+ input_textbox,
349
+ voice_radio,
350
+ language_radio,
351
+ pacing_slider,
352
+ pitch_slider,
353
+ energy_slider,
354
+ anger_slider,
355
+ happy_slider,
356
+ sad_slider,
357
+ surprise_slider,
358
+ deepmoji_checkbox
359
+ ],
360
+ outputs=[
361
+ output_wav,
362
+ output_arpabet,
363
+ anger_slider,
364
+ happy_slider,
365
+ sad_slider,
366
+ surprise_slider
367
+ ]
368
+ )
369
+
370
+ # more languages option
371
+ language_radio.change(
372
+ more_lang_options,
373
+ inputs=language_radio,
374
+ outputs=language_radio,
375
+ trigger_mode='once',
376
+ show_progress='hidden',
377
+ )
378
+
379
+ # more voices option
380
+ voice_radio.change(
381
+ more_voice_options,
382
+ inputs=voice_radio,
383
+ outputs=voice_radio,
384
+ trigger_mode='once',
385
+ show_progress='hidden',
386
+ queue=False,
387
+ )
388
+
389
+ # set default text
390
+ language_radio.change(
391
+ set_default_text,
392
+ inputs=[language_radio, deepmoji_checkbox],
393
+ outputs=[input_textbox, deepmoji_checkbox],
394
+ show_progress='hidden',
395
+ queue=False,
396
+ )
397
+
398
+ # toggle en examples
399
+ language_radio.change(
400
+ toggle_example_dropdown,
401
+ inputs=language_radio,
402
+ outputs=en_examples_dropdown,
403
+ show_progress='hidden',
404
+ queue=False,
405
+ )
406
+
407
+ en_examples_dropdown.change(
408
+ set_example_as_input,
409
+ inputs=[en_examples_dropdown],
410
+ outputs=[input_textbox],
411
+ show_progress='hidden',
412
+ queue=False,
413
+ )
414
+
415
+ deepmoji_checkbox.change(
416
+ toggle_deepmoji,
417
+ inputs=[
418
+ deepmoji_checkbox,
419
+ anger_slider,
420
+ happy_slider,
421
+ sad_slider,
422
+ surprise_slider
423
+ ],
424
+ outputs=[
425
+ anger_slider,
426
+ happy_slider,
427
+ sad_slider,
428
+ surprise_slider
429
+ ],
430
+ show_progress='hidden',
431
+ queue=False,
432
+ )
433
+
434
+ input_textbox.change(
435
+ reset_em_sliders,
436
+ inputs=[
437
+ deepmoji_checkbox,
438
+ anger_slider,
439
+ happy_slider,
440
+ sad_slider,
441
+ surprise_slider
442
+ ],
443
+ outputs=[
444
+ anger_slider,
445
+ happy_slider,
446
+ sad_slider,
447
+ surprise_slider
448
+ ],
449
+ show_progress='hidden',
450
+ queue=False,
451
+ )
452
+
453
+ voice_radio.change(
454
+ reset_em_sliders,
455
+ inputs=[
456
+ deepmoji_checkbox,
457
+ anger_slider,
458
+ happy_slider,
459
+ sad_slider,
460
+ surprise_slider
461
+ ],
462
+ outputs=[
463
+ anger_slider,
464
+ happy_slider,
465
+ sad_slider,
466
+ surprise_slider
467
+ ],
468
+ show_progress='hidden',
469
+ queue=False,
470
+ )
471
+
472
+ self.block = demo
473
+
474
+ def predict(
475
+ self,
476
+ input_text,
477
+ voice,
478
+ lang,
479
+ pacing,
480
+ pitch,
481
+ energy,
482
+ anger,
483
+ happy,
484
+ sad,
485
+ surprise,
486
+ deepmoji_checked
487
+ ):
488
+ wav_path, arpabet_html, angry, happy, sad, surprise, response = client.predict(
489
+ input_text, # str in 'Input Text' Textbox component
490
+ voice, # Literal['ccby_nvidia_hifi_6670_M', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_12787_F', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_6671_M', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_9136_F', 'ccby_nvidia_hifi_92_F'] in 'Voice' Radio component
491
+ lang, # Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp'] in 'Language' Radio component
492
+ pacing, # float (numeric value between 0.5 and 2.0) in 'Duration' Slider component
493
+ pitch, # float (numeric value between 0 and 1.0) in 'Pitch' Slider component
494
+ energy, # float (numeric value between 0.1 and 1.0) in 'Energy' Slider component
495
+ anger, # float (numeric value between 0 and 1.0) in '😠 Anger' Slider component
496
+ happy, # float (numeric value between 0 and 1.0) in '😃 Happiness' Slider component
497
+ sad, # float (numeric value between 0 and 1.0) in '😭 Sadness' Slider component
498
+ surprise, # float (numeric value between 0 and 1.0) in '😮 Surprise' Slider component
499
+ deepmoji_checked, # bool
500
+ api_name="/predict"
501
+ )
502
+
503
+ json_data = json.loads(response.replace("'", '"'))
504
+
505
+ arpabet_html = '<h6>ARPAbet & Durations</h6>'
506
+ arpabet_html += '<table style="margin: 0 var(--size-2)"><tbody><tr>'
507
+ arpabet_nopad = json_data['arpabet'].split('|PAD|')
508
+ arpabet_symbols = json_data['arpabet'].split('|')
509
+ wpad_len = len(arpabet_symbols)
510
+ nopad_len = len(arpabet_nopad)
511
+ total_dur_length = 0
512
+ for symb_i in range(wpad_len):
513
+ if (arpabet_symbols[symb_i] == '<PAD>'):
514
+ continue
515
+ total_dur_length += float(json_data['durations'][symb_i])
516
+
517
+ for symb_i in range(wpad_len):
518
+ if (arpabet_symbols[symb_i] == '<PAD>'):
519
+ continue
520
+
521
+ arpabet_length = float(json_data['durations'][symb_i])
522
+ cell_width = round(arpabet_length / total_dur_length * 100, 2)
523
+ arpabet_html += '<td class="arpabet" style="width: '\
524
+ + str(cell_width)\
525
+ +'%">'\
526
+ + arpabet_symbols[symb_i]\
527
+ + '</td> '
528
+ arpabet_html += '<tr></tbody></table>'
529
+
530
+ return [
531
+ wav_path,
532
+ arpabet_html,
533
+ round(json_data['em_angry'][0], 2),
534
+ round(json_data['em_happy'][0], 2),
535
+ round(json_data['em_sad'][0], 2),
536
+ round(json_data['em_surprise'][0], 2)
537
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
538
 
539
  if __name__ == "__main__":
540
  print('running Gradio interface')
 
541
  client = Client("Pendrokar/xVASynth")
542
 
543
+ demo = BlocksDemo()
544
+ demo.block.launch()