Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import gr_client
Browse files- app.py +105 -513
- gr_client.py +257 -250
app.py
CHANGED
@@ -1,16 +1,14 @@
|
|
1 |
import os
|
2 |
import sys
|
3 |
-
import time
|
4 |
import requests
|
5 |
import json
|
6 |
-
from subprocess import Popen, PIPE
|
7 |
-
import threading
|
8 |
from huggingface_hub import HfApi
|
9 |
-
import gradio as gr
|
10 |
|
11 |
# start xVASynth service (no HTTP)
|
12 |
import resources.app.no_server as xvaserver
|
13 |
|
|
|
|
|
14 |
# model
|
15 |
hf_model_name = "Pendrokar/xvapitch_nvidia"
|
16 |
model_repo = HfApi()
|
@@ -19,117 +17,9 @@ latest_commit_sha = commits[0].commit_id
|
|
19 |
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
|
20 |
models_path = hf_cache_models_path
|
21 |
|
22 |
-
# ordered from most emotional and respects pauses to ones that do the least
|
23 |
-
voice_models = [
|
24 |
-
("๐จโ๐ฆณ #6671", "ccby_nvidia_hifi_6671_M"),
|
25 |
-
("๐ฑโโ๏ธ ๐ฌ๐ง #92", "ccby_nvidia_hifi_92_F"),
|
26 |
-
("๐ง #6670", "ccby_nvidia_hifi_6670_M"),
|
27 |
-
("Male #9017", "ccby_nvidia_hifi_9017_M"),
|
28 |
-
("Male #6097", "ccby_nvidia_hifi_6097_M"),
|
29 |
-
("๐ฉโ๐ฆฑ #12787", "ccby_nvidia_hifi_12787_F"),
|
30 |
-
("๐ต #11614", "ccby_nv_hifi_11614_F"),
|
31 |
-
("Female #8051", "ccby_nvidia_hifi_8051_F"),
|
32 |
-
("๐ฉโ๐ฆณ #11697", "ccby_nvidia_hifi_11697_F"),
|
33 |
-
("Female #9136", "ccby_nvidia_hifi_9136_F"),
|
34 |
-
]
|
35 |
-
|
36 |
current_voice_model = None
|
37 |
base_speaker_emb = ''
|
38 |
|
39 |
-
# order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
|
40 |
-
languages = [
|
41 |
-
("๐บ๐ธ EN", "en"),
|
42 |
-
("๐ฉ๐ช DE", "de"),
|
43 |
-
("๐ช๐ธ ES", "es"),
|
44 |
-
("๐ฎ๐น IT", "it"),
|
45 |
-
("๐ณ๐ฑ NL", "nl"),
|
46 |
-
("๐ง๐ท PT", "pt"),
|
47 |
-
("๐ต๐ฑ PL", "pl"),
|
48 |
-
("๐ท๐ด RO", "ro"),
|
49 |
-
("๐ธ๐ช SV", "sv"),
|
50 |
-
("๐ฉ๐ฐ DA", "da"),
|
51 |
-
("๐ซ๐ฎ FI", "fi"),
|
52 |
-
("๐ญ๐บ HU", "hu"),
|
53 |
-
("๐ฌ๐ท EL", "el"),
|
54 |
-
("๐ซ๐ท FR", "fr"),
|
55 |
-
("๐ท๐บ RU", "ru"),
|
56 |
-
("๐บ๐ฆ UA", "uk"),
|
57 |
-
("๐น๐ท TR", "tr"),
|
58 |
-
("๐ธ๐ฆ AR", "ar"),
|
59 |
-
("๐ฎ๐ณ HI", "hi"),
|
60 |
-
("๐ฏ๐ต JP", "jp"),
|
61 |
-
("๐ฐ๐ท KO", "ko"),
|
62 |
-
("๐จ๐ณ ZH", "zh"),
|
63 |
-
("๐ป๐ณ VI", "vi"),
|
64 |
-
("๐ป๐ฆ LA", "la"),
|
65 |
-
("๐ณ๐ฌ YO", "yo"),
|
66 |
-
("Swahili", "sw"),
|
67 |
-
("Hausa", "ha"),
|
68 |
-
("Wolof", "wo"),
|
69 |
-
]
|
70 |
-
|
71 |
-
# Translated from English by DeepMind's Gemini Pro
|
72 |
-
default_text = {
|
73 |
-
"ar": "ูุฐุง ูู ุตูุชู.",
|
74 |
-
"da": "Sรฅdan lyder min stemme.",
|
75 |
-
"de": "So klingt meine Stimme.",
|
76 |
-
"el": "ฮฯฯฮน ฮฑฮบฮฟฯฮณฮตฯฮฑฮน ฮท ฯฯฮฝฮฎ ฮผฮฟฯ
.",
|
77 |
-
"en": "This is what my voice sounds like.",
|
78 |
-
"es": "Asรญ suena mi voz.",
|
79 |
-
"fi": "Nรคin รครคneni kuulostaa.",
|
80 |
-
"fr": "Voici ร quoi ressemble ma voix.",
|
81 |
-
"ha": "Wannan ne muryata ke.",
|
82 |
-
"hi": "เคฏเคน เคฎเฅเคฐเฅ เคเคตเคพเคเคผ เคเฅเคธเฅ เคฒเคเคคเฅ เคนเฅเฅค",
|
83 |
-
"hu": "รgy hangzik a hangom.",
|
84 |
-
"it": "Cosรฌ suona la mia voce.",
|
85 |
-
"jp": "ใใใ็งใฎๅฃฐใงใใ",
|
86 |
-
"ko": "์ฌ๊ธฐ ์ ๋ชฉ์๋ฆฌ๊ฐ ์ด๋ค์ง ๋ค์ด๋ณด์ธ์.",
|
87 |
-
"la": "Haec est vox mea sonans.",
|
88 |
-
"nl": "Dit is hoe mijn stem klinkt.",
|
89 |
-
"pl": "Tak brzmi mรณj gลos.",
|
90 |
-
"pt": "ร assim que minha voz soa.",
|
91 |
-
"ro": "Aศa sunฤ vocea mea.",
|
92 |
-
"ru": "ะะพั ะบะฐะบ ะทะฒััะธั ะผะพะน ะณะพะปะพั.",
|
93 |
-
"sv": "Sรฅhรคr lรฅter min rรถst.",
|
94 |
-
"sw": "Baba, yetu, yetu, uliye. Mbinguni, yetu, yetu. Amiiinaa!!", #civ4
|
95 |
-
"tr": "Benim sesimin sesi bรถyle.",
|
96 |
-
"uk": "ะัั ัะบ ะทะฒััะธัั ะผัะน ะณะพะปะพั.",
|
97 |
-
"vi": "ฤรขy lร giแปng nรณi cแปงa tรดi.",
|
98 |
-
"wo": "Ndox li neen xewnaal ma.",
|
99 |
-
"yo": "รyรญ ni ohรนn mi ลlรก.",
|
100 |
-
"zh": "่ฟๆฏๆ็ๅฃฐ้ณใ",
|
101 |
-
}
|
102 |
-
|
103 |
-
def run_xvaserver():
|
104 |
-
# start the process without waiting for a response
|
105 |
-
print('Running xVAServer subprocess...\n')
|
106 |
-
xvaserver = Popen(['python', f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/server.py'], stdout=PIPE, stderr=PIPE, cwd=f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/')
|
107 |
-
|
108 |
-
# Wait for a moment to ensure the server starts up
|
109 |
-
time.sleep(10)
|
110 |
-
|
111 |
-
# Check if the server is running
|
112 |
-
if xvaserver.poll() is not None:
|
113 |
-
print("Web server failed to start.")
|
114 |
-
sys.exit(0)
|
115 |
-
|
116 |
-
# contact local xVASynth server
|
117 |
-
print('Attempting to connect to xVASynth...')
|
118 |
-
try:
|
119 |
-
response = requests.get('http://0.0.0.0:8008')
|
120 |
-
response.raise_for_status() # If the response contains an HTTP error status code, raise an exception
|
121 |
-
except requests.exceptions.RequestException as err:
|
122 |
-
print('Failed to connect!')
|
123 |
-
return
|
124 |
-
|
125 |
-
print('xVAServer running on port 8008')
|
126 |
-
|
127 |
-
# load default model
|
128 |
-
load_model("ccby_nvidia_hifi_6671_M")
|
129 |
-
|
130 |
-
# Wait for the process to exit
|
131 |
-
xvaserver.wait()
|
132 |
-
|
133 |
def load_model(voice_model_name):
|
134 |
model_path = models_path + voice_model_name
|
135 |
|
@@ -160,413 +50,115 @@ def load_model(voice_model_name):
|
|
160 |
|
161 |
return embs
|
162 |
|
163 |
-
def predict(
|
164 |
-
input_text,
|
165 |
-
voice,
|
166 |
-
lang,
|
167 |
-
pacing,
|
168 |
-
pitch,
|
169 |
-
energy,
|
170 |
-
anger,
|
171 |
-
happy,
|
172 |
-
sad,
|
173 |
-
surprise,
|
174 |
-
use_deepmoji
|
175 |
-
):
|
176 |
-
# grab only the first 1000 characters
|
177 |
-
input_text = input_text[:1000]
|
178 |
-
|
179 |
-
# load voice model if not the current model
|
180 |
-
if (current_voice_model != voice):
|
181 |
-
base_speaker_emb = load_model(voice)
|
182 |
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
'
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
# json_data = json.loads(response.text)
|
220 |
-
except requests.exceptions.RequestException as err:
|
221 |
-
print('FAILED to synthesize: {err}')
|
222 |
-
save_path = ''
|
223 |
-
response = {'text': '{"message": "Failed"}'}
|
224 |
-
json_data = {
|
225 |
-
'arpabet': ['Failed'],
|
226 |
-
'durations': [0],
|
227 |
-
'em_anger': anger,
|
228 |
-
'em_happy': happy,
|
229 |
-
'em_sad': sad,
|
230 |
-
'em_surprise': surprise,
|
231 |
}
|
232 |
|
233 |
-
# print('server.log contents:')
|
234 |
-
# with open('resources/app/server.log', 'r') as f:
|
235 |
-
# print(f.read())
|
236 |
-
|
237 |
-
arpabet_html = '<h6>ARPAbet & Phoneme lengths</h6>'
|
238 |
-
arpabet_symbols = json_data['arpabet'].split('|')
|
239 |
-
utter_time = 0
|
240 |
-
for symb_i in range(len(json_data['durations'])):
|
241 |
-
# skip PAD symbol
|
242 |
-
if (arpabet_symbols[symb_i] == '<PAD>'):
|
243 |
-
continue
|
244 |
-
|
245 |
-
length = float(json_data['durations'][symb_i])
|
246 |
-
arpa_length = str(round(length/2, 1))
|
247 |
-
arpabet_html += '<strong\
|
248 |
-
class="arpabet"\
|
249 |
-
style="padding: 0 '\
|
250 |
-
+ str(arpa_length)\
|
251 |
-
+'em"'\
|
252 |
-
+f" title=\"{utter_time} + {length}\""\
|
253 |
-
+'>'\
|
254 |
-
+ arpabet_symbols[symb_i]\
|
255 |
-
+ '</strong> '
|
256 |
-
utter_time += round(length, 1)
|
257 |
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
info="Also accepts ARPAbet symbols placed within {} brackets.",
|
272 |
-
lines=1,
|
273 |
-
max_lines=5,
|
274 |
-
autofocus=True
|
275 |
-
)
|
276 |
-
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
|
277 |
-
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
|
278 |
-
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
|
279 |
-
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐ Anger", info="Tread lightly beyond 0.9")
|
280 |
-
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐ Happiness", info="Tread lightly beyond 0.7")
|
281 |
-
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐ญ Sadness", info="Duration increased when beyond 0.2")
|
282 |
-
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐ฎ Surprise", info="Does not play well with Happiness with either being beyond 0.3")
|
283 |
-
voice_radio = gr.Radio(
|
284 |
-
voice_models,
|
285 |
-
value="ccby_nvidia_hifi_6671_M",
|
286 |
-
label="Voice",
|
287 |
-
info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
|
288 |
-
)
|
289 |
-
|
290 |
-
def set_default_text(lang, deepmoji_checked):
|
291 |
-
# DeepMoji only works on English Text
|
292 |
-
# checkbox_enabled = True
|
293 |
-
# if lang != 'en':
|
294 |
-
# checkbox_enabled = False
|
295 |
-
|
296 |
-
if lang == 'en':
|
297 |
-
checkbox_enabled = gr.Checkbox(
|
298 |
-
label="Use DeepMoji",
|
299 |
-
info="Auto adjust emotional values",
|
300 |
-
value=deepmoji_checked,
|
301 |
-
interactive=True
|
302 |
-
)
|
303 |
-
else:
|
304 |
-
checkbox_enabled = gr.Checkbox(
|
305 |
-
label="Use DeepMoji",
|
306 |
-
info="Works only with English!",
|
307 |
-
value=False,
|
308 |
-
interactive=False
|
309 |
-
)
|
310 |
-
|
311 |
-
return default_text[lang], checkbox_enabled # Return the modified textbox (important for Blocks)
|
312 |
-
|
313 |
-
en_examples = [
|
314 |
-
"This is what my voice sounds like.",
|
315 |
-
"If there is anything else you need, feel free to ask.",
|
316 |
-
"Amazing! Could you do that again?",
|
317 |
-
"Why, I would be more than happy to help you!",
|
318 |
-
"That was unexpected.",
|
319 |
-
"How dare you! . You have no right.",
|
320 |
-
"Ahh, well, you see. There is more to it.",
|
321 |
-
"I can't believe she is gone.",
|
322 |
-
"Stay out of my way!!!",
|
323 |
-
# ARPAbet example
|
324 |
-
"This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
|
325 |
-
]
|
326 |
-
|
327 |
-
def set_example_as_input(example_text):
|
328 |
-
return example_text
|
329 |
-
|
330 |
-
def reset_em_sliders(
|
331 |
-
deepmoji_enabled,
|
332 |
-
anger,
|
333 |
-
happy,
|
334 |
-
sad,
|
335 |
-
surprise
|
336 |
-
):
|
337 |
-
if (deepmoji_enabled):
|
338 |
-
return (0, 0, 0, 0)
|
339 |
-
else:
|
340 |
-
return (
|
341 |
-
anger,
|
342 |
-
happy,
|
343 |
-
sad,
|
344 |
-
surprise
|
345 |
-
)
|
346 |
-
|
347 |
-
def set_default_audio(voice_id):
|
348 |
-
return models_path + voice_id + '.wav'
|
349 |
-
|
350 |
-
def toggle_deepmoji(
|
351 |
-
checked,
|
352 |
-
anger,
|
353 |
-
happy,
|
354 |
-
sad,
|
355 |
-
surprise
|
356 |
-
):
|
357 |
-
if checked:
|
358 |
-
return (0, 0, 0, 0)
|
359 |
-
else:
|
360 |
-
return (
|
361 |
-
anger,
|
362 |
-
happy,
|
363 |
-
sad,
|
364 |
-
surprise
|
365 |
-
)
|
366 |
-
|
367 |
-
language_radio = gr.Radio(
|
368 |
-
languages,
|
369 |
-
value="en",
|
370 |
-
label="Language",
|
371 |
-
info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
|
372 |
-
)
|
373 |
-
|
374 |
-
_DESCRIPTION = '''
|
375 |
-
<div>
|
376 |
-
<a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
|
377 |
-
<a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.3k-blue?logo=nexusmods'/></a>
|
378 |
-
<a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
|
379 |
-
<span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
|
380 |
-
</div>
|
381 |
-
'''
|
382 |
-
|
383 |
-
with gr.Blocks(css=".arpabet {display: inline-block; background-color: gray; border-radius: 5px; font-size: 120%; margin: 0.1em 0}") as demo:
|
384 |
-
gr.Markdown("# xVASynth TTS")
|
385 |
-
|
386 |
-
gr.HTML(label="description", value=_DESCRIPTION)
|
387 |
-
|
388 |
-
with gr.Row(): # Main row for inputs and language selection
|
389 |
-
with gr.Column(): # Input column
|
390 |
-
input_textbox = gr.Textbox(
|
391 |
-
label="Input Text",
|
392 |
-
value="This is what my voice sounds like.",
|
393 |
-
info="Also accepts ARPAbet symbols placed within {} brackets.",
|
394 |
-
lines=1,
|
395 |
-
max_lines=5,
|
396 |
-
autofocus=True
|
397 |
-
)
|
398 |
-
language_radio = gr.Radio(
|
399 |
-
languages,
|
400 |
-
value="en",
|
401 |
-
label="Language",
|
402 |
-
info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
|
403 |
-
)
|
404 |
-
with gr.Row():
|
405 |
-
with gr.Column():
|
406 |
-
en_examples_dropdown = gr.Dropdown(
|
407 |
-
en_examples,
|
408 |
-
value=en_examples[0],
|
409 |
-
label="Example dropdown",
|
410 |
-
show_label=False,
|
411 |
-
info="English Examples",
|
412 |
-
visible=(language_radio.value == 'en')
|
413 |
-
)
|
414 |
-
with gr.Column():
|
415 |
-
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
|
416 |
-
with gr.Column(): # Control column
|
417 |
-
voice_radio = gr.Radio(
|
418 |
-
voice_models,
|
419 |
-
value="ccby_nvidia_hifi_6671_M",
|
420 |
-
label="Voice",
|
421 |
-
info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
|
422 |
-
)
|
423 |
-
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
|
424 |
-
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
|
425 |
-
with gr.Row(): # Main row for inputs and language selection
|
426 |
-
with gr.Column(): # Input column
|
427 |
-
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐ Anger", info="Tread lightly beyond 0.9")
|
428 |
-
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐ญ Sadness", info="Duration increased when beyond 0.2")
|
429 |
-
with gr.Column(): # Input column
|
430 |
-
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐ Happiness", info="Tread lightly beyond 0.7")
|
431 |
-
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="๐ฎ Surprise", info="Can oversaturate Happiness")
|
432 |
-
deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values", value=True)
|
433 |
-
|
434 |
-
# Event handling using click
|
435 |
-
btn = gr.Button("Generate", variant="primary")
|
436 |
-
|
437 |
-
with gr.Row(): # Main row for inputs and language selection
|
438 |
-
with gr.Column(): # Input column
|
439 |
-
output_wav = gr.Audio(
|
440 |
-
label="22kHz audio output (autoplay enabled)",
|
441 |
-
type="filepath",
|
442 |
-
editable=False,
|
443 |
-
autoplay=True
|
444 |
-
)
|
445 |
-
with gr.Column(): # Input column
|
446 |
-
output_arpabet = gr.HTML(label="ARPAbet")
|
447 |
-
|
448 |
-
btn.click(
|
449 |
-
fn=predict,
|
450 |
-
inputs=[
|
451 |
-
input_textbox,
|
452 |
-
voice_radio,
|
453 |
-
language_radio,
|
454 |
-
pacing_slider,
|
455 |
-
pitch_slider,
|
456 |
-
energy_slider,
|
457 |
-
anger_slider,
|
458 |
-
happy_slider,
|
459 |
-
sad_slider,
|
460 |
-
surprise_slider,
|
461 |
-
deepmoji_checkbox
|
462 |
-
],
|
463 |
-
outputs=[
|
464 |
-
output_wav,
|
465 |
-
output_arpabet,
|
466 |
-
anger_slider,
|
467 |
-
happy_slider,
|
468 |
-
sad_slider,
|
469 |
-
surprise_slider,
|
470 |
-
# xVAServer response
|
471 |
-
gr.Textbox(visible=False)
|
472 |
-
]
|
473 |
-
)
|
474 |
-
input_textbox.submit(
|
475 |
-
fn=predict,
|
476 |
-
inputs=[
|
477 |
-
input_textbox,
|
478 |
-
voice_radio,
|
479 |
-
language_radio,
|
480 |
-
pacing_slider,
|
481 |
-
pitch_slider,
|
482 |
-
energy_slider,
|
483 |
-
anger_slider,
|
484 |
-
happy_slider,
|
485 |
-
sad_slider,
|
486 |
-
surprise_slider,
|
487 |
-
deepmoji_checkbox
|
488 |
-
],
|
489 |
-
outputs=[
|
490 |
-
output_wav,
|
491 |
-
output_arpabet,
|
492 |
-
anger_slider,
|
493 |
-
happy_slider,
|
494 |
-
sad_slider,
|
495 |
-
surprise_slider,
|
496 |
-
# xVAServer response
|
497 |
-
gr.Textbox(visible=False)
|
498 |
-
]
|
499 |
-
)
|
500 |
-
|
501 |
-
language_radio.change(
|
502 |
-
set_default_text,
|
503 |
-
inputs=[language_radio, deepmoji_checkbox],
|
504 |
-
outputs=[input_textbox, deepmoji_checkbox]
|
505 |
-
)
|
506 |
-
|
507 |
-
en_examples_dropdown.change(
|
508 |
-
set_example_as_input,
|
509 |
-
inputs=[en_examples_dropdown],
|
510 |
-
outputs=[input_textbox]
|
511 |
-
)
|
512 |
-
|
513 |
-
deepmoji_checkbox.change(
|
514 |
-
toggle_deepmoji,
|
515 |
-
inputs=[
|
516 |
-
deepmoji_checkbox,
|
517 |
-
anger_slider,
|
518 |
-
happy_slider,
|
519 |
-
sad_slider,
|
520 |
-
surprise_slider
|
521 |
-
],
|
522 |
-
outputs=[
|
523 |
-
anger_slider,
|
524 |
-
happy_slider,
|
525 |
-
sad_slider,
|
526 |
-
surprise_slider
|
527 |
-
]
|
528 |
-
)
|
529 |
-
|
530 |
-
input_textbox.change(
|
531 |
-
reset_em_sliders,
|
532 |
-
inputs=[
|
533 |
-
deepmoji_checkbox,
|
534 |
-
anger_slider,
|
535 |
-
happy_slider,
|
536 |
-
sad_slider,
|
537 |
-
surprise_slider
|
538 |
-
],
|
539 |
-
outputs=[
|
540 |
-
anger_slider,
|
541 |
-
happy_slider,
|
542 |
-
sad_slider,
|
543 |
-
surprise_slider
|
544 |
-
]
|
545 |
-
)
|
546 |
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
-
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
561 |
]
|
562 |
-
)
|
563 |
-
|
564 |
-
voice_radio.change(
|
565 |
-
set_default_audio,
|
566 |
-
inputs=voice_radio,
|
567 |
-
outputs=output_wav
|
568 |
-
)
|
569 |
|
570 |
if __name__ == "__main__":
|
571 |
print('running custom Gradio interface')
|
572 |
-
demo
|
|
|
|
1 |
import os
|
2 |
import sys
|
|
|
3 |
import requests
|
4 |
import json
|
|
|
|
|
5 |
from huggingface_hub import HfApi
|
|
|
6 |
|
7 |
# start xVASynth service (no HTTP)
|
8 |
import resources.app.no_server as xvaserver
|
9 |
|
10 |
+
from gr_client import BlocksDemo
|
11 |
+
|
12 |
# model
|
13 |
hf_model_name = "Pendrokar/xvapitch_nvidia"
|
14 |
model_repo = HfApi()
|
|
|
17 |
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
|
18 |
models_path = hf_cache_models_path
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
current_voice_model = None
|
21 |
base_speaker_emb = ''
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def load_model(voice_model_name):
|
24 |
model_path = models_path + voice_model_name
|
25 |
|
|
|
50 |
|
51 |
return embs
|
52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
+
class LocalBlocksDemo(BlocksDemo):
    """Blocks demo whose ``predict`` calls the in-process xVASynth service.

    Synthesis goes through ``xvaserver.synthesize`` directly instead of an
    HTTP request to a separately running server.
    """

    def predict(
        self,
        input_text,
        voice,
        lang,
        pacing,
        pitch,
        energy,
        anger,
        happy,
        sad,
        surprise,
        use_deepmoji
    ):
        """Synthesize ``input_text`` and build the values for the UI outputs.

        Args:
            input_text: Text to speak; only the first 1000 characters are used.
            voice: Voice model name, passed to ``load_model`` when it differs
                from the currently cached one.
            lang: Language code sent as ``base_lang``.
            pacing: Pace multiplier; a falsy value falls back to ``1.0``.
            pitch: Accepted for interface compatibility; not used here.
            energy: Accepted for interface compatibility; not used here.
            anger: Emotion value; values not above 0 are sent as 0.
            happy: Emotion value; values not above 0 are sent as 0.
            sad: Emotion value; values not above 0 are sent as 0.
            surprise: Emotion value; values not above 0 are sent as 0.
            use_deepmoji: Forwarded as ``run_model`` in the
                ``mantella_settings`` plugin context.

        Returns:
            A list of ``[wav_path, arpabet_html, em_angry, em_happy, em_sad,
            em_surprise, json_data]``. ``wav_path`` is ``''`` when the
            ``except`` branch below was taken.
        """
        # The speaker embedding is cached at module level. Without this
        # declaration the assignment below creates a local that is unbound
        # whenever the requested voice is already the current one.
        global current_voice_model, base_speaker_emb

        # grab only the first 1000 characters
        input_text = input_text[:1000]

        # load voice model if not the current model
        if current_voice_model != voice:
            base_speaker_emb = load_model(voice)
            # NOTE(review): load_model may already record the current voice;
            # setting it here as well is harmless and keeps the cache valid.
            current_voice_model = voice

        model_type = 'xVAPitch'
        pace = pacing if pacing else 1.0
        save_path = '/tmp/xvapitch_audio_sample.wav'
        language = lang
        use_sr = 0
        use_cleanup = 0

        pluginsContext = {}
        pluginsContext["mantella_settings"] = {
            "emAngry": (anger if anger > 0 else 0),
            "emHappy": (happy if happy > 0 else 0),
            "emSad": (sad if sad > 0 else 0),
            "emSurprise": (surprise if surprise > 0 else 0),
            "run_model": use_deepmoji
        }

        data = {
            'pluginsContext': json.dumps(pluginsContext),
            'modelType': model_type,
            # pad with whitespaces as a workaround to avoid cutoffs
            'sequence': input_text.center(len(input_text) + 2, ' '),
            'pace': pace,
            'outfile': save_path,
            'vocoder': 'n/a',
            'base_lang': language,
            'base_emb': base_speaker_emb,
            'useSR': use_sr,
            'useCleanup': use_cleanup,
        }

        print('Synthesizing...')
        try:
            json_data = xvaserver.synthesize(data)
        # NOTE(review): this handler dates from the HTTP-based version; the
        # in-process call presumably never raises RequestException - confirm
        # which exceptions xvaserver.synthesize can raise.
        except requests.exceptions.RequestException as err:
            print(f'FAILED to synthesize: {err}')
            save_path = ''
            # Same shape as a successful response so the rendering and the
            # return statement below work: 'arpabet' is a '|'-joined string
            # and every emotion entry is a sequence read at index 0.
            json_data = {
                'arpabet': 'Failed',
                'durations': [0],
                'em_angry': [anger],
                'em_happy': [happy],
                'em_sad': [sad],
                'em_surprise': [surprise],
            }

        arpabet_html = '<h6>ARPAbet & Phoneme lengths</h6>'
        arpabet_symbols = json_data['arpabet'].split('|')
        utter_time = 0
        # zip stops at the shorter sequence, so a length mismatch between
        # symbols and durations cannot raise IndexError.
        for symbol, duration in zip(arpabet_symbols, json_data['durations']):
            # skip PAD symbol
            if symbol == '<PAD>':
                continue

            length = float(duration)
            # horizontal padding (in em) is half the phoneme duration
            arpa_length = round(length / 2, 1)
            arpabet_html += (
                f'<strong class="arpabet" style="padding: 0 {arpa_length}em"'
                f' title="{utter_time} + {length}">{symbol}</strong> '
            )
            utter_time += round(length, 1)

        return [
            save_path,
            arpabet_html,
            round(json_data['em_angry'][0], 2),
            round(json_data['em_happy'][0], 2),
            round(json_data['em_sad'][0], 2),
            round(json_data['em_surprise'][0], 2),
            json_data
        ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
|
161 |
if __name__ == "__main__":
    # Script entry point: announce start-up, build the local demo and
    # launch the Blocks app it exposes as `block`.
    print('running custom Gradio interface')
    demo = LocalBlocksDemo()
    demo.block.launch()
|
gr_client.py
CHANGED
@@ -1,9 +1,5 @@
|
|
1 |
import os
|
2 |
-
import sys
|
3 |
-
import time
|
4 |
-
import requests
|
5 |
import json
|
6 |
-
from huggingface_hub import hf_hub_download
|
7 |
import gradio as gr
|
8 |
from gradio_client import Client
|
9 |
|
@@ -21,7 +17,6 @@ voice_models_more = [
|
|
21 |
("๐ฉโ๐ฆณ #11697", "ccby_nvidia_hifi_11697_F"),
|
22 |
("Female #9136", "ccby_nvidia_hifi_9136_F"),
|
23 |
]
|
24 |
-
current_voice_model = None
|
25 |
|
26 |
# order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
|
27 |
languages = [
|
@@ -89,71 +84,6 @@ default_text = {
|
|
89 |
"zh": "่ฟๆฏๆ็ๅฃฐ้ณใ",
|
90 |
}
|
91 |
|
92 |
-
|
93 |
-
def predict(
|
94 |
-
input_text,
|
95 |
-
voice,
|
96 |
-
lang,
|
97 |
-
pacing,
|
98 |
-
pitch,
|
99 |
-
energy,
|
100 |
-
anger,
|
101 |
-
happy,
|
102 |
-
sad,
|
103 |
-
surprise,
|
104 |
-
deepmoji_checked
|
105 |
-
):
|
106 |
-
wav_path, arpabet_html, angry, happy, sad, surprise, response = client.predict(
|
107 |
-
input_text, # str in 'Input Text' Textbox component
|
108 |
-
voice, # Literal['ccby_nvidia_hifi_6670_M', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_12787_F', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_6671_M', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_9136_F', 'ccby_nvidia_hifi_92_F'] in 'Voice' Radio component
|
109 |
-
lang, # Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp'] in 'Language' Radio component
|
110 |
-
pacing, # float (numeric value between 0.5 and 2.0) in 'Duration' Slider component
|
111 |
-
pitch, # float (numeric value between 0 and 1.0) in 'Pitch' Slider component
|
112 |
-
energy, # float (numeric value between 0.1 and 1.0) in 'Energy' Slider component
|
113 |
-
anger, # float (numeric value between 0 and 1.0) in '๐ Anger' Slider component
|
114 |
-
happy, # float (numeric value between 0 and 1.0) in '๐ Happiness' Slider component
|
115 |
-
sad, # float (numeric value between 0 and 1.0) in '๐ญ Sadness' Slider component
|
116 |
-
surprise, # float (numeric value between 0 and 1.0) in '๐ฎ Surprise' Slider component
|
117 |
-
deepmoji_checked, # bool
|
118 |
-
api_name="/predict"
|
119 |
-
)
|
120 |
-
|
121 |
-
json_data = json.loads(response.replace("'", '"'))
|
122 |
-
|
123 |
-
arpabet_html = '<h6>ARPAbet & Durations</h6>'
|
124 |
-
arpabet_html += '<table style="margin: 0 var(--size-2)"><tbody><tr>'
|
125 |
-
arpabet_nopad = json_data['arpabet'].split('|PAD|')
|
126 |
-
arpabet_symbols = json_data['arpabet'].split('|')
|
127 |
-
wpad_len = len(arpabet_symbols)
|
128 |
-
nopad_len = len(arpabet_nopad)
|
129 |
-
total_dur_length = 0
|
130 |
-
for symb_i in range(wpad_len):
|
131 |
-
if (arpabet_symbols[symb_i] == '<PAD>'):
|
132 |
-
continue
|
133 |
-
total_dur_length += float(json_data['durations'][symb_i])
|
134 |
-
|
135 |
-
for symb_i in range(wpad_len):
|
136 |
-
if (arpabet_symbols[symb_i] == '<PAD>'):
|
137 |
-
continue
|
138 |
-
|
139 |
-
arpabet_length = float(json_data['durations'][symb_i])
|
140 |
-
cell_width = round(arpabet_length / total_dur_length * 100, 2)
|
141 |
-
arpabet_html += '<td class="arpabet" style="width: '\
|
142 |
-
+ str(cell_width)\
|
143 |
-
+'%">'\
|
144 |
-
+ arpabet_symbols[symb_i]\
|
145 |
-
+ '</td> '
|
146 |
-
arpabet_html += '<tr></tbody></table>'
|
147 |
-
|
148 |
-
return [
|
149 |
-
wav_path,
|
150 |
-
arpabet_html,
|
151 |
-
round(json_data['em_angry'][0], 2),
|
152 |
-
round(json_data['em_happy'][0], 2),
|
153 |
-
round(json_data['em_sad'][0], 2),
|
154 |
-
round(json_data['em_surprise'][0], 2)
|
155 |
-
]
|
156 |
-
|
157 |
# Component defaults
|
158 |
input_textbox_init = {
|
159 |
'label': "Input Text",
|
@@ -232,7 +162,7 @@ deepmoji_checkbox_init = {
|
|
232 |
}
|
233 |
|
234 |
def more_lang_options(lang):
|
235 |
-
print('more_lang_options')
|
236 |
if lang != 'more':
|
237 |
return lang
|
238 |
|
@@ -241,10 +171,14 @@ def more_lang_options(lang):
|
|
241 |
return gr.Radio(**radio_init)
|
242 |
|
243 |
def set_default_text(lang, deepmoji_checked):
|
244 |
-
print('set_default_text')
|
|
|
245 |
if lang == 'more':
|
246 |
-
|
247 |
-
# return
|
|
|
|
|
|
|
248 |
|
249 |
# DeepMoji only works on English Text
|
250 |
checkbox_init = {**deepmoji_checkbox_init}
|
@@ -257,9 +191,9 @@ def set_default_text(lang, deepmoji_checked):
|
|
257 |
# checkbox_init['info'] = "Works only with English!",
|
258 |
# checkbox_init['value'] = False,
|
259 |
# checkbox_init['interactive'] = False
|
|
|
260 |
|
261 |
-
|
262 |
-
return default_text[lang], deepmoji_checked
|
263 |
|
264 |
# examples component
|
265 |
en_examples = [
|
@@ -285,11 +219,11 @@ en_examples_dropdown_init = {
|
|
285 |
}
|
286 |
|
287 |
def set_example_as_input(example_text):
|
288 |
-
print('set_example_as_input')
|
289 |
return example_text
|
290 |
|
291 |
def toggle_example_dropdown(lang):
|
292 |
-
print('toggle_example_dropdown')
|
293 |
dropdown_init = {**en_examples_dropdown_init}
|
294 |
if lang == 'en':
|
295 |
dropdown_init['visible'] = True
|
@@ -299,7 +233,7 @@ def toggle_example_dropdown(lang):
|
|
299 |
return gr.Dropdown(**dropdown_init)
|
300 |
|
301 |
def more_voice_options(voice):
|
302 |
-
print('more_voice_options')
|
303 |
if voice != 'more':
|
304 |
return voice
|
305 |
|
@@ -314,7 +248,7 @@ def reset_em_sliders(
|
|
314 |
sad,
|
315 |
surprise
|
316 |
):
|
317 |
-
print('reset_em_sliders')
|
318 |
if (deepmoji_enabled):
|
319 |
return (0, 0, 0, 0)
|
320 |
else:
|
@@ -332,7 +266,7 @@ def toggle_deepmoji(
|
|
332 |
sad,
|
333 |
surprise
|
334 |
):
|
335 |
-
print('toggle_deepmoji')
|
336 |
if checked:
|
337 |
return (0, 0, 0, 0)
|
338 |
else:
|
@@ -348,190 +282,263 @@ language_radio_init = {
|
|
348 |
'choices': [*languages, *[(f'+{len(languages_more)}', 'more')]],
|
349 |
'value': "en",
|
350 |
'label': "Language",
|
351 |
-
'info': "Will be more monotone and have an English accent.
|
352 |
}
|
353 |
|
354 |
_DESCRIPTION = '''
|
355 |
<div>
|
356 |
<a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
|
357 |
-
<a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.
|
358 |
<a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
|
359 |
<span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
|
360 |
</div>
|
361 |
'''
|
362 |
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
language_radio = gr.Radio(**language_radio_init)
|
372 |
-
|
373 |
-
with gr.Row():
|
374 |
-
with gr.Column():
|
375 |
-
en_examples_dropdown = gr.Dropdown(**en_examples_dropdown_init)
|
376 |
-
with gr.Column():
|
377 |
-
pacing_slider = gr.Slider(**pacing_slider_init)
|
378 |
-
with gr.Column(): # Control column
|
379 |
-
voice_radio = gr.Radio(**voice_radio_init)
|
380 |
-
pitch_slider = gr.Slider(**pitch_slider_init)
|
381 |
-
energy_slider = gr.Slider(**energy_slider_init)
|
382 |
with gr.Row(): # Main row for inputs and language selection
|
383 |
with gr.Column(): # Input column
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
427 |
]
|
428 |
-
)
|
429 |
-
|
430 |
-
# more languages option
|
431 |
-
language_radio.change(
|
432 |
-
more_lang_options,
|
433 |
-
inputs=language_radio,
|
434 |
-
outputs=language_radio,
|
435 |
-
trigger_mode='once',
|
436 |
-
show_progress='hidden',
|
437 |
-
)
|
438 |
-
|
439 |
-
# more voices option
|
440 |
-
voice_radio.change(
|
441 |
-
more_voice_options,
|
442 |
-
inputs=voice_radio,
|
443 |
-
outputs=voice_radio,
|
444 |
-
trigger_mode='once',
|
445 |
-
show_progress='hidden',
|
446 |
-
queue=False,
|
447 |
-
)
|
448 |
-
|
449 |
-
# set default text
|
450 |
-
language_radio.change(
|
451 |
-
set_default_text,
|
452 |
-
inputs=[language_radio, deepmoji_checkbox],
|
453 |
-
outputs=[input_textbox, deepmoji_checkbox],
|
454 |
-
show_progress='hidden',
|
455 |
-
queue=False,
|
456 |
-
)
|
457 |
-
|
458 |
-
# toggle en examples
|
459 |
-
language_radio.change(
|
460 |
-
toggle_example_dropdown,
|
461 |
-
inputs=language_radio,
|
462 |
-
outputs=en_examples_dropdown,
|
463 |
-
show_progress='hidden',
|
464 |
-
queue=False,
|
465 |
-
)
|
466 |
-
|
467 |
-
en_examples_dropdown.change(
|
468 |
-
set_example_as_input,
|
469 |
-
inputs=[en_examples_dropdown],
|
470 |
-
outputs=[input_textbox],
|
471 |
-
show_progress='hidden',
|
472 |
-
queue=False,
|
473 |
-
)
|
474 |
-
|
475 |
-
deepmoji_checkbox.change(
|
476 |
-
toggle_deepmoji,
|
477 |
-
inputs=[
|
478 |
-
deepmoji_checkbox,
|
479 |
-
anger_slider,
|
480 |
-
happy_slider,
|
481 |
-
sad_slider,
|
482 |
-
surprise_slider
|
483 |
-
],
|
484 |
-
outputs=[
|
485 |
-
anger_slider,
|
486 |
-
happy_slider,
|
487 |
-
sad_slider,
|
488 |
-
surprise_slider
|
489 |
-
],
|
490 |
-
show_progress='hidden',
|
491 |
-
queue=False,
|
492 |
-
)
|
493 |
-
|
494 |
-
input_textbox.change(
|
495 |
-
reset_em_sliders,
|
496 |
-
inputs=[
|
497 |
-
deepmoji_checkbox,
|
498 |
-
anger_slider,
|
499 |
-
happy_slider,
|
500 |
-
sad_slider,
|
501 |
-
surprise_slider
|
502 |
-
],
|
503 |
-
outputs=[
|
504 |
-
anger_slider,
|
505 |
-
happy_slider,
|
506 |
-
sad_slider,
|
507 |
-
surprise_slider
|
508 |
-
],
|
509 |
-
show_progress='hidden',
|
510 |
-
queue=False,
|
511 |
-
)
|
512 |
-
|
513 |
-
voice_radio.change(
|
514 |
-
reset_em_sliders,
|
515 |
-
inputs=[
|
516 |
-
deepmoji_checkbox,
|
517 |
-
anger_slider,
|
518 |
-
happy_slider,
|
519 |
-
sad_slider,
|
520 |
-
surprise_slider
|
521 |
-
],
|
522 |
-
outputs=[
|
523 |
-
anger_slider,
|
524 |
-
happy_slider,
|
525 |
-
sad_slider,
|
526 |
-
surprise_slider
|
527 |
-
],
|
528 |
-
show_progress='hidden',
|
529 |
-
queue=False,
|
530 |
-
)
|
531 |
|
532 |
if __name__ == "__main__":
|
533 |
print('running Gradio interface')
|
534 |
-
# gradio_app.launch()
|
535 |
client = Client("Pendrokar/xVASynth")
|
536 |
|
537 |
-
demo
|
|
|
|
1 |
import os
|
|
|
|
|
|
|
2 |
import json
|
|
|
3 |
import gradio as gr
|
4 |
from gradio_client import Client
|
5 |
|
|
|
17 |
("๐ฉโ๐ฆณ #11697", "ccby_nvidia_hifi_11697_F"),
|
18 |
("Female #9136", "ccby_nvidia_hifi_9136_F"),
|
19 |
]
|
|
|
20 |
|
21 |
# order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
|
22 |
languages = [
|
|
|
84 |
"zh": "่ฟๆฏๆ็ๅฃฐ้ณใ",
|
85 |
}
|
86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
# Component defaults
|
88 |
input_textbox_init = {
|
89 |
'label': "Input Text",
|
|
|
162 |
}
|
163 |
|
164 |
def more_lang_options(lang):
|
165 |
+
# print('more_lang_options')
|
166 |
if lang != 'more':
|
167 |
return lang
|
168 |
|
|
|
171 |
return gr.Radio(**radio_init)
|
172 |
|
173 |
def set_default_text(lang, deepmoji_checked):
|
174 |
+
# print('set_default_text')
|
175 |
+
textbox_init = {**input_textbox_init}
|
176 |
if lang == 'more':
|
177 |
+
textbox_init['value'] = default_text['en']
|
178 |
+
# return default_text['en'], deepmoji_checked
|
179 |
+
return gr.Textbox(**textbox_init), deepmoji_checked
|
180 |
+
|
181 |
+
textbox_init['value'] = default_text[lang]
|
182 |
|
183 |
# DeepMoji only works on English Text
|
184 |
checkbox_init = {**deepmoji_checkbox_init}
|
|
|
191 |
# checkbox_init['info'] = "Works only with English!",
|
192 |
# checkbox_init['value'] = False,
|
193 |
# checkbox_init['interactive'] = False
|
194 |
+
# gr.Checkbox(**checkbox_init)
|
195 |
|
196 |
+
return gr.Textbox(**textbox_init), deepmoji_checked
|
|
|
197 |
|
198 |
# examples component
|
199 |
en_examples = [
|
|
|
219 |
}
|
220 |
|
221 |
def set_example_as_input(example_text):
|
222 |
+
# print('set_example_as_input')
|
223 |
return example_text
|
224 |
|
225 |
def toggle_example_dropdown(lang):
|
226 |
+
# print('toggle_example_dropdown')
|
227 |
dropdown_init = {**en_examples_dropdown_init}
|
228 |
if lang == 'en':
|
229 |
dropdown_init['visible'] = True
|
|
|
233 |
return gr.Dropdown(**dropdown_init)
|
234 |
|
235 |
def more_voice_options(voice):
|
236 |
+
# print('more_voice_options')
|
237 |
if voice != 'more':
|
238 |
return voice
|
239 |
|
|
|
248 |
sad,
|
249 |
surprise
|
250 |
):
|
251 |
+
# print('reset_em_sliders')
|
252 |
if (deepmoji_enabled):
|
253 |
return (0, 0, 0, 0)
|
254 |
else:
|
|
|
266 |
sad,
|
267 |
surprise
|
268 |
):
|
269 |
+
# print('toggle_deepmoji')
|
270 |
if checked:
|
271 |
return (0, 0, 0, 0)
|
272 |
else:
|
|
|
282 |
'choices': [*languages, *[(f'+{len(languages_more)}', 'more')]],
|
283 |
'value': "en",
|
284 |
'label': "Language",
|
285 |
+
'info': "Will be more monotone and have an English accent."
|
286 |
}
|
287 |
|
288 |
_DESCRIPTION = '''
|
289 |
<div>
|
290 |
<a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
|
291 |
+
<a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.4k-blue?logo=nexusmods'/></a>
|
292 |
<a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
|
293 |
<span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
|
294 |
</div>
|
295 |
'''
|
296 |
|
297 |
+
|
298 |
+
class BlocksDemo:
|
299 |
+
def __init__(self):
|
300 |
+
with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
|
301 |
+
gr.Markdown("# xVASynth TTS")
|
302 |
+
|
303 |
+
gr.HTML(label="description", value=_DESCRIPTION)
|
304 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
with gr.Row(): # Main row for inputs and language selection
|
306 |
with gr.Column(): # Input column
|
307 |
+
input_textbox = gr.Textbox(**input_textbox_init)
|
308 |
+
language_radio = gr.Radio(**language_radio_init)
|
309 |
+
|
310 |
+
# remove autofocus
|
311 |
+
input_textbox_init['autofocus'] = False
|
312 |
+
|
313 |
+
with gr.Row():
|
314 |
+
with gr.Column():
|
315 |
+
en_examples_dropdown = gr.Dropdown(**en_examples_dropdown_init)
|
316 |
+
with gr.Column():
|
317 |
+
pacing_slider = gr.Slider(**pacing_slider_init)
|
318 |
+
with gr.Column(): # Control column
|
319 |
+
voice_radio = gr.Radio(**voice_radio_init)
|
320 |
+
pitch_slider = gr.Slider(**pitch_slider_init)
|
321 |
+
energy_slider = gr.Slider(**energy_slider_init)
|
322 |
+
with gr.Row(): # Main row for inputs and language selection
|
323 |
+
with gr.Column(): # Input column
|
324 |
+
anger_slider = gr.Slider(**anger_slider_init)
|
325 |
+
sad_slider = gr.Slider(**sad_slider_init)
|
326 |
+
with gr.Column(): # Input column
|
327 |
+
happy_slider = gr.Slider(**happy_slider_init)
|
328 |
+
surprise_slider = gr.Slider(**surprise_slider_init)
|
329 |
+
deepmoji_checkbox = gr.Checkbox(**deepmoji_checkbox_init)
|
330 |
+
|
331 |
+
# Event handling using click
|
332 |
+
btn = gr.Button("Generate", variant="primary")
|
333 |
+
|
334 |
+
# with gr.Row(): # Main row for inputs and language selection
|
335 |
+
# with gr.Column(): # Input column
|
336 |
+
output_wav = gr.Audio(
|
337 |
+
label="22kHz audio output",
|
338 |
+
type="filepath",
|
339 |
+
editable=False,
|
340 |
+
autoplay=True
|
341 |
+
)
|
342 |
+
# with gr.Column(): # Input column
|
343 |
+
output_arpabet = gr.HTML(label="ARPAbet")
|
344 |
+
|
345 |
+
btn.click(
|
346 |
+
fn=self.predict,
|
347 |
+
inputs=[
|
348 |
+
input_textbox,
|
349 |
+
voice_radio,
|
350 |
+
language_radio,
|
351 |
+
pacing_slider,
|
352 |
+
pitch_slider,
|
353 |
+
energy_slider,
|
354 |
+
anger_slider,
|
355 |
+
happy_slider,
|
356 |
+
sad_slider,
|
357 |
+
surprise_slider,
|
358 |
+
deepmoji_checkbox
|
359 |
+
],
|
360 |
+
outputs=[
|
361 |
+
output_wav,
|
362 |
+
output_arpabet,
|
363 |
+
anger_slider,
|
364 |
+
happy_slider,
|
365 |
+
sad_slider,
|
366 |
+
surprise_slider
|
367 |
+
]
|
368 |
+
)
|
369 |
+
|
370 |
+
# more languages option
|
371 |
+
language_radio.change(
|
372 |
+
more_lang_options,
|
373 |
+
inputs=language_radio,
|
374 |
+
outputs=language_radio,
|
375 |
+
trigger_mode='once',
|
376 |
+
show_progress='hidden',
|
377 |
+
)
|
378 |
+
|
379 |
+
# more voices option
|
380 |
+
voice_radio.change(
|
381 |
+
more_voice_options,
|
382 |
+
inputs=voice_radio,
|
383 |
+
outputs=voice_radio,
|
384 |
+
trigger_mode='once',
|
385 |
+
show_progress='hidden',
|
386 |
+
queue=False,
|
387 |
+
)
|
388 |
+
|
389 |
+
# set default text
|
390 |
+
language_radio.change(
|
391 |
+
set_default_text,
|
392 |
+
inputs=[language_radio, deepmoji_checkbox],
|
393 |
+
outputs=[input_textbox, deepmoji_checkbox],
|
394 |
+
show_progress='hidden',
|
395 |
+
queue=False,
|
396 |
+
)
|
397 |
+
|
398 |
+
# toggle en examples
|
399 |
+
language_radio.change(
|
400 |
+
toggle_example_dropdown,
|
401 |
+
inputs=language_radio,
|
402 |
+
outputs=en_examples_dropdown,
|
403 |
+
show_progress='hidden',
|
404 |
+
queue=False,
|
405 |
+
)
|
406 |
+
|
407 |
+
en_examples_dropdown.change(
|
408 |
+
set_example_as_input,
|
409 |
+
inputs=[en_examples_dropdown],
|
410 |
+
outputs=[input_textbox],
|
411 |
+
show_progress='hidden',
|
412 |
+
queue=False,
|
413 |
+
)
|
414 |
+
|
415 |
+
deepmoji_checkbox.change(
|
416 |
+
toggle_deepmoji,
|
417 |
+
inputs=[
|
418 |
+
deepmoji_checkbox,
|
419 |
+
anger_slider,
|
420 |
+
happy_slider,
|
421 |
+
sad_slider,
|
422 |
+
surprise_slider
|
423 |
+
],
|
424 |
+
outputs=[
|
425 |
+
anger_slider,
|
426 |
+
happy_slider,
|
427 |
+
sad_slider,
|
428 |
+
surprise_slider
|
429 |
+
],
|
430 |
+
show_progress='hidden',
|
431 |
+
queue=False,
|
432 |
+
)
|
433 |
+
|
434 |
+
input_textbox.change(
|
435 |
+
reset_em_sliders,
|
436 |
+
inputs=[
|
437 |
+
deepmoji_checkbox,
|
438 |
+
anger_slider,
|
439 |
+
happy_slider,
|
440 |
+
sad_slider,
|
441 |
+
surprise_slider
|
442 |
+
],
|
443 |
+
outputs=[
|
444 |
+
anger_slider,
|
445 |
+
happy_slider,
|
446 |
+
sad_slider,
|
447 |
+
surprise_slider
|
448 |
+
],
|
449 |
+
show_progress='hidden',
|
450 |
+
queue=False,
|
451 |
+
)
|
452 |
+
|
453 |
+
voice_radio.change(
|
454 |
+
reset_em_sliders,
|
455 |
+
inputs=[
|
456 |
+
deepmoji_checkbox,
|
457 |
+
anger_slider,
|
458 |
+
happy_slider,
|
459 |
+
sad_slider,
|
460 |
+
surprise_slider
|
461 |
+
],
|
462 |
+
outputs=[
|
463 |
+
anger_slider,
|
464 |
+
happy_slider,
|
465 |
+
sad_slider,
|
466 |
+
surprise_slider
|
467 |
+
],
|
468 |
+
show_progress='hidden',
|
469 |
+
queue=False,
|
470 |
+
)
|
471 |
+
|
472 |
+
self.block = demo
|
473 |
+
|
474 |
+
def predict(
|
475 |
+
self,
|
476 |
+
input_text,
|
477 |
+
voice,
|
478 |
+
lang,
|
479 |
+
pacing,
|
480 |
+
pitch,
|
481 |
+
energy,
|
482 |
+
anger,
|
483 |
+
happy,
|
484 |
+
sad,
|
485 |
+
surprise,
|
486 |
+
deepmoji_checked
|
487 |
+
):
|
488 |
+
wav_path, arpabet_html, angry, happy, sad, surprise, response = client.predict(
|
489 |
+
input_text, # str in 'Input Text' Textbox component
|
490 |
+
voice, # Literal['ccby_nvidia_hifi_6670_M', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_12787_F', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_6671_M', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_9136_F', 'ccby_nvidia_hifi_92_F'] in 'Voice' Radio component
|
491 |
+
lang, # Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp'] in 'Language' Radio component
|
492 |
+
pacing, # float (numeric value between 0.5 and 2.0) in 'Duration' Slider component
|
493 |
+
pitch, # float (numeric value between 0 and 1.0) in 'Pitch' Slider component
|
494 |
+
energy, # float (numeric value between 0.1 and 1.0) in 'Energy' Slider component
|
495 |
+
anger, # float (numeric value between 0 and 1.0) in '๐ Anger' Slider component
|
496 |
+
happy, # float (numeric value between 0 and 1.0) in '๐ Happiness' Slider component
|
497 |
+
sad, # float (numeric value between 0 and 1.0) in '๐ญ Sadness' Slider component
|
498 |
+
surprise, # float (numeric value between 0 and 1.0) in '๐ฎ Surprise' Slider component
|
499 |
+
deepmoji_checked, # bool
|
500 |
+
api_name="/predict"
|
501 |
+
)
|
502 |
+
|
503 |
+
json_data = json.loads(response.replace("'", '"'))
|
504 |
+
|
505 |
+
arpabet_html = '<h6>ARPAbet & Durations</h6>'
|
506 |
+
arpabet_html += '<table style="margin: 0 var(--size-2)"><tbody><tr>'
|
507 |
+
arpabet_nopad = json_data['arpabet'].split('|PAD|')
|
508 |
+
arpabet_symbols = json_data['arpabet'].split('|')
|
509 |
+
wpad_len = len(arpabet_symbols)
|
510 |
+
nopad_len = len(arpabet_nopad)
|
511 |
+
total_dur_length = 0
|
512 |
+
for symb_i in range(wpad_len):
|
513 |
+
if (arpabet_symbols[symb_i] == '<PAD>'):
|
514 |
+
continue
|
515 |
+
total_dur_length += float(json_data['durations'][symb_i])
|
516 |
+
|
517 |
+
for symb_i in range(wpad_len):
|
518 |
+
if (arpabet_symbols[symb_i] == '<PAD>'):
|
519 |
+
continue
|
520 |
+
|
521 |
+
arpabet_length = float(json_data['durations'][symb_i])
|
522 |
+
cell_width = round(arpabet_length / total_dur_length * 100, 2)
|
523 |
+
arpabet_html += '<td class="arpabet" style="width: '\
|
524 |
+
+ str(cell_width)\
|
525 |
+
+'%">'\
|
526 |
+
+ arpabet_symbols[symb_i]\
|
527 |
+
+ '</td> '
|
528 |
+
arpabet_html += '<tr></tbody></table>'
|
529 |
+
|
530 |
+
return [
|
531 |
+
wav_path,
|
532 |
+
arpabet_html,
|
533 |
+
round(json_data['em_angry'][0], 2),
|
534 |
+
round(json_data['em_happy'][0], 2),
|
535 |
+
round(json_data['em_sad'][0], 2),
|
536 |
+
round(json_data['em_surprise'][0], 2)
|
537 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
538 |
|
539 |
if __name__ == "__main__":
|
540 |
print('running Gradio interface')
|
|
|
541 |
client = Client("Pendrokar/xVASynth")
|
542 |
|
543 |
+
demo = BlocksDemo()
|
544 |
+
demo.block.launch()
|