Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
lojban v2 voice model
Browse files
- README.md +3 -1
- app.py +9 -3
- gr_client.py +36 -5
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
title: xVASynth TTS
|
3 |
-
emoji:
|
4 |
colorFrom: gray
|
5 |
colorTo: gray
|
6 |
sdk: gradio
|
@@ -9,6 +9,7 @@ sdk_version: 4.20.0
|
|
9 |
models:
|
10 |
- Pendrokar/xvapitch_nvidia
|
11 |
- Pendrokar/TorchMoji
|
|
|
12 |
app_file: app.py
|
13 |
app_port: 7860
|
14 |
tags:
|
@@ -20,6 +21,7 @@ pinned: false
|
|
20 |
preload_from_hub:
|
21 |
- Pendrokar/xvapitch_nvidia
|
22 |
- Pendrokar/TorchMoji
|
|
|
23 |
license: gpl-3.0
|
24 |
thumbnail: >-
|
25 |
https://raw.githubusercontent.com/DanRuta/xVA-Synth/master/assets/x-icon.png
|
|
|
1 |
---
|
2 |
title: xVASynth TTS
|
3 |
+
emoji: 🧝‍♀️🧛‍♂️🧚‍♀️
|
4 |
colorFrom: gray
|
5 |
colorTo: gray
|
6 |
sdk: gradio
|
|
|
9 |
models:
|
10 |
- Pendrokar/xvapitch_nvidia
|
11 |
- Pendrokar/TorchMoji
|
12 |
+
- Pendrokar/xvasynth_lojban
|
13 |
app_file: app.py
|
14 |
app_port: 7860
|
15 |
tags:
|
|
|
21 |
preload_from_hub:
|
22 |
- Pendrokar/xvapitch_nvidia
|
23 |
- Pendrokar/TorchMoji
|
24 |
+
- Pendrokar/xvasynth_lojban
|
25 |
license: gpl-3.0
|
26 |
thumbnail: >-
|
27 |
https://raw.githubusercontent.com/DanRuta/xVA-Synth/master/assets/x-icon.png
|
app.py
CHANGED
@@ -15,16 +15,22 @@ model_repo = HfApi()
|
|
15 |
commits = model_repo.list_repo_commits(repo_id=hf_model_name)
|
16 |
latest_commit_sha = commits[0].commit_id
|
17 |
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
|
|
|
18 |
models_path = hf_cache_models_path
|
19 |
|
20 |
current_voice_model = None
|
21 |
base_speaker_emb = ''
|
22 |
|
23 |
def load_model(voice_model_name):
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
-
|
27 |
-
language = 'en'
|
28 |
|
29 |
data = {
|
30 |
'outputs': None,
|
|
|
15 |
commits = model_repo.list_repo_commits(repo_id=hf_model_name)
|
16 |
latest_commit_sha = commits[0].commit_id
|
17 |
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
|
18 |
+
hf_cache_lojban_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvasynth_lojban/snapshots/{latest_commit_sha}/'
|
19 |
models_path = hf_cache_models_path
|
20 |
|
21 |
current_voice_model = None
|
22 |
base_speaker_emb = ''
|
23 |
|
24 |
def load_model(voice_model_name):
|
25 |
+
if voice_model_name == 'x_selpahi':
|
26 |
+
# Lojban
|
27 |
+
model_path = hf_cache_lojban_models_path + voice_model_name
|
28 |
+
model_type = 'FastPitch1.1'
|
29 |
+
else:
|
30 |
+
model_path = models_path + voice_model_name
|
31 |
+
model_type = 'xVAPitch'
|
32 |
|
33 |
+
language = 'en' # seems to have no effect if generated text is from a different language
|
|
|
34 |
|
35 |
data = {
|
36 |
'outputs': None,
|
gr_client.py
CHANGED
@@ -9,13 +9,14 @@ voice_models = [
|
|
9 |
]
|
10 |
voice_models_more = [
|
11 |
("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
|
12 |
-
("
|
13 |
-
("
|
14 |
("👩🦱 #12787", "ccby_nvidia_hifi_12787_F"),
|
15 |
("👵 #11614", "ccby_nv_hifi_11614_F"),
|
16 |
-
("
|
17 |
("👩🦳 #11697", "ccby_nvidia_hifi_11697_F"),
|
18 |
-
("
|
|
|
19 |
]
|
20 |
|
21 |
# ordered by similarity to English, since xVASynth uses ARPAbet rather than IPA
|
@@ -52,6 +53,11 @@ languages_more = [
|
|
52 |
("Wolof", "wo"),
|
53 |
]
|
54 |
|
|
|
|
|
|
|
|
|
|
|
55 |
# Translated from English by DeepMind's Gemini Pro
|
56 |
default_text = {
|
57 |
"ar": "هذا هو صوتي.",
|
@@ -66,6 +72,7 @@ default_text = {
|
|
66 |
"hi": "यह मेरी आवाज़ कैसी लगती है।",
|
67 |
"hu": "Így hangzik a hangom.",
|
68 |
"it": "Così suona la mia voce.",
|
|
|
69 |
"jp": "これが私の声です。",
|
70 |
"ko": "여기 제 목소리가 어떤지 들어보세요.",
|
71 |
"la": "Haec est vox mea sonans.",
|
@@ -285,6 +292,19 @@ language_radio_init = {
|
|
285 |
'info': "Will be more monotone and have an English accent."
|
286 |
}
|
287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
288 |
_DESCRIPTION = '''
|
289 |
<div>
|
290 |
<a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
|
@@ -475,10 +495,21 @@ class BlocksDemo:
|
|
475 |
queue=False,
|
476 |
)
|
477 |
|
|
|
478 |
voice_radio.change(
|
479 |
self.set_default_audio,
|
480 |
inputs=voice_radio,
|
481 |
-
outputs=output_wav
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
482 |
)
|
483 |
|
484 |
return demo
|
|
|
9 |
]
|
10 |
voice_models_more = [
|
11 |
("🧔 #6670", "ccby_nvidia_hifi_6670_M"),
|
12 |
+
("👨🦲 #9017", "ccby_nvidia_hifi_9017_M"),
|
13 |
+
("🧑 #6097", "ccby_nvidia_hifi_6097_M"),
|
14 |
("👩🦱 #12787", "ccby_nvidia_hifi_12787_F"),
|
15 |
("👵 #11614", "ccby_nv_hifi_11614_F"),
|
16 |
+
("👩🦰 #8051", "ccby_nvidia_hifi_8051_F"),
|
17 |
("👩🦳 #11697", "ccby_nvidia_hifi_11697_F"),
|
18 |
+
("👩🦲 #9136", "ccby_nvidia_hifi_9136_F"),
|
19 |
+
("♟ Lojban", "x_selpahi"), # v2 model for Lojban, predating xVASynth's multilingual capabilities
|
20 |
]
|
21 |
|
22 |
# ordered by similarity to English, since xVASynth uses ARPAbet rather than IPA
|
|
|
53 |
("Wolof", "wo"),
|
54 |
]
|
55 |
|
56 |
+
lojban_lang = [
|
57 |
+
# Lojban has no ISO 639-1 code (its ISO 639-3 code is "jbo"); "jb" is used here as a stand-in two-letter code
|
58 |
+
('♟ Lojban', 'jb')
|
59 |
+
]
|
60 |
+
|
61 |
# Translated from English by DeepMind's Gemini Pro
|
62 |
default_text = {
|
63 |
"ar": "هذا هو صوتي.",
|
|
|
72 |
"hi": "यह मेरी आवाज़ कैसी लगती है।",
|
73 |
"hu": "Így hangzik a hangom.",
|
74 |
"it": "Così suona la mia voce.",
|
75 |
+
"jb": ".i ",
|
76 |
"jp": "これが私の声です。",
|
77 |
"ko": "여기 제 목소리가 어떤지 들어보세요.",
|
78 |
"la": "Haec est vox mea sonans.",
|
|
|
292 |
'info': "Will be more monotone and have an English accent."
|
293 |
}
|
294 |
|
295 |
+
def set_lojban_language(voice, lang):
|
296 |
+
if voice != 'x_selpahi':
|
297 |
+
return lang
|
298 |
+
|
299 |
+
radio_init = {**language_radio_init}
|
300 |
+
radio_init['choices'] = [
|
301 |
+
*lojban_lang,
|
302 |
+
*languages,
|
303 |
+
*languages_more,
|
304 |
+
]
|
305 |
+
radio_init['value'] = lojban_lang[0][1]
|
306 |
+
return gr.Radio(**radio_init)
|
307 |
+
|
308 |
_DESCRIPTION = '''
|
309 |
<div>
|
310 |
<a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
|
|
|
495 |
queue=False,
|
496 |
)
|
497 |
|
498 |
+
# Replace output with voice audio sample
|
499 |
voice_radio.change(
|
500 |
self.set_default_audio,
|
501 |
inputs=voice_radio,
|
502 |
+
outputs=output_wav,
|
503 |
+
queue=True,
|
504 |
+
)
|
505 |
+
|
506 |
+
# Switched to Lojban voice
|
507 |
+
voice_radio.change(
|
508 |
+
set_lojban_language,
|
509 |
+
inputs=[voice_radio, language_radio],
|
510 |
+
outputs=[language_radio],
|
511 |
+
trigger_mode='once',
|
512 |
+
queue=True,
|
513 |
)
|
514 |
|
515 |
return demo
|