Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
language & voice reorder
Browse files
app.py
CHANGED
@@ -15,79 +15,81 @@ models_path = '/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvi
|
|
15 |
|
16 |
|
17 |
voice_models = [
|
|
|
18 |
("Male #6670", "ccby_nvidia_hifi_6670_M"),
|
19 |
-
("
|
|
|
|
|
20 |
("Female #11697", "ccby_nvidia_hifi_11697_F"),
|
21 |
("Female #12787", "ccby_nvidia_hifi_12787_F"),
|
22 |
-
("
|
23 |
-
("Male #6671", "ccby_nvidia_hifi_6671_M"),
|
24 |
("Female #8051", "ccby_nvidia_hifi_8051_F"),
|
25 |
-
("Male #9017", "ccby_nvidia_hifi_9017_M"),
|
26 |
("Female #9136", "ccby_nvidia_hifi_9136_F"),
|
27 |
-
("Female #92", "ccby_nvidia_hifi_92_F"),
|
28 |
]
|
29 |
current_voice_model = None
|
30 |
|
|
|
31 |
languages = [
|
32 |
("🇬🇧 EN", "en"),
|
33 |
("🇩🇪 DE", "de"),
|
34 |
("🇪🇸 ES", "es"),
|
35 |
("🇮🇹 IT", "it"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
("🇫🇷 FR", "fr"),
|
37 |
("🇷🇺 RU", "ru"),
|
|
|
38 |
("🇹🇷 TR", "tr"),
|
39 |
-
("🇻🇦 LA", "la"),
|
40 |
-
("🇷🇴 RO", "ro"),
|
41 |
-
("🇩🇰 DA", "da"),
|
42 |
-
("🇻🇳 VI", "vi"),
|
43 |
-
("🇳🇬 HA", "ha"),
|
44 |
-
("🇳🇱 NL", "nl"),
|
45 |
-
("🇨🇳 ZH", "zh"),
|
46 |
("🇸🇦 AR", "ar"),
|
47 |
-
("🇺🇦 UK", "uk"),
|
48 |
("🇮🇳 HI", "hi"),
|
|
|
49 |
("🇰🇷 KO", "ko"),
|
50 |
-
("
|
51 |
-
("
|
52 |
-
("
|
53 |
-
("
|
54 |
-
("🇵🇹 PT", "pt"),
|
55 |
("🇳🇬 YO", "yo"),
|
56 |
-
("
|
57 |
-
("🇬🇷 EL", "el"),
|
58 |
-
("🇸🇳 WO", "wo"),
|
59 |
-
("🇯🇵 JP", "jp"),
|
60 |
]
|
61 |
|
|
|
62 |
default_text = {
|
63 |
-
"
|
|
|
64 |
"de": "So klingt meine Stimme.",
|
|
|
|
|
65 |
"es": "Así suena mi voz.",
|
66 |
-
"
|
67 |
"fr": "Voici à quoi ressemble ma voix.",
|
68 |
-
"ru": "Вот как звучит мой голос.",
|
69 |
-
"tr": "Benim sesimin sesi böyle.",
|
70 |
-
"la": "Haec est vox mea sonans.",
|
71 |
-
"ro": "Așa sună vocea mea.",
|
72 |
-
"da": "Sådan lyder min stemme.",
|
73 |
-
"vi": "Đây là giọng nói của tôi.",
|
74 |
"ha": "Wannan ne muryata ke.",
|
75 |
-
"nl": "Dit is hoe mijn stem klinkt.",
|
76 |
-
"zh": "这是我的声音。",
|
77 |
-
"ar": "هذا هو صوتي.",
|
78 |
-
"uk": "Ось як звучить мій голос.",
|
79 |
"hi": "यह मेरी आवाज़ कैसी लगती है।",
|
|
|
|
|
|
|
80 |
"ko": "여기 제 목소리가 어떤지 들어보세요.",
|
|
|
|
|
81 |
"pl": "Tak brzmi mój głos.",
|
82 |
-
"sw": "Sauti yangu inasikika hivi.",
|
83 |
-
"fi": "Näin ääneni kuulostaa.",
|
84 |
-
"hu": "Így hangzik a hangom.",
|
85 |
"pt": "É assim que minha voz soa.",
|
86 |
-
"
|
|
|
87 |
"sv": "Såhär låter min röst.",
|
88 |
-
"
|
|
|
|
|
|
|
89 |
"wo": "Ndox li neen xewnaal ma.",
|
90 |
-
"
|
|
|
91 |
}
|
92 |
|
93 |
def run_xvaserver():
|
@@ -115,7 +117,7 @@ def run_xvaserver():
|
|
115 |
print('xVAServer running on port 8008')
|
116 |
|
117 |
# load default model
|
118 |
-
load_model("
|
119 |
|
120 |
# Wait for the process to exit
|
121 |
xvaserver.wait()
|
@@ -207,6 +209,7 @@ def predict(
|
|
207 |
input_textbox = gr.Textbox(
|
208 |
label="Input Text",
|
209 |
value="This is what my voice sounds like.",
|
|
|
210 |
lines=1,
|
211 |
max_lines=5,
|
212 |
autofocus=True
|
@@ -214,15 +217,15 @@ input_textbox = gr.Textbox(
|
|
214 |
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
|
215 |
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
|
216 |
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
|
217 |
-
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger")
|
218 |
-
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness")
|
219 |
-
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness")
|
220 |
-
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise")
|
221 |
voice_radio = gr.Radio(
|
222 |
voice_models,
|
223 |
-
value="
|
224 |
label="Voice",
|
225 |
-
info="NVIDIA HIFI CC-BY-4.0 xVAPitch
|
226 |
)
|
227 |
|
228 |
def set_default_text(lang):
|
|
|
15 |
|
16 |
|
17 |
voice_models = [
|
18 |
+
("Male #6671", "ccby_nvidia_hifi_6671_M"),
|
19 |
("Male #6670", "ccby_nvidia_hifi_6670_M"),
|
20 |
+
("Male #9017", "ccby_nvidia_hifi_9017_M"),
|
21 |
+
("Male #6097", "ccby_nvidia_hifi_6097_M"),
|
22 |
+
("Female #92", "ccby_nvidia_hifi_92_F"),
|
23 |
("Female #11697", "ccby_nvidia_hifi_11697_F"),
|
24 |
("Female #12787", "ccby_nvidia_hifi_12787_F"),
|
25 |
+
("Female #11614", "ccby_nv_hifi_11614_F"),
|
|
|
26 |
("Female #8051", "ccby_nvidia_hifi_8051_F"),
|
|
|
27 |
("Female #9136", "ccby_nvidia_hifi_9136_F"),
|
|
|
28 |
]
|
29 |
current_voice_model = None
|
30 |
|
31 |
+
# Ordered by similarity to English, because xVASynth uses ARPAbet instead of IPA
|
32 |
languages = [
|
33 |
("🇬🇧 EN", "en"),
|
34 |
("🇩🇪 DE", "de"),
|
35 |
("🇪🇸 ES", "es"),
|
36 |
("🇮🇹 IT", "it"),
|
37 |
+
("🇳🇱 NL", "nl"),
|
38 |
+
("🇵🇹 PT", "pt"),
|
39 |
+
("🇵🇱 PL", "pl"),
|
40 |
+
("🇷🇴 RO", "ro"),
|
41 |
+
("🇸🇪 SV", "sv"),
|
42 |
+
("SW", "sw"),
|
43 |
+
("🇩🇰 DA", "da"),
|
44 |
+
("🇫🇮 FI", "fi"),
|
45 |
+
("🇭🇺 HU", "hu"),
|
46 |
+
("🇬🇷 EL", "el"),
|
47 |
("🇫🇷 FR", "fr"),
|
48 |
("🇷🇺 RU", "ru"),
|
49 |
+
("🇺🇦 UK", "uk"),
|
50 |
("🇹🇷 TR", "tr"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
("🇸🇦 AR", "ar"),
|
|
|
52 |
("🇮🇳 HI", "hi"),
|
53 |
+
("🇯🇵 JP", "jp"),
|
54 |
("🇰🇷 KO", "ko"),
|
55 |
+
("🇨🇳 ZH", "zh"),
|
56 |
+
("🇻🇳 VI", "vi"),
|
57 |
+
("🇻🇦 LA", "la"),
|
58 |
+
("HA", "ha"),
|
|
|
59 |
("🇳🇬 YO", "yo"),
|
60 |
+
("WO", "wo"),
|
|
|
|
|
|
|
61 |
]
|
62 |
|
63 |
+
# Machine-translated from English by Google DeepMind's Gemini Pro
|
64 |
default_text = {
|
65 |
+
"ar": "هذا هو صوتي.",
|
66 |
+
"da": "Sådan lyder min stemme.",
|
67 |
"de": "So klingt meine Stimme.",
|
68 |
+
"el": "Έτσι ακούγεται η φωνή μου.",
|
69 |
+
"en": "This is what my voice sounds like.",
|
70 |
"es": "Así suena mi voz.",
|
71 |
+
"fi": "Näin ääneni kuulostaa.",
|
72 |
"fr": "Voici à quoi ressemble ma voix.",
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
"ha": "Wannan ne muryata ke.",
|
|
|
|
|
|
|
|
|
74 |
"hi": "यह मेरी आवाज़ कैसी लगती है।",
|
75 |
+
"hu": "Így hangzik a hangom.",
|
76 |
+
"it": "Così suona la mia voce.",
|
77 |
+
"jp": "これが私の声です。",
|
78 |
"ko": "여기 제 목소리가 어떤지 들어보세요.",
|
79 |
+
"la": "Haec est vox mea sonans.",
|
80 |
+
"nl": "Dit is hoe mijn stem klinkt.",
|
81 |
"pl": "Tak brzmi mój głos.",
|
|
|
|
|
|
|
82 |
"pt": "É assim que minha voz soa.",
|
83 |
+
"ro": "Așa sună vocea mea.",
|
84 |
+
"ru": "Вот как звучит мой голос.",
|
85 |
"sv": "Såhär låter min röst.",
|
86 |
+
"sw": "Sauti yangu inasikika hivi.",
|
87 |
+
"tr": "Benim sesimin sesi böyle.",
|
88 |
+
"uk": "Ось як звучить мій голос.",
|
89 |
+
"vi": "Đây là giọng nói của tôi.",
|
90 |
"wo": "Ndox li neen xewnaal ma.",
|
91 |
+
"yo": "Ìyí ni ohùn mi ńlá.",
|
92 |
+
"zh": "这是我的声音。",
|
93 |
}
|
94 |
|
95 |
def run_xvaserver():
|
|
|
117 |
print('xVAServer running on port 8008')
|
118 |
|
119 |
# load default model
|
120 |
+
load_model("ccby_nvidia_hifi_6671_M")
|
121 |
|
122 |
# Wait for the process to exit
|
123 |
xvaserver.wait()
|
|
|
209 |
input_textbox = gr.Textbox(
|
210 |
label="Input Text",
|
211 |
value="This is what my voice sounds like.",
|
212 |
+
info="Also accepts ARPAbet symbols placed within {} brackets.",
|
213 |
lines=1,
|
214 |
max_lines=5,
|
215 |
autofocus=True
|
|
|
217 |
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
|
218 |
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
|
219 |
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
|
220 |
+
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
|
221 |
+
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
|
222 |
+
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
|
223 |
+
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness with either being beyond 0.3")
|
224 |
voice_radio = gr.Radio(
|
225 |
voice_models,
|
226 |
+
value="ccby_nvidia_hifi_6671_M",
|
227 |
label="Voice",
|
228 |
+
info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
|
229 |
)
|
230 |
|
231 |
def set_default_text(lang):
|