Pendrokar committed on
Commit
0191adb
1 Parent(s): 2f1b49f

language & voice reorder

Browse files
Files changed (1) hide show
  1. app.py +50 -47
app.py CHANGED
@@ -15,79 +15,81 @@ models_path = '/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvi
15
 
16
 
17
  voice_models = [
 
18
  ("Male #6670", "ccby_nvidia_hifi_6670_M"),
19
- ("Female #11614", "ccby_nv_hifi_11614_F"),
 
 
20
  ("Female #11697", "ccby_nvidia_hifi_11697_F"),
21
  ("Female #12787", "ccby_nvidia_hifi_12787_F"),
22
- ("Male #6097", "ccby_nvidia_hifi_6097_M"),
23
- ("Male #6671", "ccby_nvidia_hifi_6671_M"),
24
  ("Female #8051", "ccby_nvidia_hifi_8051_F"),
25
- ("Male #9017", "ccby_nvidia_hifi_9017_M"),
26
  ("Female #9136", "ccby_nvidia_hifi_9136_F"),
27
- ("Female #92", "ccby_nvidia_hifi_92_F"),
28
  ]
29
  current_voice_model = None
30
 
 
31
  languages = [
32
  ("🇬🇧 EN", "en"),
33
  ("🇩🇪 DE", "de"),
34
  ("🇪🇸 ES", "es"),
35
  ("🇮🇹 IT", "it"),
 
 
 
 
 
 
 
 
 
 
36
  ("🇫🇷 FR", "fr"),
37
  ("🇷🇺 RU", "ru"),
 
38
  ("🇹🇷 TR", "tr"),
39
- ("🇻🇦 LA", "la"),
40
- ("🇷🇴 RO", "ro"),
41
- ("🇩🇰 DA", "da"),
42
- ("🇻🇳 VI", "vi"),
43
- ("🇳🇬 HA", "ha"),
44
- ("🇳🇱 NL", "nl"),
45
- ("🇨🇳 ZH", "zh"),
46
  ("🇸🇦 AR", "ar"),
47
- ("🇺🇦 UK", "uk"),
48
  ("🇮🇳 HI", "hi"),
 
49
  ("🇰🇷 KO", "ko"),
50
- ("🇵🇱 PL", "pl"),
51
- ("🇸🇪 SW", "sw"),
52
- ("🇫🇮 FI", "fi"),
53
- ("🇭🇺 HU", "hu"),
54
- ("🇵🇹 PT", "pt"),
55
  ("🇳🇬 YO", "yo"),
56
- ("🇸🇪 SV", "sv"),
57
- ("🇬🇷 EL", "el"),
58
- ("🇸🇳 WO", "wo"),
59
- ("🇯🇵 JP", "jp"),
60
  ]
61
 
 
62
  default_text = {
63
- "en": "This is what my voice sounds like.",
 
64
  "de": "So klingt meine Stimme.",
 
 
65
  "es": "Así suena mi voz.",
66
- "it": "Così suona la mia voce.",
67
  "fr": "Voici à quoi ressemble ma voix.",
68
- "ru": "Вот как звучит мой голос.",
69
- "tr": "Benim sesimin sesi böyle.",
70
- "la": "Haec est vox mea sonans.",
71
- "ro": "Așa sună vocea mea.",
72
- "da": "Sådan lyder min stemme.",
73
- "vi": "Đây là giọng nói của tôi.",
74
  "ha": "Wannan ne muryata ke.",
75
- "nl": "Dit is hoe mijn stem klinkt.",
76
- "zh": "这是我的声音。",
77
- "ar": "هذا هو صوتي.",
78
- "uk": "Ось як звучить мій голос.",
79
  "hi": "यह मेरी आवाज़ कैसी लगती है।",
 
 
 
80
  "ko": "여기 제 목소리가 어떤지 들어보세요.",
 
 
81
  "pl": "Tak brzmi mój głos.",
82
- "sw": "Sauti yangu inasikika hivi.",
83
- "fi": "Näin ääneni kuulostaa.",
84
- "hu": "Így hangzik a hangom.",
85
  "pt": "É assim que minha voz soa.",
86
- "yo": "Ìyí ni ohùn mi ńlá.",
 
87
  "sv": "Såhär låter min röst.",
88
- "el": "Έτσι ακούγεται η φωνή μου.",
 
 
 
89
  "wo": "Ndox li neen xewnaal ma.",
90
- "jp": "これが私の声です。",
 
91
  }
92
 
93
  def run_xvaserver():
@@ -115,7 +117,7 @@ def run_xvaserver():
115
  print('xVAServer running on port 8008')
116
 
117
  # load default model
118
- load_model("ccby_nvidia_hifi_6670_M")
119
 
120
  # Wait for the process to exit
121
  xvaserver.wait()
@@ -207,6 +209,7 @@ def predict(
207
  input_textbox = gr.Textbox(
208
  label="Input Text",
209
  value="This is what my voice sounds like.",
 
210
  lines=1,
211
  max_lines=5,
212
  autofocus=True
@@ -214,15 +217,15 @@ input_textbox = gr.Textbox(
214
  pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
215
  pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
216
  energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
217
- anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger")
218
- happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness")
219
- sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness")
220
- surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise")
221
  voice_radio = gr.Radio(
222
  voice_models,
223
- value="ccby_nvidia_hifi_6670_M",
224
  label="Voice",
225
- info="NVIDIA HIFI CC-BY-4.0 xVAPitch/v3 xVASynth model"
226
  )
227
 
228
  def set_default_text(lang):
 
15
 
16
 
17
  voice_models = [
18
+ ("Male #6671", "ccby_nvidia_hifi_6671_M"),
19
  ("Male #6670", "ccby_nvidia_hifi_6670_M"),
20
+ ("Male #9017", "ccby_nvidia_hifi_9017_M"),
21
+ ("Male #6097", "ccby_nvidia_hifi_6097_M"),
22
+ ("Female #92", "ccby_nvidia_hifi_92_F"),
23
  ("Female #11697", "ccby_nvidia_hifi_11697_F"),
24
  ("Female #12787", "ccby_nvidia_hifi_12787_F"),
25
+ ("Female #11614", "ccby_nv_hifi_11614_F"),
 
26
  ("Female #8051", "ccby_nvidia_hifi_8051_F"),
 
27
  ("Female #9136", "ccby_nvidia_hifi_9136_F"),
 
28
  ]
29
  current_voice_model = None
30
 
31
+ # Ordered by similarity to English, because xVASynth uses ARPAbet instead of IPA
32
  languages = [
33
  ("🇬🇧 EN", "en"),
34
  ("🇩🇪 DE", "de"),
35
  ("🇪🇸 ES", "es"),
36
  ("🇮🇹 IT", "it"),
37
+ ("🇳🇱 NL", "nl"),
38
+ ("🇵🇹 PT", "pt"),
39
+ ("🇵🇱 PL", "pl"),
40
+ ("🇷🇴 RO", "ro"),
41
+ ("🇸🇪 SV", "sv"),
42
+ ("SW", "sw"),
43
+ ("🇩🇰 DA", "da"),
44
+ ("🇫🇮 FI", "fi"),
45
+ ("🇭🇺 HU", "hu"),
46
+ ("🇬🇷 EL", "el"),
47
  ("🇫🇷 FR", "fr"),
48
  ("🇷🇺 RU", "ru"),
49
+ ("🇺🇦 UK", "uk"),
50
  ("🇹🇷 TR", "tr"),
 
 
 
 
 
 
 
51
  ("🇸🇦 AR", "ar"),
 
52
  ("🇮🇳 HI", "hi"),
53
+ ("🇯🇵 JP", "jp"),
54
  ("🇰🇷 KO", "ko"),
55
+ ("🇨🇳 ZH", "zh"),
56
+ ("🇻🇳 VI", "vi"),
57
+ ("🇻🇦 LA", "la"),
58
+ ("HA", "ha"),
 
59
  ("🇳🇬 YO", "yo"),
60
+ ("WO", "wo"),
 
 
 
61
  ]
62
 
63
+ # Translated from English by DeepMind's Gemini Pro
64
  default_text = {
65
+ "ar": "هذا هو صوتي.",
66
+ "da": "Sådan lyder min stemme.",
67
  "de": "So klingt meine Stimme.",
68
+ "el": "Έτσι ακούγεται η φωνή μου.",
69
+ "en": "This is what my voice sounds like.",
70
  "es": "Así suena mi voz.",
71
+ "fi": "Näin ääneni kuulostaa.",
72
  "fr": "Voici à quoi ressemble ma voix.",
 
 
 
 
 
 
73
  "ha": "Wannan ne muryata ke.",
 
 
 
 
74
  "hi": "यह मेरी आवाज़ कैसी लगती है।",
75
+ "hu": "Így hangzik a hangom.",
76
+ "it": "Così suona la mia voce.",
77
+ "jp": "これが私の声です。",
78
  "ko": "여기 제 목소리가 어떤지 들어보세요.",
79
+ "la": "Haec est vox mea sonans.",
80
+ "nl": "Dit is hoe mijn stem klinkt.",
81
  "pl": "Tak brzmi mój głos.",
 
 
 
82
  "pt": "É assim que minha voz soa.",
83
+ "ro": "Așa sună vocea mea.",
84
+ "ru": "Вот как звучит мой голос.",
85
  "sv": "Såhär låter min röst.",
86
+ "sw": "Sauti yangu inasikika hivi.",
87
+ "tr": "Benim sesimin sesi böyle.",
88
+ "uk": "Ось як звучить мій голос.",
89
+ "vi": "Đây là giọng nói của tôi.",
90
  "wo": "Ndox li neen xewnaal ma.",
91
+ "yo": "Ìyí ni ohùn mi ńlá.",
92
+ "zh": "这是我的声音。",
93
  }
94
 
95
  def run_xvaserver():
 
117
  print('xVAServer running on port 8008')
118
 
119
  # load default model
120
+ load_model("ccby_nvidia_hifi_6671_M")
121
 
122
  # Wait for the process to exit
123
  xvaserver.wait()
 
209
  input_textbox = gr.Textbox(
210
  label="Input Text",
211
  value="This is what my voice sounds like.",
212
+ info="Also accepts ARPAbet symbols placed within {} brackets.",
213
  lines=1,
214
  max_lines=5,
215
  autofocus=True
 
217
  pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
218
  pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
219
  energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
220
+ anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
221
+ happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
222
+ sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
223
+ surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness with either being beyond 0.3")
224
  voice_radio = gr.Radio(
225
  voice_models,
226
+ value="ccby_nvidia_hifi_6671_M",
227
  label="Voice",
228
+ info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
229
  )
230
 
231
  def set_default_text(lang):