Add support for IMS Toucan.
- app.py +21 -3
- src/synthesize.py +30 -0
app.py
CHANGED
@@ -50,6 +50,7 @@ type=['wav'])
 base_mms = synth_mms(tts_text, models[tts_lang]['mms'])
 base_coqui= synth_coqui(tts_text, models[tts_lang]['coqui'])
 base_espeakng= synth_espeakng(tts_text, models[tts_lang]['espeakng'])
+base_toucan= synth_toucan(tts_text, models[tts_lang]['toucan'])
 
 if tts_lang=="swh":
     finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-swh-female-1")
@@ -68,6 +69,7 @@ type=['wav'])
 row2 = st.columns([1,1,2])
 row3 = st.columns([1,1,2])
 row4 = st.columns([1,1,2])
+row5 = st.columns([1,1,2])
 
 row1[0].write("**Model**")
 row1[1].write("**Configuration**")
@@ -84,10 +86,15 @@ type=['wav'])
 row3[2].audio(base_coqui[0], sample_rate = base_coqui[1])
 
 if base_espeakng is not None:
-
     row4[0].write(f"[Espeak-ng](https://github.com/espeak-ng/espeak-ng)")
     row4[1].write("default")
     row4[2].audio(base_espeakng[0], sample_rate = base_espeakng[1])
+
+
+row5[0].write(f"[IMS-Toucan](https://github.com/DigitalPhonetics/IMS-Toucan)")
+row5[1].write("default")
+row5[2].audio(base_toucan[0], sample_rate = base_toucan[1])
+
 
 #################################################################
 if tts_lang == "swh":
@@ -156,9 +163,13 @@ type=['wav'])
 scipy.io.wavfile.write("source_speaker_espeakng.wav", rate=base_espeakng[1], data=base_espeakng[0].T)
 converted_espeakng = convert_coqui('source_speaker_espeakng.wav', target_speaker)
 
+scipy.io.wavfile.write("source_speaker_toucan.wav", rate=base_toucan[1], data=base_toucan[0].T)
+converted_toucan = convert_coqui('source_speaker_toucan.wav', target_speaker)
+
 row1 = st.columns([1,1,2])
 row2 = st.columns([1,1,2])
 row3 = st.columns([1,1,2])
+row4 = st.columns([1,1,2])
 
 row1[0].write("**Model**")
 row1[1].write("**Configuration**")
@@ -178,6 +189,11 @@ type=['wav'])
 row3[0].write(f"Espeak-ng")
 row3[1].write(f"converted")
 row3[2].audio(converted_espeakng[0], sample_rate = converted_espeakng[1])
+
+
+row4[0].write(f"IMS Toucan")
+row4[1].write(f"converted")
+row4[2].audio(converted_toucan[0], sample_rate = converted_toucan[1])
 
 
 #row3[0].write("MMS-TTS-SWH")
@@ -197,12 +213,13 @@ type=['wav'])
 with about:
     #st.header("How it works")
     st.markdown('''# Mockingbird TTS Demo
-This page is a demo of the openly available Text to Speech models for various languages of interest. Currently,
+This page is a demo of the openly available Text to Speech models for various languages of interest. Currently, 4 synthesizers are supported:
 - [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]
 - [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package;[^2] while no longer supported, Coqui acted as a hub for TTS model hosting, and these models are still available.
 - [**ESpeak-NG**](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices[^3]
+- [**IMS Toucan**](https://github.com/DigitalPhonetics/IMS-Toucan), which supports over 7,000 languages.[^4]
 
-Voice conversion is achieved through Coqui.
+Voice conversion is currently achieved through Coqui.
 
 Notes:
 1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
@@ -219,5 +236,6 @@ type=['wav'])
 
 [^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
 [^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
+[^4]: Language list is available in the Gradio API documentation [here](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS).
 ''')
 
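The new `synth_toucan` call is looked up per language via `models[tts_lang]['toucan']`, so each supported language needs a Toucan entry in the model registry. A minimal sketch of one such entry follows; only the key names mirror the lookups above, while every value (checkpoint ids, voice codes, and the exact language-string format the Space's dropdown expects) is a hypothetical placeholder, not taken from this commit.

```python
# Hypothetical sketch of one entry in the `models` registry that app.py indexes.
# All values below are assumptions for illustration only.
models = {
    "swh": {                             # Swahili, the language this commit exercises
        "mms": "facebook/mms-tts-swh",   # assumed MMS checkpoint id
        "coqui": None,                   # placeholder; actual Coqui model unknown
        "espeakng": "sw",                # assumed eSpeak NG voice code
        "toucan": "Swahili (swh)",       # assumed; must match the Space's language selector
    },
}
```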
src/synthesize.py
CHANGED
@@ -9,6 +9,8 @@ from scipy.io import wavfile
 from transformers import pipeline
 import os
 import numpy as np
+from gradio_client import Client, file
+
 
 def synth_mms(text:str, model:str):
     '''
@@ -86,3 +88,31 @@ def synth_espeakng(text:str, model:str):
         return wav, sampling_rate
     else:
         return None
+
+
+def synth_toucan(text:str, model:str):
+    '''
+    Use Toucan to synthesize text.
+
+    Inputs:
+        text: Text to synthesize
+        model: Model code
+    Returns:
+        Streaming wav and sampling rate.
+
+    NOTE: This wrapper does not expose the full range of options possible with the API. The API should allow you to generate female voices; however, that does not seem to be working at the moment.
+    '''
+    client = Client("Flux9665/MassivelyMultilingualTTS")
+    result = client.predict(
+        prompt=text,
+        language=model,
+        reference_audio=file('https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav'),
+        voice_seed=123,
+        prosody_creativity=0.1,
+        duration_scaling_factor=1,
+        emb1=0,
+        emb2=0,
+        api_name="/predict"
+    )
+    sampling_rate, wav = wavfile.read(result[0])
+    return wav, sampling_rate