Load initial demo.

Files changed:
- .gitignore +167 -0
- README.md +14 -11
- app.py +194 -0
- requirements.txt +7 -0
- src/__init__.py +4 -0
- src/convert.py +21 -0
- src/helpers.py +29 -0
- src/lookups.py +72 -0
- src/synthesize.py +85 -0
- target_speaker.wav +0 -0
.gitignore
ADDED
@@ -0,0 +1,167 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/


# MY FILES
dev_roadmap.txt
README.md
CHANGED
@@ -1,12 +1,15 @@
-
-
-emoji: 🐠
-colorFrom: blue
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.37.0
-app_file: app.py
-pinned: false
----
+# Mockingbird TTS Demo
+This repo hosts Mockingbird, a demo of open Text-to-Speech tools.
 
-
+Currently, 3 synthesizers are supported:
+- [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model
+- [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package and the models supplied through it
+- [**ESpeak-NG's**](https://github.com/espeak-ng/espeak-ng/tree/master) synthetic voices
+
+Voice conversion is achieved through Coqui.
+
+Notes:
+1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
+2. Coqui is no longer being officially developed.
+3. Where a synthesizer supports multiple models/voices, I manually pick the appropriate model.
+4. Not all synthesizers support a given language.
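
The files below implement this flow. As a reading aid, here is a minimal end-to-end sketch (not part of the commit) of synthesizing with MMS and then converting the voice with Coqui, assuming this commit's src package is importable and target_speaker.wav is on disk; the output filenames are only illustrative:

import numpy as np
import scipy.io.wavfile

from src import synth_mms, convert_coqui, models, placeholders

lang = "swh"
text = placeholders[lang]                              # sample sentence from src/lookups.py

# Synthesize with the default MMS checkpoint for this language.
audio, rate = synth_mms(text, models[lang]['mms'])
scipy.io.wavfile.write("source_speaker.wav", rate=rate, data=audio.T)   # illustrative filename

# Convert the synthesized voice toward the bundled target speaker.
converted, conv_rate = convert_coqui("source_speaker.wav", "target_speaker.wav")
scipy.io.wavfile.write("converted.wav", rate=conv_rate,
                       data=np.asarray(converted, dtype=np.float32))    # illustrative filename
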
app.py
ADDED
@@ -0,0 +1,194 @@
import torch
import scipy
import os
import streamlit as st
import pandas as pd
from transformers import pipeline  #set_seed,
from transformers import VitsTokenizer, VitsModel
from datasets import load_dataset, Audio
from huggingface_hub.inference_api import InferenceApi

from src import *


########################

st.title("Mockingbird")
st.header("A demo of open Text to Speech tools")
tts, about = st.tabs(["Text to speech", "**About**"])

########################
with tts:

    # Configurations -- language choice and text
    tts_lang = st.selectbox('Language of text', language_list, format_func=decode_iso)
    tts_text = st.text_area(label="Please enter your sentence here:",
                            value="", placeholder=placeholders[tts_lang])

    target_speaker_file = st.file_uploader("If you would like to test voice conversion, you may upload your audio below. You should upload one file in .wav format. If you don't, a default file will be used.",
                                           type=['wav'])

    # Inference
    if st.button("Generate"):

        # Warning about alphabet support
        if tts_lang in ['rus', 'fas']:
            st.warning("WARNING! On Windows, ESpeak-NG has trouble synthesizing output when input is provided from non-Latin alphabets.")

        st.divider()

        # Synthesis
        with st.spinner(":rainbow[Synthesizing, please wait... (this will be slowest the first time you generate audio in a new language)]"):
            if tts_text == "":
                tts_text = placeholders[tts_lang]

            # First, make the audio
            base_mms = synth_mms(tts_text, models[tts_lang]['mms'])
            base_coqui = synth_coqui(tts_text, models[tts_lang]['coqui'])
            base_espeakng = synth_espeakng(tts_text, models[tts_lang]['espeakng'])

            if tts_lang == "swh":
                finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-swh-female-1")
                finetuned_mms2 = synth_mms(tts_text, "khof312/mms-tts-swh-female-2")

            #vc_mms
            #vc_coqui
            #vc_espeakng

        "## Synthesis"
        "### Default models"
        row1 = st.columns([1, 1, 2])
        row2 = st.columns([1, 1, 2])
        row3 = st.columns([1, 1, 2])
        row4 = st.columns([1, 1, 2])

        row1[0].write("**Model**")
        row1[1].write("**Configuration**")
        row1[2].write("**Audio**")

        if base_mms is not None:
            row2[0].write("Meta MMS")
            row2[1].write("default")
            row2[2].audio(base_mms[0], sample_rate=base_mms[1])

        if base_coqui is not None:
            row3[0].write("Coqui")
            row3[1].write("default")
            row3[2].audio(base_coqui[0], sample_rate=base_coqui[1])

        if base_espeakng is not None:
            row4[0].write("Espeak-ng")
            row4[1].write("default")
            row4[2].audio(base_espeakng[0], sample_rate=base_espeakng[1])

        #################################################################
        if tts_lang == "swh":
            "### Fine Tuned"
            row1 = st.columns([1, 1, 2])
            row2 = st.columns([1, 1, 2])
            row3 = st.columns([1, 1, 2])

            row1[0].write("**Model**")
            row1[1].write("**Configuration**")
            row1[2].write("**Audio**")

            row2[0].write("Meta MMS")
            row2[1].write("female 1")
            row2[2].audio(finetuned_mms1[0], sample_rate=finetuned_mms1[1])
            row3[0].write("Meta MMS")
            row3[1].write("female 2")
            row3[2].audio(finetuned_mms2[0], sample_rate=finetuned_mms2[1])

        st.divider()

        "## Voice conversion"  #################################################################

        st.warning('''Note: The naturalness of the audio will only be as good as that of the audio in "default models" above.''')

        if target_speaker_file is not None:
            rate, wav = scipy.io.wavfile.read(target_speaker_file)
            scipy.io.wavfile.write("target_speaker_custom.wav", data=wav, rate=rate)
            target_speaker = "target_speaker_custom.wav"
        else:
            target_speaker = "target_speaker.wav"

        if base_mms is not None:
            scipy.io.wavfile.write("source_speaker_mms.wav", rate=base_mms[1], data=base_mms[0].T)
            converted_mms = convert_coqui('source_speaker_mms.wav', target_speaker)

        if base_coqui is not None:
            scipy.io.wavfile.write("source_speaker_coqui.wav", rate=base_coqui[1], data=base_coqui[0].T)
            converted_coqui = convert_coqui('source_speaker_coqui.wav', target_speaker)

        if base_espeakng is not None:
            scipy.io.wavfile.write("source_speaker_espeakng.wav", rate=base_espeakng[1], data=base_espeakng[0].T)
            converted_espeakng = convert_coqui('source_speaker_espeakng.wav', target_speaker)

        # Display the converted audio: a header row plus one row per synthesizer,
        # mirroring the layout of the synthesis table above.
        row1 = st.columns([1, 1, 2])
        row2 = st.columns([1, 1, 2])
        row3 = st.columns([1, 1, 2])
        row4 = st.columns([1, 1, 2])

        row1[0].write("**Model**")
        row1[1].write("**Configuration**")
        row1[2].write("**Audio**")

        if base_mms is not None:
            row2[0].write("Meta MMS")
            row2[1].write("converted")
            row2[2].audio(converted_mms[0], sample_rate=converted_mms[1])

        if base_coqui is not None:
            row3[0].write("Coqui")
            row3[1].write("converted")
            row3[2].audio(converted_coqui[0], sample_rate=converted_coqui[1])

        if base_espeakng is not None:
            row4[0].write("Espeak-ng")
            row4[1].write("converted")
            row4[2].audio(converted_espeakng[0], sample_rate=converted_espeakng[1])


        #row3[0].write("MMS-TTS-SWH")
        #row3[1].audio(synth, sample_rate=16_000)
        #row3[2].audio(synth, sample_rate=16_000)

        #st.audio(synth, sample_rate=16_000)
        #data.write(np.random.randn(10, 1)


        #col1.subheader("A wide column with a chart")
        #col1.line_chart(data)

        #col2.subheader("A narrow column with the data")
        #col2.write(data)

with about:
    #st.header("How it works")
    st.markdown('''# Mockingbird TTS Demo
This page is a demo of the openly available Text to Speech models for various languages of interest. Currently, 3 synthesizers are supported:
- [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]
- [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package;[^2] while no longer supported, Coqui acted as a hub for TTS model hosting and these models are still available.
- [**ESpeak-NG**](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices.[^3]

Voice conversion is achieved through Coqui.

Notes:
1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
2. Where a synthesizer supports multiple models/voices, I manually pick the appropriate model.
3. Not all synthesizers support a given language.



[^1]: Endpoints used are of the form https://huggingface.co/facebook/mms-tts-[LANG].
      Learn more:
      [Docs](https://huggingface.co/docs/transformers/model_doc/mms) |
      [Paper](https://arxiv.org/abs/2305.13516) |
      [Supported languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html)

[^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
[^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
''')
requirements.txt
ADDED
@@ -0,0 +1,7 @@
datasets
librosa
pycountry
scipy
sentencepiece
transformers
torch
src/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .helpers import *
from .lookups import *
from .synthesize import *
from .convert import *
src/convert.py
ADDED
@@ -0,0 +1,21 @@
import torch
import IPython
from TTS.api import TTS

def convert_coqui(source_wav:str, target_wav:str):
    '''
    Use Coqui TTS for zero-shot voice conversion.

    Inputs:
        source_wav: Wav of the thing you want to say.
        target_wav: Wav of the speaker you want to hear.
    Returns:
        Streaming wav and sampling rate.
    '''
    # Get device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to(device)
    wav = tts.voice_conversion(source_wav=source_wav, target_wav=target_wav)

    return wav, 24000  # Identified sampling rate of freevc24
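
For reference, a minimal usage sketch of convert_coqui (not part of the commit); "source.wav" is a hypothetical input file, while target_speaker.wav ships with this repo:

from src.convert import convert_coqui

# "source.wav" is illustrative only; any mono .wav utterance works as the source.
wav, rate = convert_coqui("source.wav", "target_speaker.wav")
print(len(wav), rate)  # rate is 24000, as returned above
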
src/helpers.py
ADDED
@@ -0,0 +1,29 @@
import pycountry

iso_encoder = {"English": "eng",
               "French": "fra",
               "Moore": "mos"}

iso_decoder = dict((v, k) for k, v in iso_encoder.items())


def encode_iso(lang:str) -> str:
    ''' Takes the name of a language and returns its ISO-3 code. '''
    return iso_encoder[lang]

def decode_iso(iso:str) -> str:
    ''' Takes an ISO-3 code and returns the name of the language. '''

    if "-" in iso:
        iso, suffix = iso.split("-", 1)
    else:
        suffix = None

    name = pycountry.languages.get(alpha_3=iso).name
    name = name.replace("Mossi", "Mooré").replace("Swahili (individual language)", "Swahili")

    if suffix is not None:
        name += f" - {suffix}"

    return name
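
A quick check of these helpers (not part of the commit); the expected values follow from the dictionary and string replacements above:

from src.helpers import encode_iso, decode_iso

print(encode_iso("French"))            # "fra"
print(decode_iso("swh"))               # "Swahili", after the "(individual language)" suffix is stripped
print(decode_iso("mos"))               # "Mooré", via the "Mossi" replacement
print(decode_iso("kmr-script_latin"))  # pycountry name for "kmr" plus " - script_latin"
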
src/lookups.py
ADDED
@@ -0,0 +1,72 @@
language_list = ['swh', 'eng', 'spa', 'fra', 'por', 'ron', 'fas', 'lin', 'mos', 'rus',
                 #'ara','fas','ukr','tur', 'mya', 'rus',
                 #'kmr-script_latin', 'urd-script_arabic', 'urd-script_devanagari', 'urd-script_latin',
                 ]

#####################################
placeholders = {
    'swh': "Mfuko wa Kimataifa wa Watoto",
    'eng': "the United Nations International Children's Emergency Fund",
    'spa': "El Fondo de las Naciones Unidas para la Infancia",
    'fra': "Le Fonds des Nations unies pour l'enfance",
    'por': "O Fundo das Nações Unidas para a Infância",
    'ron': "Fondul Internațional pentru Urgențe ale Copiilor al Națiunilor Unite",
    'fas': "صندوق کودکان ملل متحد",
    'lin': 'Your phrase here',
    'mos': 'Your phrase here',
    'rus': 'Международного фонда помощи детям'  ###
}

#####################################
models = {
    'swh': {
        'mms': 'facebook/mms-tts-swh',
        'coqui': None,
        'espeakng': 'sw',
    },
    'eng': {
        'mms': 'facebook/mms-tts-eng',
        'coqui': None,
        'espeakng': 'en',
    },
    'spa': {
        'mms': 'facebook/mms-tts-spa',
        'coqui': 'tts_models/es/css10/vits',
        'espeakng': 'es-419',
    },
    'fra': {
        'mms': 'facebook/mms-tts-fra',
        'coqui': 'tts_models/fr/css10/vits',
        'espeakng': 'fr',
    },
    'por': {
        'mms': 'facebook/mms-tts-por',
        'coqui': 'tts_models/pt/cv/vits',
        'espeakng': 'pt-br',
    },
    'ron': {
        'mms': 'facebook/mms-tts-ron',
        'coqui': 'tts_models/ro/cv/vits',
        'espeakng': 'ro',
    },
    'fas': {
        'mms': 'facebook/mms-tts-fas',
        'coqui': None,  #'tts_models/fa/custom/glow-tts',
        'espeakng': 'fa',
    },
    'lin': {
        'mms': None,
        'coqui': 'tts_models/lin/openbible/vits',
        'espeakng': None,
    },
    'mos': {
        'mms': 'facebook/mms-tts-mos',
        'coqui': None,
        'espeakng': None,
    },
    'rus': {
        'mms': 'facebook/mms-tts-rus',
        'coqui': None,
        'espeakng': 'ru',
    }
}
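
The tables above drive the language dropdown and defaults in app.py; a small sketch (not part of the commit) of how they are consumed:

from src.helpers import decode_iso
from src.lookups import language_list, models, placeholders

# For each supported language, list which engines have a model or voice configured.
for lang in language_list:
    engines = [engine for engine, model_id in models[lang].items() if model_id is not None]
    print(f"{decode_iso(lang)}: {', '.join(engines) or 'none'}")

# placeholders[lang] is the sentence used when the text box is left empty.
print(placeholders['fra'])
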
src/synthesize.py
ADDED
@@ -0,0 +1,85 @@
import IPython
from huggingface_hub.inference_api import InferenceApi
import torch
from TTS.api import TTS
import wave
from espeakng import ESpeakNG
import subprocess
from scipy.io import wavfile
from transformers import pipeline
import os

def synth_mms(text:str, model:str):
    '''
    Use Huggingface inference pipeline to synthesize text.
    (Can be replaced by inference API, but that requires a stored API token.)

    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN
    Returns:
        Streaming numpy and sampling rate.
    '''
    #inference = InferenceApi(repo_id=f"facebook/{model}",
    #                         token=API_TOKEN)
    #mms_tts = inference(inputs=text,
    #                    raw_response=True)._content

    if model is not None:
        pipe = pipeline("text-to-speech", model=model, device=-1)  # Change device if it should use GPU
        mms_tts = pipe(text)
        return mms_tts['audio'], mms_tts['sampling_rate']
    else:
        return None



def synth_coqui(text:str, model:str):
    '''
    Use Coqui inference API to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Streaming Wav and sampling rate.
    '''
    if model is not None:
        # Get device
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Init TTS
        tts = TTS(model, progress_bar=False).to(device)

        tts.tts_to_file(text=text, file_path="test.wav", is_multi_speaker=False)

        sampling_rate, wav = wavfile.read('test.wav')
        os.remove("test.wav")

        #wav = tts.tts(text=text)
        return wav, sampling_rate
    else:
        return None


def synth_espeakng(text:str, model:str):
    '''
    Use ESpeak-NG to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Streaming Wav and sampling rate.
    '''
    if model is not None:

        subprocess.run(['espeak-ng', f'-v{model}', '-w', 'test.wav', text]).returncode

        sampling_rate, wav = wavfile.read('test.wav')
        os.remove("test.wav")

        #wav = tts.tts(text=text)
        return wav, sampling_rate
    else:
        return None
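
A minimal sketch (not part of the commit) of calling these synthesizers directly; the model identifiers and sample sentences come from src/lookups.py, and the output filenames here are only illustrative:

import scipy.io.wavfile
from src.synthesize import synth_mms, synth_espeakng

# MMS returns (numpy audio, sampling rate), or None when no model is configured.
audio, rate = synth_mms("the United Nations International Children's Emergency Fund",
                        "facebook/mms-tts-eng")
scipy.io.wavfile.write("mms_eng.wav", rate=rate, data=audio.T)

# ESpeak-NG takes a voice code rather than a Hugging Face model id.
audio, rate = synth_espeakng("Le Fonds des Nations unies pour l'enfance", "fr")
scipy.io.wavfile.write("espeakng_fra.wav", rate=rate, data=audio)
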
target_speaker.wav
ADDED
Binary file (51.5 kB).