Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
colab
Browse files- .gitattributes +1 -0
- app.py +42 -34
- lexicon/zaonhe.json +19 -0
- lexicon/zaonhe.ocd2 +3 -0
- text/shanghainese.py +1 -1
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.o filter=lfs diff=lfs merge=lfs -text
|
33 |
*.dll filter=lfs diff=lfs merge=lfs -text
|
34 |
*.so filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.o filter=lfs diff=lfs merge=lfs -text
|
33 |
*.dll filter=lfs diff=lfs merge=lfs -text
|
34 |
*.so filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*.ocd2 filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
import re
|
@@ -16,8 +17,8 @@ from mel_processing import spectrogram_torch
|
|
16 |
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
17 |
|
18 |
|
19 |
-
def get_text(text, hps,
|
20 |
-
text_norm = text_to_sequence(text, hps.symbols, [] if
|
21 |
if hps.data.add_blank:
|
22 |
text_norm = commons.intersperse(text_norm, 0)
|
23 |
text_norm = LongTensor(text_norm)
|
@@ -25,20 +26,17 @@ def get_text(text, hps, is_phoneme):
|
|
25 |
|
26 |
|
27 |
def create_tts_fn(model, hps, speaker_ids):
|
28 |
-
def tts_fn(text, speaker, speed,
|
29 |
if limitation:
|
30 |
-
text_len = len(text)
|
31 |
-
max_len =
|
32 |
-
if
|
33 |
max_len *= 3
|
34 |
-
else:
|
35 |
-
if len(hps.data.text_cleaners) > 0 and hps.data.text_cleaners[0] == "zh_ja_mixture_cleaners":
|
36 |
-
text_len = len(re.sub("(\[ZH\]|\[JA\])", "", text))
|
37 |
if text_len > max_len:
|
38 |
return "Error: Text is too long", None
|
39 |
|
40 |
speaker_id = speaker_ids[speaker]
|
41 |
-
stn_tst = get_text(text, hps,
|
42 |
with no_grad():
|
43 |
x_tst = stn_tst.unsqueeze(0)
|
44 |
x_tst_lengths = LongTensor([stn_tst.size(0)])
|
@@ -115,11 +113,12 @@ def create_soft_vc_fn(model, hps, speaker_ids):
|
|
115 |
return soft_vc_fn
|
116 |
|
117 |
|
118 |
-
def
|
119 |
-
def
|
120 |
-
return _clean_text(
|
|
|
121 |
|
122 |
-
return
|
123 |
|
124 |
|
125 |
css = """
|
@@ -141,6 +140,10 @@ css = """
|
|
141 |
"""
|
142 |
|
143 |
if __name__ == '__main__':
|
|
|
|
|
|
|
|
|
144 |
models_tts = []
|
145 |
models_vc = []
|
146 |
models_soft_vc = []
|
@@ -170,50 +173,55 @@ if __name__ == '__main__':
|
|
170 |
if t == "vits":
|
171 |
models_tts.append((name, cover_path, speakers, lang, example,
|
172 |
hps.symbols, create_tts_fn(model, hps, speaker_ids),
|
173 |
-
|
174 |
models_vc.append((name, cover_path, speakers, create_vc_fn(model, hps, speaker_ids)))
|
175 |
elif t == "soft-vits-vc":
|
176 |
models_soft_vc.append((name, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
|
177 |
|
178 |
-
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft")
|
179 |
|
180 |
app = gr.Blocks(css=css)
|
181 |
|
182 |
with app:
|
183 |
gr.Markdown("# Moe TTS And Voice Conversion Using VITS Model\n\n"
|
184 |
-
"![visitor badge](https://visitor-badge.glitch.me/badge?page_id=skytnt.moegoe)\n\n"
|
|
|
|
|
|
|
185 |
with gr.Tabs():
|
186 |
with gr.TabItem("TTS"):
|
187 |
with gr.Tabs():
|
188 |
for i, (name, cover_path, speakers, lang, example, symbols, tts_fn,
|
189 |
-
|
190 |
with gr.TabItem(f"model{i}"):
|
191 |
with gr.Column():
|
192 |
cover_markdown = f"![cover](file/{cover_path})\n\n" if cover_path else ""
|
193 |
gr.Markdown(f"## {name}\n\n"
|
194 |
f"{cover_markdown}"
|
195 |
f"lang: {lang}")
|
196 |
-
tts_input1 = gr.TextArea(label="Text (
|
197 |
elem_id=f"tts-input{i}")
|
198 |
tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
|
199 |
type="index", value=speakers[0])
|
200 |
tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
|
201 |
with gr.Accordion(label="Advanced Options", open=False):
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
tts_submit = gr.Button("Generate", variant="primary")
|
209 |
tts_output1 = gr.Textbox(label="Output Message")
|
210 |
tts_output2 = gr.Audio(label="Output Audio")
|
211 |
-
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3,
|
212 |
[tts_output1, tts_output2])
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
(
|
|
|
|
|
217 |
let root = document.querySelector("body > gradio-app");
|
218 |
if (root.shadowRoot != null)
|
219 |
root = root.shadowRoot;
|
@@ -221,12 +229,12 @@ if __name__ == '__main__':
|
|
221 |
let startPos = text_input.selectionStart;
|
222 |
let endPos = text_input.selectionEnd;
|
223 |
let oldTxt = text_input.value;
|
224 |
-
let result = oldTxt.substring(0, startPos) +
|
225 |
text_input.value = result;
|
226 |
let x = window.scrollX, y = window.scrollY;
|
227 |
text_input.focus();
|
228 |
-
text_input.selectionStart = startPos +
|
229 |
-
text_input.selectionEnd = startPos +
|
230 |
text_input.blur();
|
231 |
window.scrollTo(x, y);
|
232 |
return [];
|
@@ -278,4 +286,4 @@ if __name__ == '__main__':
|
|
278 |
"- [https://github.com/luoyily/MoeTTS](https://github.com/luoyily/MoeTTS)\n"
|
279 |
"- [https://github.com/Francis-Komizu/Sovits](https://github.com/Francis-Komizu/Sovits)"
|
280 |
)
|
281 |
-
app.queue(concurrency_count=3).launch(show_api=False)
|
|
|
1 |
+
import argparse
|
2 |
import json
|
3 |
import os
|
4 |
import re
|
|
|
17 |
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
18 |
|
19 |
|
20 |
+
def get_text(text, hps, is_symbol):
|
21 |
+
text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
|
22 |
if hps.data.add_blank:
|
23 |
text_norm = commons.intersperse(text_norm, 0)
|
24 |
text_norm = LongTensor(text_norm)
|
|
|
26 |
|
27 |
|
28 |
def create_tts_fn(model, hps, speaker_ids):
|
29 |
+
def tts_fn(text, speaker, speed, is_symbol):
|
30 |
if limitation:
|
31 |
+
text_len = len(re.sub("\[([A-Z]{2})\]", "", text))
|
32 |
+
max_len = 150
|
33 |
+
if is_symbol:
|
34 |
max_len *= 3
|
|
|
|
|
|
|
35 |
if text_len > max_len:
|
36 |
return "Error: Text is too long", None
|
37 |
|
38 |
speaker_id = speaker_ids[speaker]
|
39 |
+
stn_tst = get_text(text, hps, is_symbol)
|
40 |
with no_grad():
|
41 |
x_tst = stn_tst.unsqueeze(0)
|
42 |
x_tst_lengths = LongTensor([stn_tst.size(0)])
|
|
|
113 |
return soft_vc_fn
|
114 |
|
115 |
|
116 |
+
def create_to_symbol_fn(hps):
|
117 |
+
def to_symbol_fn(is_symbol_input, input_text, temp_text):
|
118 |
+
return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
|
119 |
+
else (temp_text, temp_text)
|
120 |
|
121 |
+
return to_symbol_fn
|
122 |
|
123 |
|
124 |
css = """
|
|
|
140 |
"""
|
141 |
|
142 |
if __name__ == '__main__':
|
143 |
+
parser = argparse.ArgumentParser()
|
144 |
+
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
|
145 |
+
args = parser.parse_args()
|
146 |
+
|
147 |
models_tts = []
|
148 |
models_vc = []
|
149 |
models_soft_vc = []
|
|
|
173 |
if t == "vits":
|
174 |
models_tts.append((name, cover_path, speakers, lang, example,
|
175 |
hps.symbols, create_tts_fn(model, hps, speaker_ids),
|
176 |
+
create_to_symbol_fn(hps)))
|
177 |
models_vc.append((name, cover_path, speakers, create_vc_fn(model, hps, speaker_ids)))
|
178 |
elif t == "soft-vits-vc":
|
179 |
models_soft_vc.append((name, cover_path, speakers, create_soft_vc_fn(model, hps, speaker_ids)))
|
180 |
|
181 |
+
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
|
182 |
|
183 |
app = gr.Blocks(css=css)
|
184 |
|
185 |
with app:
|
186 |
gr.Markdown("# Moe TTS And Voice Conversion Using VITS Model\n\n"
|
187 |
+
"![visitor badge](https://visitor-badge.glitch.me/badge?page_id=skytnt.moegoe)\n\n"
|
188 |
+
"[Open In Colab]"
|
189 |
+
"(https://colab.research.google.com/drive/14Pb8lpmwZL-JI5Ub6jpG4sz2-8KS0kbS?usp=sharing)"
|
190 |
+
" without queue and length limitation")
|
191 |
with gr.Tabs():
|
192 |
with gr.TabItem("TTS"):
|
193 |
with gr.Tabs():
|
194 |
for i, (name, cover_path, speakers, lang, example, symbols, tts_fn,
|
195 |
+
to_symbol_fn) in enumerate(models_tts):
|
196 |
with gr.TabItem(f"model{i}"):
|
197 |
with gr.Column():
|
198 |
cover_markdown = f"![cover](file/{cover_path})\n\n" if cover_path else ""
|
199 |
gr.Markdown(f"## {name}\n\n"
|
200 |
f"{cover_markdown}"
|
201 |
f"lang: {lang}")
|
202 |
+
tts_input1 = gr.TextArea(label="Text (150 words limitation)", value=example,
|
203 |
elem_id=f"tts-input{i}")
|
204 |
tts_input2 = gr.Dropdown(label="Speaker", choices=speakers,
|
205 |
type="index", value=speakers[0])
|
206 |
tts_input3 = gr.Slider(label="Speed", value=1, minimum=0.5, maximum=2, step=0.1)
|
207 |
with gr.Accordion(label="Advanced Options", open=False):
|
208 |
+
temp_text_var = gr.Variable()
|
209 |
+
symbol_input = gr.Checkbox(value=False, label="Symbol input")
|
210 |
+
symbol_list = gr.Dataset(label="Symbol list", components=[tts_input1],
|
211 |
+
samples=[[x] for x in symbols],
|
212 |
+
elem_id=f"symbol-list{i}")
|
213 |
+
symbol_list_json = gr.Json(value=symbols, visible=False)
|
214 |
tts_submit = gr.Button("Generate", variant="primary")
|
215 |
tts_output1 = gr.Textbox(label="Output Message")
|
216 |
tts_output2 = gr.Audio(label="Output Audio")
|
217 |
+
tts_submit.click(tts_fn, [tts_input1, tts_input2, tts_input3, symbol_input],
|
218 |
[tts_output1, tts_output2])
|
219 |
+
symbol_input.change(to_symbol_fn,
|
220 |
+
[symbol_input, tts_input1, temp_text_var],
|
221 |
+
[tts_input1, temp_text_var])
|
222 |
+
symbol_list.click(None, [symbol_list, symbol_list_json], [],
|
223 |
+
_js=f"""
|
224 |
+
(i,symbols) => {{
|
225 |
let root = document.querySelector("body > gradio-app");
|
226 |
if (root.shadowRoot != null)
|
227 |
root = root.shadowRoot;
|
|
|
229 |
let startPos = text_input.selectionStart;
|
230 |
let endPos = text_input.selectionEnd;
|
231 |
let oldTxt = text_input.value;
|
232 |
+
let result = oldTxt.substring(0, startPos) + symbols[i] + oldTxt.substring(endPos);
|
233 |
text_input.value = result;
|
234 |
let x = window.scrollX, y = window.scrollY;
|
235 |
text_input.focus();
|
236 |
+
text_input.selectionStart = startPos + symbols[i].length;
|
237 |
+
text_input.selectionEnd = startPos + symbols[i].length;
|
238 |
text_input.blur();
|
239 |
window.scrollTo(x, y);
|
240 |
return [];
|
|
|
286 |
"- [https://github.com/luoyily/MoeTTS](https://github.com/luoyily/MoeTTS)\n"
|
287 |
"- [https://github.com/Francis-Komizu/Sovits](https://github.com/Francis-Komizu/Sovits)"
|
288 |
)
|
289 |
+
app.queue(concurrency_count=3).launch(show_api=False, share=args.share)
|
lexicon/zaonhe.json
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"name": "Shanghainese to IPA",
|
3 |
+
"segmentation": {
|
4 |
+
"type": "mmseg",
|
5 |
+
"dict": {
|
6 |
+
"type": "ocd2",
|
7 |
+
"file": "zaonhe.ocd2"
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"conversion_chain": [{
|
11 |
+
"dict": {
|
12 |
+
"type": "group",
|
13 |
+
"dicts": [{
|
14 |
+
"type": "ocd2",
|
15 |
+
"file": "zaonhe.ocd2"
|
16 |
+
}]
|
17 |
+
}
|
18 |
+
}]
|
19 |
+
}
|
lexicon/zaonhe.ocd2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a71b5a97eb49699f440137391565d208ea82156f0765986b7f3e16909e15672e
|
3 |
+
size 4095228
|
text/shanghainese.py
CHANGED
@@ -3,7 +3,7 @@ import cn2an
|
|
3 |
import opencc
|
4 |
|
5 |
|
6 |
-
converter = opencc.OpenCC('zaonhe')
|
7 |
|
8 |
# List of (Latin alphabet, ipa) pairs:
|
9 |
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
|
3 |
import opencc
|
4 |
|
5 |
|
6 |
+
converter = opencc.OpenCC('lexicon/zaonhe.json')
|
7 |
|
8 |
# List of (Latin alphabet, ipa) pairs:
|
9 |
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|