# Spaces: Running
from vietTTS.hifigan.mel2wave import mel2wave | |
from vietTTS.nat.text2mel import text2mel | |
from vietTTS import nat_normalize_text | |
import numpy as np | |
import gradio as gr | |
import os | |
def download_assets():
    """Fetch the model checkpoints and support files into the working directory.

    Three checkpoints are fetched with ``gdown`` and two text assets
    (``config.json`` and ``lexicon.txt``) with ``wget``.  A file that already
    exists is not downloaded again, which also stops ``wget`` from writing
    ``config.json.1``-style duplicates on a restart.

    The function is best-effort: a missing downloader or a non-zero exit
    status is reported on stderr instead of raising, so the caller keeps
    running either way.

    Returns:
        None.
    """
    # Function-scope imports keep this block self-contained.
    import subprocess
    import sys

    raw_base = "https://raw.githubusercontent.com/NTT123/vietTTS/master/assets"
    # (output file, argv) pairs.  Argument lists avoid going through a shell.
    downloads = [
        (
            "duration_latest_ckpt.pickle",
            ["gdown", "--id", "16UhN8QBxG1YYwUh8smdEeVnKo9qZhvZj",
             "-O", "duration_latest_ckpt.pickle"],
        ),
        (
            "acoustic_latest_ckpt.pickle",
            ["gdown", "--id", "1-8Ig65S3irNHSzcskT37SLgeyuUhjKdj",
             "-O", "acoustic_latest_ckpt.pickle"],
        ),
        (
            "hk_hifi.pickle",
            ["gdown", "--id", "19cRNDC6IrHFAAE4U9I7K0mzLMgPsi5zb",
             "-O", "hk_hifi.pickle"],
        ),
        ("config.json", ["wget", raw_base + "/hifigan/config.json"]),
        ("lexicon.txt", ["wget", raw_base + "/infore/lexicon.txt"]),
    ]

    for target, command in downloads:
        if os.path.isfile(target):
            # Already fetched by an earlier run; nothing to do.
            continue
        try:
            status = subprocess.run(command, check=False).returncode
        except OSError as err:
            # Raised when the downloader executable cannot be started.
            print(f"download_assets: cannot run {command[0]!r}: {err}",
                  file=sys.stderr)
            continue
        if status != 0:
            print(
                f"download_assets: {command[0]!r} exited with status {status} "
                f"while fetching {target!r}",
                file=sys.stderr,
            )
            # Drop any partial output so the next start retries the download
            # instead of treating a truncated file as complete.
            try:
                os.remove(target)
            except OSError:
                pass
def text_to_speech(text):
    """Synthesize *text* and return the waveform as 16-bit integer samples.

    The input is cut to its first 500 characters, normalized with
    ``nat_normalize_text``, turned into a mel-spectrogram by ``text2mel`` and
    converted to a waveform by ``mel2wave``.  The lexicon, checkpoint and
    config files are passed to those functions as relative file names.

    Args:
        text: The text to synthesize.

    Returns:
        The waveform scaled by ``2**15``, clipped to the ``int16`` range and
        cast to ``np.int16``.
    """
    # Prevent too long text; slicing leaves shorter strings unchanged.
    text = nat_normalize_text(text[:500])
    mel = text2mel(
        text,
        "lexicon.txt",
        0.2,  # NOTE(review): meaning not visible here; confirm against text2mel
        "acoustic_latest_ckpt.pickle",
        "duration_latest_ckpt.pickle",
    )
    wave = mel2wave(mel, "config.json", "hk_hifi.pickle")
    # Assumes `wave` holds float samples nominally in [-1, 1] -- TODO confirm.
    # Clip before the cast: a sample of exactly +1.0 scales to 32768, which
    # does not fit in int16 and would otherwise wrap around to -32768.
    scaled = np.clip(wave * (2**15), -(2**15), 2**15 - 1)
    return scaled.astype(np.int16)
def speak(text):
    """Synthesize *text* and pair the samples with their sample rate.

    Args:
        text: The text to synthesize; forwarded to ``text_to_speech``.

    Returns:
        A ``(sample_rate, samples)`` tuple with a sample rate of 16000.
    """
    sample_rate = 16_000
    samples = text_to_speech(text)
    return sample_rate, samples
# Fetch the model files before the interface is built and launched.
download_assets()

title = "vietTTS"
description = "A vietnamese text-to-speech demo."

# Text in, audio out: `speak` is the callback behind the interface.
demo = gr.Interface(
    fn=speak,
    inputs="text",
    outputs="audio",
    title=title,
    description=description,
    theme="default",
    allow_screenshot=False,
    allow_flagging="never",
)
demo.launch(debug=False)