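# vietTTS / app.py
# Web file-viewer residue kept for reference (not part of the program):
# ntt123's picture; last commit "Update app.py" (f4d29b7);
# raw / history / blame links; file size 1.45 kB.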
from vietTTS.hifigan.mel2wave import mel2wave
from vietTTS.nat.text2mel import text2mel
from vietTTS import nat_normalize_text
import numpy as np
import gradio as gr
import os
def download_assets():
    """Download the model checkpoints and auxiliary files.

    Each download is run as a shell command through ``os.system``, in order:
    three ``gdown`` commands writing ``duration_latest_ckpt.pickle``,
    ``acoustic_latest_ckpt.pickle`` and ``hk_hifi.pickle``, then two ``wget``
    commands for the ``config.json`` and ``lexicon.txt`` URLs. The exit status
    of each command is ignored, so a failed download does not raise here.
    """
    commands = (
        "gdown --id 16UhN8QBxG1YYwUh8smdEeVnKo9qZhvZj -O duration_latest_ckpt.pickle",
        "gdown --id 1-8Ig65S3irNHSzcskT37SLgeyuUhjKdj -O acoustic_latest_ckpt.pickle",
        "gdown --id 19cRNDC6IrHFAAE4U9I7K0mzLMgPsi5zb -O hk_hifi.pickle",
        "wget https://raw.githubusercontent.com/NTT123/vietTTS/master/assets/hifigan/config.json",
        "wget https://raw.githubusercontent.com/NTT123/vietTTS/master/assets/infore/lexicon.txt",
    )
    # Run sequentially, one shell invocation per asset.
    for command in commands:
        os.system(command)
def text_to_speech(text):
    """Synthesize *text* and return the waveform as 16-bit PCM samples.

    The input is truncated to its first 100 characters, normalized with
    ``nat_normalize_text``, turned into a mel-spectrogram by ``text2mel`` and
    finally into a waveform by ``mel2wave``.

    Args:
        text: The text to synthesize.

    Returns:
        A NumPy ``int16`` array: the waveform scaled by ``2**15`` and
        saturated to the ``int16`` range.
    """
    # Prevent too-long text; slicing is a no-op for inputs of <= 100 chars.
    text = nat_normalize_text(text[:100])
    mel = text2mel(
        text,
        "lexicon.txt",
        0.2,  # NOTE(review): presumably a silence duration -- confirm against text2mel
        "acoustic_latest_ckpt.pickle",
        "duration_latest_ckpt.pickle",
    )
    wave = mel2wave(mel, "config.json", "hk_hifi.pickle")
    # Assumes mel2wave yields float samples nominally in [-1, 1] (TODO confirm).
    # A sample of exactly 1.0 scales to 32768, which does not fit in int16, so
    # clip before the cast to make such peaks saturate instead of wrapping.
    pcm = np.clip(wave * (2**15), -(2**15), 2**15 - 1)
    return pcm.astype(np.int16)
def speak(text):
    """Synthesize *text* and return a ``(sample_rate, samples)`` pair.

    The sample rate is fixed at 16 000 and the samples are whatever
    ``text_to_speech`` returns for *text*.
    """
    sample_rate = 16_000
    samples = text_to_speech(text)
    return sample_rate, samples
# Fetch the checkpoints and config files before the interface is built.
download_assets()

title = "vietTTS"
description = "A vietnamese text-to-speech demo."

# Text in, audio out, served by `speak`; screenshots and flagging are disabled.
demo = gr.Interface(
    fn=speak,
    inputs="text",
    outputs="audio",
    title=title,
    description=description,
    theme="default",
    allow_screenshot=False,
    allow_flagging="never",
)
demo.launch(debug=False)