File size: 1,450 Bytes
cd75eda
 
 
 
aa0196c
e89bb30
aa0196c
cd75eda
e89bb30
 
 
 
 
 
cd75eda
 
f4d29b7
daea792
 
cd75eda
 
acbc346
 
 
 
 
cd75eda
e89bb30
cd75eda
 
 
 
 
 
 
e89bb30
 
 
cd75eda
 
 
e89bb30
cd75eda
 
 
 
e89bb30
926a62f
 
ba54530
fd46c98
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from vietTTS.hifigan.mel2wave import mel2wave
from vietTTS.nat.text2mel import text2mel
from vietTTS import nat_normalize_text
import numpy as np
import gradio as gr
import os


def download_assets():
    """Download the model checkpoints and resource files into the working directory.

    Three files are fetched with ``gdown`` (by Google Drive file id) and two
    with ``wget`` (by URL). An asset that already exists as a non-empty file
    is skipped, so restarting the app neither re-downloads it nor lets
    ``wget`` create numbered duplicates such as ``config.json.1``.

    The function is best-effort: a failing or missing download tool is
    reported on stdout and the remaining assets are still attempted. Nothing
    is raised and nothing is returned.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    gdown_assets = (
        ("16UhN8QBxG1YYwUh8smdEeVnKo9qZhvZj", "duration_latest_ckpt.pickle"),
        ("1-8Ig65S3irNHSzcskT37SLgeyuUhjKdj", "acoustic_latest_ckpt.pickle"),
        ("19cRNDC6IrHFAAE4U9I7K0mzLMgPsi5zb", "hk_hifi.pickle"),
    )
    wget_urls = (
        "https://raw.githubusercontent.com/NTT123/vietTTS/master/assets/hifigan/config.json",
        "https://raw.githubusercontent.com/NTT123/vietTTS/master/assets/infore/lexicon.txt",
    )

    # (target filename, argv) pairs; argv lists avoid going through a shell.
    jobs = [
        (filename, ["gdown", "--id", file_id, "-O", filename])
        for file_id, filename in gdown_assets
    ]
    for url in wget_urls:
        filename = url.rsplit("/", 1)[-1]
        # Pin the output name so wget never writes "<name>.1".
        jobs.append((filename, ["wget", url, "-O", filename]))

    for target, argv in jobs:
        if os.path.isfile(target) and os.path.getsize(target) > 0:
            continue  # already downloaded

        try:
            status = subprocess.run(argv, check=False).returncode
        except OSError as err:
            # Typically the tool is not installed / not on PATH.
            print(f"download_assets: could not run {argv[0]!r}: {err}")
            continue

        if status != 0:
            print(
                f"download_assets: {argv[0]!r} exited with status {status} "
                f"while fetching {target!r}"
            )
            # Drop any partial/empty file so the next start retries it.
            if os.path.isfile(target):
                os.remove(target)

def text_to_speech(text):
    """Synthesize *text* and return the waveform as 16-bit integer samples.

    The text is truncated to 500 characters, passed through
    ``nat_normalize_text``, turned into a mel-spectrogram by ``text2mel`` and
    finally into a waveform by ``mel2wave``. The asset files are read from
    the current working directory.

    Args:
        text: The text to synthesize.

    Returns:
        The waveform multiplied by ``2**15``, clipped to the int16 range and
        cast to ``np.int16``.
    """
    max_chars = 500
    # Prevent overly long inputs; slicing is a no-op for shorter text.
    text = text[:max_chars]
    text = nat_normalize_text(text)
    mel = text2mel(
        text,
        "lexicon.txt",
        0.2,  # NOTE(review): presumably a silence duration — confirm against text2mel
        "acoustic_latest_ckpt.pickle",
        "duration_latest_ckpt.pickle",
    )
    wave = mel2wave(mel, "config.json", "hk_hifi.pickle")
    # Clip before casting: a sample of +1.0 scales to 32768, one past the
    # int16 maximum, and would otherwise overflow in the cast.
    scaled = np.clip(wave * (2**15), -(2**15), 2**15 - 1)
    return scaled.astype(np.int16)


def speak(text):
    """Interface callback: synthesize *text* and return ``(16000, samples)``.

    The first tuple element is the fixed value 16000 and the second is
    whatever ``text_to_speech`` returns for *text*.
    """
    sample_rate = 16_000
    return sample_rate, text_to_speech(text)


# Fetch the asset files before the interface is built and served.
download_assets()

title = "vietTTS"
description = "A vietnamese text-to-speech demo."

# Build the text -> audio interface around `speak`, then start serving it.
demo = gr.Interface(
    fn=speak,
    inputs="text",
    outputs="audio",
    title=title,
    description=description,
    theme="default",
    allow_screenshot=False,
    allow_flagging="never",
)
demo.launch(debug=False)