# NOTE(review): the lines below were Hugging Face Spaces page chrome (status
# badges, file size, commit hashes, and the line-number gutter) captured when
# this file was scraped; commented out so the module parses as Python.
# Spaces: Running | File size: 6,626 Bytes
# Commits: af3d42a 76282ae 88d4840 1890488
from infer import OnnxInferenceSession
from text import cleaned_text_to_sequence, get_bert
from text.cleaner import clean_text
import numpy as np
from huggingface_hub import hf_hub_download
import asyncio
from pathlib import Path
OnnxSession = None
models = [
{
"local_path": "./bert/bert-large-cantonese",
"repo_id": "hon9kon9ize/bert-large-cantonese",
"files": [
"pytorch_model.bin"
]
},
{
"local_path": "./bert/deberta-v3-large",
"repo_id": "microsoft/deberta-v3-large",
"files": [
"spm.model",
"pytorch_model.bin"
]
},
{
"local_path": "./onnx",
"repo_id": "hon9kon9ize/bert-vits-zoengjyutgaai-onnx",
"files": [
"BertVits2.2PT.json",
"BertVits2.2PT/BertVits2.2PT_enc_p.onnx",
"BertVits2.2PT/BertVits2.2PT_emb.onnx",
"BertVits2.2PT/BertVits2.2PT_dp.onnx",
"BertVits2.2PT/BertVits2.2PT_sdp.onnx",
"BertVits2.2PT/BertVits2.2PT_flow.onnx",
"BertVits2.2PT/BertVits2.2PT_dec.onnx"
]
}
]
def get_onnx_session():
global OnnxSession
if OnnxSession is not None:
return OnnxSession
OnnxSession = OnnxInferenceSession(
{
"enc": "onnx/BertVits2.2PT/BertVits2.2PT_enc_p.onnx",
"emb_g": "onnx/BertVits2.2PT/BertVits2.2PT_emb.onnx",
"dp": "onnx/BertVits2.2PT/BertVits2.2PT_dp.onnx",
"sdp": "onnx/BertVits2.2PT/BertVits2.2PT_sdp.onnx",
"flow": "onnx/BertVits2.2PT/BertVits2.2PT_flow.onnx",
"dec": "onnx/BertVits2.2PT/BertVits2.2PT_dec.onnx",
},
Providers=["CPUExecutionProvider"],
)
return OnnxSession
def download_model_files(repo_id, files, local_path):
for file in files:
if not Path(local_path).joinpath(file).exists():
hf_hub_download(
repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
)
def download_models():
for data in models:
download_model_files(data["repo_id"], data["files"], data["local_path"])
def intersperse(lst, item):
result = [item] * (len(lst) * 2 + 1)
result[1::2] = lst
return result
def get_text(text, language_str, style_text=None, style_weight=0.7):
style_text = None if style_text == "" else style_text
# 在此处实现当前版本的get_text
norm_text, phone, tone, word2ph = clean_text(text, language_str)
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
# add blank
phone = intersperse(phone, 0)
tone = intersperse(tone, 0)
language = intersperse(language, 0)
for i in range(len(word2ph)):
word2ph[i] = word2ph[i] * 2
word2ph[0] += 1
bert_ori = get_bert(
norm_text, word2ph, language_str, "cpu", style_text, style_weight
)
del word2ph
assert bert_ori.shape[-1] == len(phone), phone
if language_str == "EN":
en_bert = bert_ori
yue_bert = np.random.randn(1024, len(phone))
elif language_str == "YUE":
en_bert = np.random.randn(1024, len(phone))
yue_bert = bert_ori
else:
raise ValueError("language_str should be EN or YUE")
assert yue_bert.shape[-1] == len(
phone
), f"Bert seq len {yue_bert.shape[-1]} != {len(phone)}"
phone = np.asarray(phone)
tone = np.asarray(tone)
language = np.asarray(language)
en_bert = np.asarray(en_bert.T)
yue_bert = np.asarray(yue_bert.T)
return en_bert, yue_bert, phone, tone, language
# Text-to-speech function
async def text_to_speech(text, sid=0, language="YUE"):
Session = get_onnx_session()
if not text.strip():
return None, gr.Warning("Please enter text to convert.")
en_bert, yue_bert, x, tone, language = get_text(text, language)
sid = np.array([sid])
audio = Session(x, tone, language, en_bert, yue_bert, sid, sdp_ratio=0.4)
return audio[0][0]
# Create Gradio application
import gradio as gr
# Gradio interface function
def tts_interface(text):
audio = asyncio.run(text_to_speech(text, 0, "YUE"))
return 44100, audio
async def create_demo():
description = """張悦楷粵語語音生成器,基於 Bert-VITS2 模型
本模型由 https://huggingface.co/datasets/laubonghaudoi/zoengjyutgaai_saamgwokjinji 張悦楷語音數據集訓練而得,所以係楷叔把聲。
注意:模型本身支持粵文同英文,但呢個 space 未實現中英夾雜生成。
"""
demo = gr.Interface(
fn=tts_interface,
inputs=[
gr.Textbox(label="Input Text", lines=5),
],
outputs=[
gr.Audio(label="Generated Audio"),
],
examples=[
["漆黑之中我心眺望,不出一聲但兩眼發光\n寂寞極淒厲,晚風充滿汗,只因她幽怨目光"],
["本身我就係一個言出必達嘅人"],
["正話坐落喺龍椅上便,突然間,一朕狂風呼——哈噉吹起上嚟。"],
["1950年春,廣東開始試行土改,到1951年夏天已在1500萬人口的地區鋪開。廣東省土改委員會主任由華南分局第三書記方方擔任。以林彪為第一書記,鄧子恢為第二書記的中共中央中南局,以及李雪峰為主任的中南局土改委員會, 在對廣東土改的評價上,一直同華南分局之間存在嚴重分歧。李雪峰多次在中南局機關報《長江日報》批評廣東土改群眾發動不夠,太右,是「和平土改」。毛澤東和中南局認為,需要改變廣東土改領導軟弱和進展緩慢的局面。1951年4月,中南局將中共南陽地委書記趙紫陽調到廣東,任華南分局秘書長,5月6日又增選為廣東省土改委員會副主任。1951年12月25日,又將廣西省委代理書記陶鑄調任華南分局第四書記,並接替方方主管廣東土改運動。此後,中南局正式提出了「廣東黨組織嚴重不純,要反對地方主義」的口號。廣東先後36次大規模進行「土改整隊」、「整肅」。到1952年5月,全省共處理廣東「地方主義」幹部6515人。期間,提出了「依靠大軍,依靠南下幹部,由大軍、南下幹部掛帥的方針」。"]
],
title="Cantonese TTS Text-to-Speech 粵語語音合成",
description=description,
analytics_enabled=False,
allow_flagging=False,
)
return demo
# Run the application
if __name__ == "__main__":
download_models()
demo = asyncio.run(create_demo())
demo.launch() |