# tts / text/cantonese.py
# Hugging Face file-viewer residue kept for provenance:
#   indiejoseph -- "Update text/cantonese.py" -- commit cf05cbe (verified) -- 7.44 kB
from text.symbols import punctuation
import re
import unicodedata
import cn2an
import pycantonese
import jieba
import csv
# Register the project's word list with jieba before any segmentation runs.
jieba.load_userdict("./text/yue_dict.txt")

# word -> list of Jyutping readings, kept in the order the CSV lists them.
# Each non-blank CSV line must be exactly "word,jyutping".
jyutping_dict = {}
with open("./text/jyutping.csv", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            word, jyutping = line.split(",")
            jyutping_dict.setdefault(word, []).append(jyutping)
def normalizer(x):
    """Return *x* passed through ``cn2an.transform`` in ``"an2cn"`` mode.

    In cn2an, "an2cn" means Arabic numerals are rewritten as Chinese numerals.
    """
    return cn2an.transform(x, "an2cn")
def word2jyutping(word):
    """Convert the CJK characters of *word* to a space-separated Jyutping string.

    Characters whose Unicode name does not start with "CJK UNIFIED IDEOGRAPH"
    are skipped. Every remaining character is looked up on its own through
    ``pycantonese.characters_to_jyutping`` and the reading of the first
    returned pair is used.

    Args:
        word: A string, normally one segmented word.

    Returns:
        The readings joined by single spaces ("" if no character qualified).

    Raises:
        ValueError: If any lookup yields ``None`` (presumably pycantonese's
            marker for a character it has no reading for).
    """
    jyutpings = [
        pycantonese.characters_to_jyutping(w)[0][1]
        for w in word
        if unicodedata.name(w, "").startswith("CJK UNIFIED IDEOGRAPH")
    ]
    # Validate before the regex pass below: re.search() on None raises
    # TypeError, which used to mask this intended ValueError.
    if None in jyutpings:
        raise ValueError(f"Failed to convert {word} to jyutping: {jyutpings}")
    # la1 -> laa1, ga1 -> gaa1
    jyutpings = [
        j.replace("a", "aa") if re.search(r"^(la|ga)[1-6]$", j) else j
        for j in jyutpings
    ]
    return " ".join(jyutpings)
# Jyutping initials accepted by parse_jyutping(); "" stands for "no initial".
INITIALS = [
    "",
    "b", "c", "d", "f", "g", "gw", "h", "j",
    "k", "kw", "l", "m", "n", "ng", "p", "s", "t", "w", "z",
]

# Jyutping finals accepted by parse_jyutping(), grouped by nucleus. The last
# row holds the syllabic nasals "m" and "ng".
FINALS = [
    "aa", "aai", "aau", "aam", "aan", "aang", "aap", "aat", "aak",
    "ai", "au", "am", "an", "ang", "ap", "at", "ak",
    "e", "ei", "eu", "em", "eng", "ep", "ek",
    "i", "iu", "im", "in", "ing", "ip", "it", "ik",
    "o", "oi", "ou", "on", "ong", "ot", "ok",
    "oe", "oeng", "oek",
    "eoi", "eon", "eot",
    "u", "ui", "un", "ung", "ut", "uk",
    "yu", "yun", "yut",
    "m", "ng",
]
# Punctuation normalization table used by replace_punctuation(): every key
# found in the text is replaced by its value.
#
# The table previously listed "(", ")" and "~" twice and mapped ",", "!" and
# "?" only to themselves, so fullwidth punctuation was never mapped. The
# fullwidth forms (U+FF01..U+FF5E) are now written as explicit \u escapes so
# they cannot be folded back to ASCII by an editor or text pipeline.
# "..." is the only multi-character key; no other key starts with ".", so the
# regex alternation built from this dict cannot split it.
rep_map = {
    ":": ",",
    "\uff1a": ",",  # fullwidth colon
    "︰": ",",
    ";": ",",
    "\uff1b": ",",  # fullwidth semicolon
    ",": ",",
    "\uff0c": ",",  # fullwidth comma
    "﹐": ",",
    "。": ".",
    "!": "!",
    "\uff01": "!",  # fullwidth exclamation mark
    "?": "?",
    "\uff1f": "?",  # fullwidth question mark
    "﹖": "?",
    "﹗": "!",
    "\n": ".",
    "·": ",",
    "、": ",",
    "丶": ",",
    "...": "…",
    "⋯": "…",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "\uff08": "'",  # fullwidth left parenthesis
    "\uff09": "'",  # fullwidth right parenthesis
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "~": "-",
    "\uff5e": "-",  # fullwidth tilde
    "「": "'",
    "」": "'",
    "_": "-",
}
# Character-level substitutions applied by replace_chars() with str.replace,
# in the insertion order below.
# NOTE(review): several pairs render as identical glyphs on both sides.
# Presumably the keys are CJK compatibility code points being folded to their
# unified forms -- verify the actual code points before editing this table.
replacement_chars = {
    "\n": " ",
    # Look-alikes of the ideograph "one".
    'ㄧ': '一',
    '—': '一',
    # Pairs that look identical (see the review note above).
    '更': '更',
    '不': '不',
    '料': '料',
    '聯': '聯',
    '行': '行',
    '利': '利',
    # Variant glyphs replaced by another character.
    '謢': '護',
    '岀': '出',
    '鎭': '鎮',
    '戯': '戲',
    '旣': '既',
    # More identical-looking pairs.
    '立': '立',
    '來': '來',
    '年': '年',
    '㗇': '蝦',
}
def replace_punctuation(text):
    """Map punctuation through ``rep_map`` and drop unsupported characters.

    Every occurrence of a ``rep_map`` key is replaced by its value. After
    that, only characters listed in ``punctuation`` or whose Unicode name
    starts with "CJK UNIFIED IDEOGRAPH" are kept; the rest (Latin letters,
    digits, spaces, ...) is removed.
    """
    alternation = "|".join(re.escape(key) for key in rep_map)
    mapped = re.sub(alternation, lambda m: rep_map[m.group()], text)
    kept = [
        ch
        for ch in mapped
        if ch in punctuation
        or unicodedata.name(ch, "").startswith("CJK UNIFIED IDEOGRAPH")
    ]
    return "".join(kept)
def replace_chars(text):
    """Apply every ``replacement_chars`` substitution to *text*, in table order."""
    result = text
    for source in replacement_chars:
        result = result.replace(source, replacement_chars[source])
    return result
def word_segmentation(text):
    """Segment *text* with ``jieba.cut`` and return its result unchanged.

    The result is whatever iterable ``jieba.cut`` produces, not a
    materialized list.
    """
    return jieba.cut(text)
def text_normalize(text):
    """Normalize raw input text for the Cantonese front-end.

    Steps, in order: strip surrounding whitespace, run ``normalizer``
    (numeral conversion), run ``replace_punctuation`` (punctuation mapping
    plus character filtering), then run ``replace_chars`` (variant-character
    substitutions).
    """
    stripped = text.strip()
    with_numerals = normalizer(stripped)
    return replace_chars(replace_punctuation(with_numerals))
def jyuping_to_initials_finals_tones(jyuping_syllables):
    """Expand Jyutping syllables and punctuation into phone-level sequences.

    Args:
        jyuping_syllables: Iterable whose items are either a member of
            ``punctuation`` or a Jyutping syllable accepted by
            ``parse_jyutping``.

    Returns:
        A tuple ``(initials_finals, tones, word2ph)``:
        * a punctuation mark contributes one phone (itself), tone 0 and a
          ``word2ph`` entry of 1;
        * a syllable contributes two phones (initial, final), its tone twice
          and a ``word2ph`` entry of 2.
    """
    phones, tone_ids, phones_per_item = [], [], []
    for item in jyuping_syllables:
        if item in punctuation:
            phones.append(item)
            tone_ids.append(0)
            phones_per_item.append(1)  # punctuation is a single phone
            continue
        init, final, tone = parse_jyutping(item)
        phones += [init, final]
        tone_ids += [tone, tone]
        phones_per_item.append(2)
    assert len(phones) == len(tone_ids)
    return phones, tone_ids, phones_per_item
# Whole-text overrides that get_jyutping() checks first: text -> Jyutping
# syllables separated by single spaces. It stays empty because the loader
# below is commented out.
wordshk_juytping = {}
# Disabled loader (reads a CSV from a hard-coded notebook path):
# with open("/notebooks/bert-vits2/Bert-VITS2-Cantonese/wordshk_juytping.csv", "r") as csv_file:
#     csv_reader = csv.reader(csv_file, delimiter=',')
#     for row in csv_reader:
#         wordshk_juytping[text_normalize(row[0])] = row[1]
def get_jyutping(text):
    """Convert *text* into a flat list of Jyutping syllables and punctuation.

    Lookup order:
      1. If the whole *text* is a key of ``wordshk_juytping``, its
         space-separated value is returned directly.
      2. Otherwise the text is segmented with ``word_segmentation``. Tokens
         made only of ``punctuation`` characters are emitted one mark at a
         time. Every other token takes the first reading from
         ``jyutping_dict`` when present and falls back to ``word2jyutping``.

    Raises:
        ValueError: If a token's reading is not a sequence of
            ``letters + tone digits`` groups separated by single spaces.
    """
    if text in wordshk_juytping:
        return wordshk_juytping[text].split(" ")

    # Escaped once; used inside both character classes below.
    punct_class = re.escape("".join(punctuation))
    only_punct = re.compile(r"^[{}]+$".format(punct_class))
    punct_splitter = r"([{}])".format(punct_class)

    result = []
    for word in word_segmentation(text):
        if only_punct.match(word):
            # The capture group keeps the marks; empty pieces are skipped.
            result.extend(
                piece for piece in re.split(punct_splitter, word)
                if len(piece) > 0
            )
            continue

        if word in jyutping_dict:
            jyutpings = jyutping_dict[word][0]
        else:
            jyutpings = word2jyutping(word)

        # match multiple jyutping eg: liu4 ge3, or single jyutping eg: liu4
        if not re.search(r"^([a-z]+[1-6]+[ ]?)+$", jyutpings):
            raise ValueError(
                f"Failed to convert {word} to jyutping: {jyutpings}")
        result.extend(jyutpings.split(" "))
    return result
def get_bert_feature(text, word2ph):
    """Delegate to ``text.cantonese_bert.get_bert_feature(text, word2ph)``.

    The import happens inside the function, so ``text.cantonese_bert`` is
    only loaded on the first call (presumably to keep the BERT stack out of
    plain imports of this module).
    """
    from text.cantonese_bert import get_bert_feature as _bert_feature

    return _bert_feature(text, word2ph)
def parse_jyutping(jyutping):
    """Split one Jyutping syllable into ``[initial, final, tone]``.

    Examples: ``"ngo5" -> ["ng", "o", 5]``, ``"gwok3" -> ["gw", "ok", 3]``,
    ``"m4" -> ["", "m", 4]``.

    Args:
        jyutping: A single syllable such as ``"liu4"``; its last character
            must be the tone digit.

    Returns:
        A list ``[init, final, tone]`` where ``init`` is a member of
        ``INITIALS`` ("" when there is no initial), ``final`` is a member of
        ``FINALS`` and ``tone`` is an ``int``.

    Raises:
        ValueError: If the string is shorter than two characters, does not
            end with a digit, or the remaining final is not in ``FINALS``.
    """
    orig_jyutping = jyutping
    if len(jyutping) < 2:
        raise ValueError(f"Jyutping string too short: {jyutping}")

    # "ng" + tone and "m" + tone are syllabic nasals: the consonant is the
    # final itself, so nothing may be stripped as an initial.
    is_syllabic_nasal = (
        (jyutping.startswith("ng") and len(jyutping) == 3)
        or (jyutping[0] == "m" and len(jyutping) == 2)
    )

    init = ""
    if not is_syllabic_nasal:
        # Two-letter initials must be tried before the one-letter ones.
        if jyutping[:2] in ("ng", "gw", "kw"):
            init = jyutping[:2]
            jyutping = jyutping[2:]
        elif jyutping[0] in "bpmfdtnlgkhwzcsj":
            init = jyutping[0]
            jyutping = jyutping[1:]

    try:
        tone = int(jyutping[-1])
    except (ValueError, IndexError):
        # ValueError: last character is not a digit.
        # IndexError: nothing is left after the initial (e.g. "gw").
        # Catching only these keeps KeyboardInterrupt/SystemExit intact,
        # unlike the former bare ``except``.
        raise ValueError(
            f"Jyutping string does not end with a tone number, in {orig_jyutping}"
        ) from None
    final = jyutping[:-1]

    # Internal invariant: every branch above assigns a member of INITIALS.
    assert init in INITIALS, f"Invalid initial: {init}, in {orig_jyutping}"
    if final not in FINALS:
        raise ValueError(f"Invalid final: {final}, in {orig_jyutping}")
    return [init, final, tone]
def g2p(text):
    """Convert normalized *text* to padded phone, tone and word2ph sequences.

    The text goes through ``get_jyutping`` and
    ``jyuping_to_initials_finals_tones``. Each resulting list is then padded
    at both ends: phones with ``"_"``, tones with ``0`` and word2ph with
    ``1``.

    Returns:
        A tuple ``(phones, tones, word2ph)`` of lists.
    """
    syllables = get_jyutping(text)
    core_phones, core_tones, core_word2ph = jyuping_to_initials_finals_tones(
        syllables)
    return (
        ["_"] + core_phones + ["_"],
        [0] + core_tones + [0],
        [1] + core_word2ph + [1],
    )
if __name__ == "__main__":
    # Manual smoke test: normalize one sentence, run G2P, then compute BERT
    # features and print each intermediate result.
    from text.cantonese_bert import get_bert_feature

    # Other sample sentences, kept for manual experiments:
    # text = "Apple BB 你點解會咁柒㗎?我真係唔該晒你呀!123"
    # text = "我個 app 嘅介紹文想由你寫,因為我唔知從一般用家角度要細緻到乜程度"
    # text = "佢哋最叻咪就係去㗇人傷害人,得個殼咋!"
    sample = "佢邊係想辭工吖,跳下草裙舞想加人工之嘛。"

    normalized = text_normalize(sample)
    print('normalized text', normalized)

    phones, tones, word2ph = g2p(normalized)
    print(phones, tones, word2ph)

    features = get_bert_feature(normalized, word2ph)
    print(features.shape)