|
|
|
|
|
import sys
|
|
import re
|
|
import cn2an
|
|
|
|
from pyjyutping import jyutping
|
|
from text.symbols import punctuation
|
|
from text.zh_normalization.text_normlization import TextNormalizer
|
|
|
|
def normalizer(x):
    """Return ``cn2an.transform(x, "an2cn")`` for the given text *x*."""
    return cn2an.transform(x, "an2cn")
|
|
|
|
# Prefixes used to split a tone-less Jyutping syllable into two units.
# Order is significant: the lookup loop in jyuping_to_initials_finals_tones
# stops at the FIRST entry the syllable starts with, so longer prefixes are
# listed before the shorter prefixes they contain ("aai" before "a",
# "ng" before "n", "kw" before "k", ...).
INITIALS = [
    # "a"-led prefixes
    "aa", "aai", "aak", "aap", "aat", "aau",
    "ai", "au", "ap", "at", "ak", "a",
    "p", "b", "e",
    "ts", "t", "dz", "d",
    "kw", "k", "gw", "g",
    "f", "h", "l", "m",
    "ng", "n",
    "s", "y", "w", "c", "z", "j",
    # "o"- and "u"-led prefixes
    "ong", "on", "ou", "oi", "ok", "o",
    "uk", "ung",
    # Extra markers that were appended to the list separately.
    # NOTE(review): these come after "s", so the first-match lookup never
    # selects them for a token starting with "s" -- confirm that is intended.
    "sp", "spl", "spn", "sil",
]
|
|
|
|
|
|
# Maps punctuation found in raw text to the small set of marks kept by
# replace_punctuation.
#
# The previous literal listed "(", ")" and "~" twice and mapped ",", "!" and
# "?" to themselves, i.e. the full-width (CJK) forms had been collapsed into
# their ASCII look-alikes.  The full-width keys are spelled as \uXXXX escapes
# here so they cannot be normalised away again.  Every pre-existing key is
# kept so current lookups continue to work.
rep_map = {
    # --- full-width clause / sentence punctuation ---
    "\uff1a": ",",  # FULLWIDTH COLON
    "\uff1b": ",",  # FULLWIDTH SEMICOLON
    "\uff0c": ",",  # FULLWIDTH COMMA
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    "。": ".",
    "·": ",",
    "、": ",",
    # --- ASCII clause / sentence punctuation ---
    ":": ",",
    ";": ",",
    ",": ",",  # identity entry, kept for existing lookups
    "!": "!",  # identity entry, kept for existing lookups
    "?": "?",  # identity entry, kept for existing lookups
    "\n": ".",
    "...": "…",
    "$": ".",
    # --- quotation marks ---
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "「": "'",
    "」": "'",
    # --- brackets ---
    "\uff08": "'",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": "'",  # FULLWIDTH RIGHT PARENTHESIS
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    # --- dashes / tildes ---
    "—": "-",
    "\uff5e": "-",  # FULLWIDTH TILDE
    "~": "-",
}
|
|
|
|
|
|
def replace_punctuation(text):
    """Map punctuation through ``rep_map`` and strip every other symbol.

    First, each occurrence of a ``rep_map`` key in *text* is replaced by its
    mapped value.  Then every run of characters that is neither in the range
    U+4E00-U+9FA5 nor one of the symbols in ``punctuation`` is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Try longer keys first so a multi-character key (e.g. "...") can never
    # be pre-empted by a shorter key that is a prefix of it.
    keys_longest_first = sorted(rep_map, key=len, reverse=True)
    mapping_pattern = re.compile("|".join(re.escape(key) for key in keys_longest_first))
    mapped = mapping_pattern.sub(lambda match: rep_map[match.group()], text)

    # Escape each symbol before splicing it into the character class: a raw
    # "-", "]", "^" or "\" would otherwise form a range or end the class.
    kept_symbols = "".join(re.escape(symbol) for symbol in punctuation)
    return re.sub(r"[^\u4e00-\u9fa5" + kept_symbols + r"]+", "", mapped)
|
|
|
|
|
|
def text_normalize(text):
    """Normalize *text* and return it as one punctuation-cleaned string.

    A fresh ``TextNormalizer`` splits and normalizes the input into
    sentences; each sentence is passed through ``replace_punctuation`` and
    the results are concatenated with no separator.
    """
    normalized_sentences = TextNormalizer().normalize(text)
    return "".join(replace_punctuation(piece) for piece in normalized_sentences)
|
|
|
|
|
|
# Set view of the imported ``punctuation`` symbols, used for the membership
# test that decides whether a phone gets the "Y" prefix.
punctuation_set = set(punctuation)
|
|
def jyuping_to_initials_finals_tones(jyuping_syllables):
    """Turn Jyutping tokens into prefixed phone symbols plus per-token counts.

    Each token is handled as follows:

    * A token found in ``punctuation``, or the token ``"_"``, yields one
      unit with tone 0 and contributes ``1`` to ``word2ph``.
    * Any other token has its last character read as the tone digit (tone 0
      and the token left whole if it is not a digit).  The remainder is split
      at the first entry of ``INITIALS`` it starts with, yielding two units
      and contributing ``2`` to ``word2ph``.  When nothing follows the
      prefix, the prefix's last character is used as the second unit.
    * A token that starts with no entry of ``INITIALS`` yields nothing.

    The first unit of a split syllable never carries a tone; the second unit
    gets the tone digit appended unless the tone is 0.  Every resulting
    symbol that is not in ``punctuation_set`` is prefixed with ``"Y"``.

    Args:
        jyuping_syllables: Iterable of token strings.

    Returns:
        ``(phones, word2ph)`` -- the list of phone strings and the list of
        unit counts (1 or 2) for the tokens that produced output.
    """
    units = []       # phone units in emission order
    unit_tones = []  # parallel to ``units``: -1 = untoned first half, 0 = no tone
    word2ph = []

    for token in jyuping_syllables:
        # Punctuation and the "_" placeholder pass through as a single unit.
        if token in punctuation or token == "_":
            units.append(token)
            unit_tones.append(0)
            word2ph.append(1)
            continue

        # Peel off a trailing tone digit, if there is one.
        try:
            tone = int(token[-1])
        except ValueError:
            tone = 0
            stem = token
        else:
            stem = token[:-1]

        for prefix in INITIALS:
            if not stem.startswith(prefix):
                continue

            if stem.startswith("nga"):
                # "nga..." is always cut after the two-letter "ng".
                head = stem[:2]
                tail = stem[2:] or stem[-1]
            else:
                head = prefix
                tail = stem[len(prefix):] or prefix[-1]

            units.append(head)
            units.append(tail)
            unit_tones.append(-1)
            unit_tones.append(tone)
            word2ph.append(2)
            break  # only the first matching prefix is used

    assert len(units) == len(unit_tones)

    phones = []
    for unit, mark in zip(units, unit_tones):
        symbol = unit if mark in (-1, 0) else unit + str(mark)
        if symbol not in punctuation_set:
            symbol = "Y" + symbol
        phones.append(symbol)

    return phones, word2ph
|
|
|
|
|
|
def get_jyutping(text):
    """Convert *text* with ``jyutping.convert`` and return it as a token list.

    Every symbol in ``punctuation`` is padded with a space on both sides in
    the converted string, so that splitting on whitespace yields each
    punctuation mark as its own token.
    """
    converted = jyutping.convert(text)

    for mark in punctuation:
        converted = converted.replace(mark, " " + mark + " ")

    return converted.split()
|
|
|
|
|
|
def get_bert_feature(text, word2ph):
    """Delegate to ``text.chinese_bert.get_bert_feature`` and return its result.

    Both arguments are forwarded unchanged.
    """
    # Imported at call time rather than at module import.
    # NOTE(review): presumably to avoid loading the BERT backend unless it is
    # needed -- confirm.
    from text import chinese_bert as bert_backend

    features = bert_backend.get_bert_feature(text, word2ph)
    return features
|
|
|
|
|
|
def g2p(text):
    """Convert *text* to ``(phones, word2ph)``.

    The text is tokenized by ``get_jyutping`` and the tokens are passed to
    ``jyuping_to_initials_finals_tones``, whose result is returned as is.
    """
    return jyuping_to_initials_finals_tones(get_jyutping(text))
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke run: normalize a sample sentence, convert it, print the result.
    text = text_normalize("佢個鋤頭太短啦。")
    phones, word2ph = g2p(text)
    print(phones, word2ph)