# reference: https://huggingface.co/spaces/Naozumi0512/Bert-VITS2-Cantonese-Yue/blob/main/text/chinese.py import sys import re import cn2an from pyjyutping import jyutping from text.symbols import punctuation from text.zh_normalization.text_normlization import TextNormalizer normalizer = lambda x: cn2an.transform(x, "an2cn") INITIALS = [ "aa", "aai", "aak", "aap", "aat", "aau", "ai", "au", "ap", "at", "ak", "a", "p", "b", "e", "ts", "t", "dz", "d", "kw", "k", "gw", "g", "f", "h", "l", "m", "ng", "n", "s", "y", "w", "c", "z", "j", "ong", "on", "ou", "oi", "ok", "o", "uk", "ung", ] INITIALS += ["sp", "spl", "spn", "sil"] rep_map = { ":": ",", ";": ",", ",": ",", "。": ".", "!": "!", "?": "?", "\n": ".", "·": ",", "、": ",", "...": "…", "$": ".", "“": "'", "”": "'", '"': "'", "‘": "'", "’": "'", "(": "'", ")": "'", "(": "'", ")": "'", "《": "'", "》": "'", "【": "'", "】": "'", "[": "'", "]": "'", "—": "-", "~": "-", "~": "-", "「": "'", "」": "'", } def replace_punctuation(text): # text = text.replace("嗯", "恩").replace("呣", "母") pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys())) replaced_text = pattern.sub(lambda x: rep_map[x.group()], text) replaced_text = re.sub( r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text ) return replaced_text def text_normalize(text): tx = TextNormalizer() sentences = tx.normalize(text) dest_text = "" for sentence in sentences: dest_text += replace_punctuation(sentence) return dest_text punctuation_set=set(punctuation) def jyuping_to_initials_finals_tones(jyuping_syllables): initials_finals = [] tones = [] word2ph = [] for syllable in jyuping_syllables: if syllable in punctuation: initials_finals.append(syllable) tones.append(0) word2ph.append(1) # Add 1 for punctuation elif syllable == "_": initials_finals.append(syllable) tones.append(0) word2ph.append(1) # Add 1 for underscore else: try: tone = int(syllable[-1]) syllable_without_tone = syllable[:-1] except ValueError: tone = 0 syllable_without_tone = syllable for initial in INITIALS: if syllable_without_tone.startswith(initial): if syllable_without_tone.startswith("nga"): initials_finals.extend( [ syllable_without_tone[:2], syllable_without_tone[2:] or syllable_without_tone[-1], ] ) # tones.extend([tone, tone]) tones.extend([-1, tone]) word2ph.append(2) else: final = syllable_without_tone[len(initial) :] or initial[-1] initials_finals.extend([initial, final]) # tones.extend([tone, tone]) tones.extend([-1, tone]) word2ph.append(2) break assert len(initials_finals) == len(tones) ###魔改为辅音+带音调的元音 phones=[] for a,b in zip(initials_finals,tones): if(b not in [-1,0]):###防止粤语和普通话重合开头加Y,如果是标点,不加。 todo="%s%s"%(a,b) else:todo=a if(todo not in punctuation_set):todo="Y%s"%todo phones.append(todo) # return initials_finals, tones, word2ph return phones, word2ph def get_jyutping(text): jp = jyutping.convert(text) # print(1111111,jp) for symbol in punctuation: jp = jp.replace(symbol, " " + symbol + " ") jp_array = jp.split() return jp_array def get_bert_feature(text, word2ph): from text import chinese_bert return chinese_bert.get_bert_feature(text, word2ph) def g2p(text): # word2ph = [] jyuping = get_jyutping(text) # print(jyuping) # phones, tones, word2ph = jyuping_to_initials_finals_tones(jyuping) phones, word2ph = jyuping_to_initials_finals_tones(jyuping) # phones = ["_"] + phones + ["_"] # tones = [0] + tones + [0] # word2ph = [1] + word2ph + [1] return phones, word2ph if __name__ == "__main__": # text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏" text = "佢個鋤頭太短啦。" text = text_normalize(text) # phones, tones, word2ph = g2p(text) phones, word2ph = g2p(text) # print(phones, tones, word2ph) print(phones, word2ph)