import re

from .text_normlization import *
|
|
# Map Chinese / full-width punctuation (plus a few stray symbols) onto the
# small punctuation set used by the rest of the pipeline.
rep_map = {
    "：": ",",
    "；": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
    "~": "…",
    "～": "…",
}
|
|
|
|
|
def replace_punctuation(text):
    # Replace interjection characters with near-homophones that the pinyin
    # front end handles reliably.
    text = text.replace("嗯", "恩").replace("呣", "母")

    # Substitute every punctuation mark listed in rep_map in a single pass.
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    # Strip everything that is neither a CJK character (U+4E00..U+9FA5)
    # nor one of the allowed punctuation marks.
    punctuation = ["!", "?", "…", ",", "."]
    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
    )

    return replaced_text
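
# Illustrative example (not from the original source): with the mapping above,
# replace_punctuation("你好，世界！") is expected to return "你好,世界!", i.e.
# full-width marks become ASCII and everything outside the CJK range and the
# allowed punctuation set is dropped.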
|
|
|
|
|
def text_normalize(text):
    # Expand numbers, dates, units, etc. into their spoken Chinese form,
    # then clean up the punctuation of each resulting sentence.
    tx = TextNormalizer()
    sentences = tx.normalize(text)
    dest_text = ""
    for sentence in sentences:
        dest_text += replace_punctuation(sentence)
    return dest_text
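

# Usage sketch (illustrative, not part of the original module). Because of the
# relative import above, this file must be run in its package context, e.g.
# `python -m <your_package>.<this_module>` (placeholder names), not as a
# standalone script.
if __name__ == "__main__":
    sample = "今天是2024年5月1日，气温25℃，真不错！"
    print(text_normalize(sample))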
|
|