"""Punctuation normalisation and character filtering for ZH / JP / EN text."""

import itertools
import re

# Inclusive Unicode code-point ranges kept for each language tag:
#   0x4E00-0x9FFF  CJK Unified Ideographs
#   0x3040-0x309F  Hiragana
#   0x30A0-0x30FF  Katakana
#   0x31F0-0x31FF  Katakana Phonetic Extensions
#   0x0000-0x007F  ASCII
LANGUAGE_UNICODE_RANGE_MAP = {
    "ZH": [(0x4E00, 0x9FFF)],
    "JP": [
        (0x4E00, 0x9FFF),
        (0x3040, 0x309F),
        (0x30A0, 0x30FF),
        (0x31F0, 0x31FF),
    ],
    "EN": [(0x0000, 0x007F)],
}

# Maps each punctuation variant to its canonical replacement.
SYMBOLS_MAPPING = {
    # Full-width forms.  Written as escapes on purpose: when these are typed
    # literally, width-folding tools collapse them onto their ASCII twins,
    # which leaves duplicate / identity keys behind and makes the full-width
    # characters fall through to the "unknown symbol" filter and vanish.
    "\uff1a": ",",  # FULLWIDTH COLON
    "\uff1b": ",",  # FULLWIDTH SEMICOLON
    "\uff0c": ",",  # FULLWIDTH COMMA
    "\uff01": "!",  # FULLWIDTH EXCLAMATION MARK
    "\uff1f": "?",  # FULLWIDTH QUESTION MARK
    "\uff08": "'",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": "'",  # FULLWIDTH RIGHT PARENTHESIS
    "\uff5e": "-",  # FULLWIDTH TILDE
    # CJK and typographic punctuation.
    "。": ".",
    "\n": ".",
    "·": ",",
    "、": ",",
    # NOTE(review): "…" (U+2026) is outside every range in
    # LANGUAGE_UNICODE_RANGE_MAP, so REMOVE_UNKNOWN_SYMBOL_REGEX strips it
    # again and "..." ends up deleted.  Left unchanged because keeping "…"
    # would widen the output alphabet seen by callers; confirm the intent.
    "...": "…",
    "“": "'",
    "”": "'",
    "‘": "'",
    "’": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "—": "-",
    "・": "-",
    "「": "'",
    "」": "'",
    # ASCII punctuation folded onto the canonical set.
    ":": ",",
    ";": ",",
    "(": "'",
    ")": "'",
    "[": "'",
    "]": "'",
    "~": "-",
    # Identity entries: no effect on the output, retained only so that every
    # key that existed in this table before is still present.
    ",": ",",
    "!": "!",
    "?": "?",
}

# Alternation of all keys.  Longest keys go first because `re` picks the first
# alternative that matches, so a multi-character key such as "..." must not be
# shadowed by a shorter key sharing its prefix.
REPLACE_SYMBOL_REGEX = re.compile(
    "|".join(
        re.escape(symbol)
        for symbol in sorted(SYMBOLS_MAPPING, key=len, reverse=True)
    )
)

# Flat list of every (start, end) pair above; duplicates are harmless inside a
# regex character class.
ALL_KNOWN_UTF8_RANGE = list(
    itertools.chain.from_iterable(LANGUAGE_UNICODE_RANGE_MAP.values())
)

# Negated character class: matches any single character that lies outside all
# of the known ranges.
REMOVE_UNKNOWN_SYMBOL_REGEX = re.compile(
    "[^"
    + "".join(
        f"{re.escape(chr(start))}-{re.escape(chr(end))}"
        for start, end in ALL_KNOWN_UTF8_RANGE
    )
    + "]"
)


def clean_text(text: str) -> str:
    """Normalise punctuation in *text* and drop unsupported characters.

    Steps, in order:

    1. Strip leading and trailing whitespace.
    2. Replace every key of ``SYMBOLS_MAPPING`` found in the text with its
       mapped value (full-width, CJK and some ASCII punctuation are folded
       onto ``, . ! ? ' -``; interior newlines become ``.``).
    3. Delete every character that is not covered by a range in
       ``LANGUAGE_UNICODE_RANGE_MAP``.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; an empty string yields an empty string.
    """
    text = text.strip()

    # Each match is, by construction of the regex, a key of SYMBOLS_MAPPING.
    text = REPLACE_SYMBOL_REGEX.sub(
        lambda match: SYMBOLS_MAPPING[match.group()], text
    )

    # Must run after the replacement: most mapped punctuation (e.g. "。")
    # lies outside the kept ranges and would otherwise be deleted outright.
    return REMOVE_UNKNOWN_SYMBOL_REGEX.sub("", text)