# (Web-page header residue captured with this file, not part of the module: "Spaces: Running")
import importlib
import os

# if os.environ.get("version","v1")=="v1":
#     from text import chinese
#     from text.symbols import symbols
# else:
#     from text import chinese2 as chinese
#     from text.symbols2 import symbols
from text import cleaned_text_to_sequence
from text import symbols as symbols_v1
from text import symbols2 as symbols_v2
# Special marker characters, as (marker, language, target_symbol) triples.
# When clean_text sees `marker` in text of the matching `language`, it hands
# the text to clean_special, which emits `target_symbol` in place of each
# "," phone.
special = [
    # ("%", "zh", "SP"),
    ("¥", "zh", "SP2"),
    ("^", "zh", "SP3"),
    # ('@', 'zh', "SP4")  # dropped: no more "guichu" (meme-remix) marker; kept consistent with the second version
]
def clean_text(text, language, version=None):
    """Normalize ``text`` and convert it to a phone sequence.

    Args:
        text: Raw input text.
        language: Language code. Supported codes are "zh", "ja" and "en" when
            ``version`` is "v1", plus "ko" and "yue" otherwise. An
            unsupported code falls back to "en" with the text replaced by a
            single space.
        version: Symbol-set version. When ``None`` it is read from the
            ``version`` environment variable, defaulting to "v2".

    Returns:
        A ``(phones, word2ph, norm_text)`` tuple. ``word2ph`` comes from the
        language module only for "zh" and "yue"; it is ``None`` for every
        other language. Phones that are not in the selected symbol table are
        replaced by "UNK". If the text contains one of the ``special``
        markers for its language, the result of ``clean_special`` is
        returned instead.

    Raises:
        AssertionError: For "zh"/"yue", if ``len(phones) != sum(word2ph)`` or
            ``len(norm_text) != len(word2ph)``.
    """
    if version is None:
        version = os.environ.get('version', 'v2')
    if version == "v1":
        symbols = symbols_v1.symbols
        language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
    else:
        symbols = symbols_v2.symbols
        language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}

    if language not in language_module_map:
        # Unknown language: degrade to English with blank input.
        language = "en"
        text = " "

    # Text carrying a special marker is handled by a dedicated path.
    for special_s, special_l, target_symbol in special:
        if special_s in text and language == special_l:
            return clean_special(text, language, special_s, target_symbol, version)

    language_module = importlib.import_module("text." + language_module_map[language])
    if hasattr(language_module, "text_normalize"):
        norm_text = language_module.text_normalize(text)
    else:
        norm_text = text

    if language in ("zh", "yue"):
        phones, word2ph = language_module.g2p(norm_text)
        assert len(phones) == sum(word2ph)
        assert len(norm_text) == len(word2ph)
    elif language == "en":
        phones = language_module.g2p(norm_text)
        if len(phones) < 4:
            # Left-pad very short outputs with "," up to four phones.
            phones = [','] * (4 - len(phones)) + phones
        word2ph = None
    else:
        phones = language_module.g2p(norm_text)
        word2ph = None

    # Single pass; the original rebuilt this same list once per phone.
    phones = ['UNK' if ph not in symbols else ph for ph in phones]
    return phones, word2ph, norm_text
def clean_special(text, language, special_s, target_symbol, version=None):
    """Handle text that contains a special silence-segment (SP) marker.

    Every occurrence of ``special_s`` in ``text`` is replaced by ",", the
    text is normalized and converted to phones by the language module, and
    then every "," phone is replaced by ``target_symbol``. Note that this
    also rewrites "," phones that did not come from ``special_s``.

    Args:
        text: Raw input text containing ``special_s``.
        language: Language code; must be a key of the version's language map.
        special_s: The marker character(s) to look for in ``text``.
        target_symbol: Symbol emitted in place of each "," phone.
        version: Symbol-set version. When ``None`` it is read from the
            ``version`` environment variable, defaulting to "v2".

    Returns:
        A ``(phones, word2ph, norm_text)`` tuple, where ``word2ph`` is the
        second element of the language module's ``g2p`` result.

    Raises:
        KeyError: If ``language`` is not supported for the selected version.
        AssertionError: If ``g2p`` yields a phone outside the symbol table.
    """
    if version is None:
        version = os.environ.get('version', 'v2')
    if version == "v1":
        symbols = symbols_v1.symbols
        language_module_map = {"zh": "chinese", "ja": "japanese", "en": "english"}
    else:
        symbols = symbols_v2.symbols
        language_module_map = {"zh": "chinese2", "ja": "japanese", "en": "english", "ko": "korean", "yue": "cantonese"}

    text = text.replace(special_s, ",")
    language_module = importlib.import_module("text." + language_module_map[language])
    norm_text = language_module.text_normalize(text)
    g2p_result = language_module.g2p(norm_text)

    new_ph = []
    for ph in g2p_result[0]:
        assert ph in symbols
        new_ph.append(target_symbol if ph == "," else ph)
    return new_ph, g2p_result[1], norm_text
def text_to_sequence(text, language, version=None):
    """Clean ``text`` and convert the resulting phones to a sequence.

    Args:
        text: Raw input text.
        language: Language code forwarded to ``clean_text``.
        version: Symbol-set version used when the ``version`` environment
            variable is not set; if both are absent, "v2" is used. Note that
            here the environment variable takes precedence over the argument.

    Returns:
        Whatever ``cleaned_text_to_sequence`` returns for the cleaned phones.
    """
    version = os.environ.get('version', version)
    if version is None:
        version = 'v2'
    # clean_text requires `language` and returns (phones, word2ph, norm_text);
    # only the phones are passed on, with the version already resolved above.
    phones, _word2ph, _norm_text = clean_text(text, language, version)
    return cleaned_text_to_sequence(phones, version)
if __name__ == "__main__":
    # Manual smoke test: print the cleaning result for a Chinese sample.
    print(clean_text("你好%啊啊啊额、还是到付红四方。", "zh"))