File size: 7,369 Bytes
96bd1d3 aa72a6f 96bd1d3 aa72a6f 96bd1d3 aa72a6f 96bd1d3 aa72a6f 96bd1d3 aa72a6f 96bd1d3 aa72a6f 96bd1d3 aa72a6f 96bd1d3 aa72a6f 96bd1d3 cc912a2 96bd1d3 aa72a6f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
import re
from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
# from text.sanskrit import devanagari_to_ipa
# from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
# from text.thai import num_to_thai, latin_to_thai
# from text.shanghainese import shanghainese_to_ipa
# from text.cantonese import cantonese_to_ipa
# from text.ngu_dialect import ngu_dialect_to_ipa
def japanese_cleaners(text):
    '''Pipeline for Japanese text.

    Passes *text* through ``japanese_to_romaji_with_accent`` and appends
    ``'.'`` when the converted string ends with an ASCII letter.

    Returns the converted string. An empty conversion result is returned
    unchanged instead of raising IndexError.
    '''
    text = japanese_to_romaji_with_accent(text)
    # Guard the index: text[-1] on an empty string would raise IndexError.
    if text and re.match('[A-Za-z]', text[-1]):
        text += '.'
    return text
def japanese_cleaners2(text):
    '''Run japanese_cleaners, then rewrite 'ts' as 'ʦ' and '...' as '…'.'''
    cleaned = japanese_cleaners(text)
    cleaned = cleaned.replace('ts', 'ʦ')
    cleaned = cleaned.replace('...', '…')
    return cleaned
def korean_cleaners(text):
    '''Pipeline for Korean text.

    Applies ``latin_to_hangul``, ``number_to_hangul`` and ``divide_hangul``
    in that order, then appends ``'.'`` when the last character lies in the
    range U+3131..U+3163.

    Returns the processed string. An empty result is returned unchanged
    instead of raising IndexError.
    '''
    # NOTE(review): latin_to_hangul, number_to_hangul and divide_hangul are
    # not imported in the visible part of this file -- confirm they are in
    # scope, otherwise this function raises NameError when called.
    text = latin_to_hangul(text)
    text = number_to_hangul(text)
    text = divide_hangul(text)
    # Guard the index: text[-1] on an empty string would raise IndexError.
    if text and re.match('[\u3131-\u3163]', text[-1]):
        text += '.'
    return text
def chinese_cleaners(text):
    '''Pipeline for Chinese text.

    Applies ``number_to_chinese``, ``chinese_to_bopomofo`` and
    ``latin_to_bopomofo`` in that order, then appends ``'。'`` when the last
    character is one of ``ˉˊˇˋ˙``.

    Returns the processed string. An empty result is returned unchanged
    instead of raising IndexError.
    '''
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    # Guard the index: text[-1] on an empty string would raise IndexError.
    if text and re.match('[ˉˊˇˋ˙]', text[-1]):
        text += '。'
    return text
def zh_ja_mixture_cleaners(text):
    '''Clean text containing ``[ZH]...[ZH]`` and ``[JA]...[JA]`` segments.

    Each ``[ZH]`` segment is replaced by ``chinese_to_romaji`` of its
    content. Each ``[JA]`` segment is replaced by
    ``japanese_to_romaji_with_accent`` of its content with 'ts' -> 'ʦ',
    'u' -> 'ɯ' and '...' -> '…' applied. Every converted segment is followed
    by a single space. Text outside tagged segments is left as is.

    Trailing whitespace is then stripped and ``'.'`` is appended when the
    last character is one of ``A-Za-zɯɹəɥ→↓↑``.

    Returns the cleaned string; empty input yields an empty string.
    '''
    def _zh(match):
        return chinese_to_romaji(match.group(1)) + ' '

    def _ja(match):
        romaji = japanese_to_romaji_with_accent(match.group(1))
        romaji = romaji.replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')
        return romaji + ' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]', _zh, text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', _ja, text)
    # Strip only whitespace: blindly dropping the last character would eat a
    # real character whenever the input does not end with a tagged segment.
    text = text.rstrip()
    # Guard the index: text[-1] on an empty string would raise IndexError.
    if text and re.match('[A-Za-zɯɹəɥ→↓↑]', text[-1]):
        text += '.'
    return text
def sanskrit_cleaners(text):
    '''Normalise Sanskrit text.

    Replaces '॥' with '।' and 'ॐ' with 'ओम्', then appends ' ।' when the
    result does not already end with '।'.

    Returns the normalised string. Empty input is returned unchanged
    instead of raising IndexError.
    '''
    text = text.replace('॥', '।').replace('ॐ', 'ओम्')
    # Guard the index: text[-1] on an empty string would raise IndexError.
    if text and text[-1] != '।':
        text += ' ।'
    return text
def cjks_cleaners(text):
    '''Clean text containing [ZH], [JA], [KO], [SA] and [EN] tagged segments.

    Each ``[XX]...[XX]`` segment is replaced by the output of its converter
    followed by a single space:

    * ``[ZH]`` -> ``chinese_to_lazy_ipa``
    * ``[JA]`` -> ``japanese_to_ipa``
    * ``[KO]`` -> ``korean_to_lazy_ipa``
    * ``[SA]`` -> ``devanagari_to_ipa``
    * ``[EN]`` -> ``english_to_lazy_ipa``

    Text outside tagged segments is left as is. Trailing whitespace is then
    stripped and ``'.'`` is appended unless the text already ends with one
    of ``. , ! ? - … ~``.

    Returns the cleaned string; empty input yields an empty string.
    '''
    # NOTE(review): korean_to_lazy_ipa is not imported in the visible part of
    # this file and the devanagari_to_ipa import is commented out -- [KO] and
    # [SA] segments will raise NameError unless those names are in scope.
    # The converters are wrapped in lambdas so each name is only looked up
    # when a segment with that tag is actually present.
    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]',
                  lambda x: japanese_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[KO\](.*?)\[KO\]',
                  lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[SA\](.*?)\[SA\]',
                  lambda x: devanagari_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[EN\](.*?)\[EN\]',
                  lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
    # Strip only whitespace: blindly dropping the last character would eat a
    # real character whenever the input does not end with a tagged segment.
    text = text.rstrip()
    # Guard the index: text[-1] on an empty string would raise IndexError.
    if text and re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
def cjke_cleaners(text):
    '''Clean text containing [ZH], [JA], [KO] and [EN] tagged segments.

    Each ``[XX]...[XX]`` segment is replaced by its converted form followed
    by a single space:

    * ``[ZH]`` -> ``chinese_to_lazy_ipa``, then 'ʧ' -> 'tʃ', 'ʦ' -> 'ts',
      'ɥan' -> 'ɥæn'
    * ``[JA]`` -> ``japanese_to_ipa``, then the same three replacements plus
      'ʥ' -> 'dz'
    * ``[KO]`` -> ``korean_to_ipa``
    * ``[EN]`` -> ``english_to_ipa2``, then 'ɑ' -> 'a', 'ɔ' -> 'o',
      'ɛ' -> 'e', 'ɪ' -> 'i', 'ʊ' -> 'u'

    Text outside tagged segments is left as is. Trailing whitespace is then
    stripped and ``'.'`` is appended unless the text already ends with one
    of ``. , ! ? - … ~``.

    Returns the cleaned string; empty input yields an empty string.
    '''
    # The converters are only looked up when a handler actually runs, so a
    # missing converter does not affect input without that tag.
    def _zh(match):
        ipa = chinese_to_lazy_ipa(match.group(1))
        ipa = ipa.replace('ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')
        return ipa + ' '

    def _ja(match):
        ipa = japanese_to_ipa(match.group(1))
        ipa = ipa.replace('ʧ', 'tʃ').replace(
            'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
        return ipa + ' '

    def _ko(match):
        # NOTE(review): korean_to_ipa is not imported in the visible part of
        # this file -- [KO] segments raise NameError unless it is in scope.
        return korean_to_ipa(match.group(1)) + ' '

    def _en(match):
        ipa = english_to_ipa2(match.group(1))
        ipa = ipa.replace('ɑ', 'a').replace(
            'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')
        return ipa + ' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]', _zh, text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', _ja, text)
    text = re.sub(r'\[KO\](.*?)\[KO\]', _ko, text)
    text = re.sub(r'\[EN\](.*?)\[EN\]', _en, text)
    # Strip only whitespace: blindly dropping the last character would eat a
    # real character whenever the input does not end with a tagged segment.
    text = text.rstrip()
    # Guard the index: text[-1] on an empty string would raise IndexError.
    if text and re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
def cjke_cleaners2(text):
    '''Clean text containing [ZH], [JA], [KO] and [EN] tagged segments.

    Each ``[XX]...[XX]`` segment is replaced by the output of its converter
    followed by a single space:

    * ``[ZH]`` -> ``chinese_to_ipa``
    * ``[JA]`` -> ``japanese_to_ipa2``
    * ``[KO]`` -> ``korean_to_ipa``
    * ``[EN]`` -> ``english_to_ipa2``

    Text outside tagged segments is left as is. Trailing whitespace is then
    stripped and ``'.'`` is appended unless the text already ends with one
    of ``. , ! ? - … ~``.

    Returns the cleaned string; empty input yields an empty string.
    '''
    # NOTE(review): korean_to_ipa is not imported in the visible part of this
    # file -- [KO] segments raise NameError unless it is in scope.
    # The converters are wrapped in lambdas so each name is only looked up
    # when a segment with that tag is actually present.
    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]',
                  lambda x: japanese_to_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\[KO\](.*?)\[KO\]',
                  lambda x: korean_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[EN\](.*?)\[EN\]',
                  lambda x: english_to_ipa2(x.group(1))+' ', text)
    # Strip only whitespace: blindly dropping the last character would eat a
    # real character whenever the input does not end with a tagged segment.
    text = text.rstrip()
    # Guard the index: text[-1] on an empty string would raise IndexError.
    if text and re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
def thai_cleaners(text):
    '''Pipeline for Thai text: apply num_to_thai, then latin_to_thai.'''
    # NOTE(review): the import of num_to_thai / latin_to_thai is commented
    # out at the top of this file -- confirm they are in scope before use.
    with_thai_numbers = num_to_thai(text)
    return latin_to_thai(with_thai_numbers)
def shanghainese_cleaners(text):
    '''Pipeline for Shanghainese text.

    Passes *text* through ``shanghainese_to_ipa`` and appends ``'.'`` unless
    the result already ends with one of ``. , ! ? - … ~``.

    Returns the converted string. An empty conversion result is returned
    unchanged instead of raising IndexError.
    '''
    # NOTE(review): the import of shanghainese_to_ipa is commented out at the
    # top of this file -- confirm it is in scope before use.
    text = shanghainese_to_ipa(text)
    # Guard the index: text[-1] on an empty string would raise IndexError.
    if text and re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
def chinese_dialect_cleaners(text):
    '''Clean text made of two-letter tagged segments such as [MD]...[MD].

    Segments are rewritten in this order, each converted segment followed by
    a single space:

    * ``[MD]`` -> ``chinese_to_ipa2(content)``
    * ``[TW]`` -> ``chinese_to_ipa2(content, True)``
    * ``[JA]`` -> ``japanese_to_ipa3(content)`` with 'Q' -> 'ʔ'
    * ``[SH]`` -> ``shanghainese_to_ipa(content)`` with the digits 1/5/6/7/8
      and 'ᴀ'/'ᴇ' substituted
    * ``[GD]`` -> ``cantonese_to_ipa(content)``
    * ``[EN]`` -> ``english_to_lazy_ipa2(content)``
    * any remaining ``[XX]...[XX]`` pair of uppercase letters ->
      ``ngu_dialect_to_ipa(content, 'XX')`` with 'ʣ', 'ʥ', 'ʦ', 'ʨ' expanded

    Finally trailing whitespace is removed and '.' is appended unless the
    text ends with one of ``. , ! ? - … ~``.
    '''
    # NOTE(review): the imports of shanghainese_to_ipa, cantonese_to_ipa and
    # ngu_dialect_to_ipa are commented out at the top of this file -- confirm
    # they are in scope. Each handler looks its converter up only when called.
    def _md(m):
        return chinese_to_ipa2(m.group(1)) + ' '

    def _tw(m):
        return chinese_to_ipa2(m.group(1), True) + ' '

    def _ja(m):
        return japanese_to_ipa3(m.group(1)).replace('Q', 'ʔ') + ' '

    def _sh(m):
        ipa = shanghainese_to_ipa(m.group(1))
        for old, new in (('1', '˥˧'), ('5', '˧˧˦'), ('6', '˩˩˧'),
                         ('7', '˥'), ('8', '˩˨'), ('ᴀ', 'ɐ'), ('ᴇ', 'e')):
            ipa = ipa.replace(old, new)
        return ipa + ' '

    def _gd(m):
        return cantonese_to_ipa(m.group(1)) + ' '

    def _en(m):
        return english_to_lazy_ipa2(m.group(1)) + ' '

    def _other(m):
        ipa = ngu_dialect_to_ipa(m.group(2), m.group(1))
        for old, new in (('ʣ', 'dz'), ('ʥ', 'dʑ'), ('ʦ', 'ts'), ('ʨ', 'tɕ')):
            ipa = ipa.replace(old, new)
        return ipa + ' '

    # Order matters: the generic two-letter pattern must run last so it only
    # sees tags not handled by the specific patterns before it.
    substitutions = (
        (r'\[MD\](.*?)\[MD\]', _md),
        (r'\[TW\](.*?)\[TW\]', _tw),
        (r'\[JA\](.*?)\[JA\]', _ja),
        (r'\[SH\](.*?)\[SH\]', _sh),
        (r'\[GD\](.*?)\[GD\]', _gd),
        (r'\[EN\](.*?)\[EN\]', _en),
        (r'\[([A-Z]{2})\](.*?)\[\1\]', _other),
    )
    for pattern, handler in substitutions:
        text = re.sub(pattern, handler, text)

    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
|