ORI-Muchim
committed on
Commit
•
d93648d
1
Parent(s):
b8edf2a
Upload 8 files
Browse files- text/__init__.py +56 -0
- text/__pycache__/__init__.cpython-38.pyc +0 -0
- text/__pycache__/cleaners.cpython-38.pyc +0 -0
- text/__pycache__/korean.cpython-38.pyc +0 -0
- text/__pycache__/symbols.cpython-38.pyc +0 -0
- text/cleaners.py +14 -0
- text/korean.py +231 -0
- text/symbols.py +10 -0
text/__init__.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/keithito/tacotron """
|
2 |
+
from text import cleaners
|
3 |
+
from text.symbols import symbols
|
4 |
+
|
5 |
+
|
6 |
+
# Lookup tables between each symbol and its numeric ID (its index in `symbols`).
_id_to_symbol = dict(enumerate(symbols))
_symbol_to_id = {symbol: index for index, symbol in _id_to_symbol.items()}
|
9 |
+
|
10 |
+
|
11 |
+
def text_to_sequence(text, cleaner_names):
    '''Clean `text` with the named cleaners and encode the result as symbol IDs.

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integer IDs, one per cleaned character that has an entry in the
      symbol table; characters without an entry are skipped.
    '''
    cleaned = _clean_text(text, cleaner_names)
    return [_symbol_to_id[char] for char in cleaned if char in _symbol_to_id]
|
28 |
+
|
29 |
+
|
30 |
+
def cleaned_text_to_sequence(cleaned_text):
    '''Encode text that has already been cleaned as a list of symbol IDs.

    Args:
      cleaned_text: string to convert to a sequence (no cleaners are applied)

    Returns:
      List of integer IDs; characters missing from the symbol table are skipped.
    '''
    ids = []
    for char in cleaned_text:
        symbol_id = _symbol_to_id.get(char)
        if symbol_id is not None:
            ids.append(symbol_id)
    return ids
|
39 |
+
|
40 |
+
|
41 |
+
def sequence_to_text(sequence):
    '''Decode a sequence of symbol IDs back into a string.'''
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
|
48 |
+
|
49 |
+
|
50 |
+
def _clean_text(text, cleaner_names):
    '''Apply the named cleaner functions to `text`, in the order given.

    Args:
      text: string to clean
      cleaner_names: names of attributes of the `cleaners` module to call

    Returns:
      The text after every cleaner has been applied (unchanged if no names
      are given).

    Raises:
      AttributeError: if a name does not resolve to a cleaner.
    '''
    for name in cleaner_names:
        # The default matters: without it getattr() itself raises a bare
        # AttributeError and the descriptive error below is never reached.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            # AttributeError keeps callers that caught the previous implicit
            # error working, and is still caught by `except Exception`.
            raise AttributeError('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
|
text/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (2.14 kB). View file
|
|
text/__pycache__/cleaners.cpython-38.pyc
ADDED
Binary file (560 Bytes). View file
|
|
text/__pycache__/korean.cpython-38.pyc
ADDED
Binary file (5.99 kB). View file
|
|
text/__pycache__/symbols.cpython-38.pyc
ADDED
Binary file (362 Bytes). View file
|
|
text/cleaners.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from unidecode import unidecode
|
3 |
+
from text.korean import latin_to_hangul, divide_hangul, fix_g2pk2_error
|
4 |
+
from g2pk2 import G2p
|
5 |
+
|
6 |
+
|
7 |
+
def korean_cleaners(text):
    '''Turn raw text into decomposed jamo text.

    Steps, in order: latin_to_hangul, g2pk2 grapheme-to-phoneme conversion,
    divide_hangul, fix_g2pk2_error, and finally a '.' is appended when the
    result ends in a character in the range U+3131-U+3163.
    '''
    text = latin_to_hangul(text)

    # Constructing G2p on every call is wasteful; build it on first use and
    # keep it on the function object for later calls.
    g2p = getattr(korean_cleaners, '_g2p', None)
    if g2p is None:
        g2p = korean_cleaners._g2p = G2p()

    text = g2p(text)
    text = divide_hangul(text)
    text = fix_g2pk2_error(text)
    # Close text that ends in a jamo with a full stop.
    text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
    return text
|
text/korean.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from jamo import h2j, j2hcj
|
3 |
+
import ko_pron
|
4 |
+
from g2pk2 import G2p
|
5 |
+
|
6 |
+
|
7 |
+
# Space-separated Korean classifiers that are preceded by pure Korean numerals.
_korean_classifiers = (
    '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 '
    '보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
)
|
9 |
+
|
10 |
+
# (compiled pattern, replacement) pairs: each vowel jamo on the left is
# rewritten as the two simpler vowel jamo on the right.
# Compound final consonants (ㄳ ㄵ ㄶ ㄺ ㄻ ㄼ ㄽ ㄾ ㄿ ㅀ ㅄ) are deliberately not
# listed; the original note attributes them to g2pk2's syllable-ending rule.
_hangul_divided = [(re.compile(compound), parts) for compound, parts in (
    ('ㅘ', 'ㅗㅏ'),
    ('ㅙ', 'ㅗㅐ'),
    ('ㅚ', 'ㅗㅣ'),
    ('ㅝ', 'ㅜㅓ'),
    ('ㅞ', 'ㅜㅔ'),
    ('ㅟ', 'ㅜㅣ'),
    ('ㅢ', 'ㅡㅣ'),
    ('ㅑ', 'ㅣㅏ'),
    ('ㅒ', 'ㅣㅐ'),
    ('ㅕ', 'ㅣㅓ'),
    ('ㅖ', 'ㅣㅔ'),
    ('ㅛ', 'ㅣㅗ'),
    ('ㅠ', 'ㅣㅜ'),
)]
|
37 |
+
|
38 |
+
# (compiled case-insensitive pattern, replacement) pairs giving the Hangul
# spelling of each Latin letter's name.
_latin_to_hangul = [(re.compile(letter, re.IGNORECASE), hangul) for letter, hangul in (
    ('a', '에이'),
    ('b', '비'),
    ('c', '시'),
    ('d', '디'),
    ('e', '이'),
    ('f', '에프'),
    ('g', '지'),
    ('h', '에이치'),
    ('i', '아이'),
    ('j', '제이'),
    ('k', '케이'),
    ('l', '엘'),
    ('m', '엠'),
    ('n', '엔'),
    ('o', '오'),
    ('p', '피'),
    ('q', '큐'),
    ('r', '아르'),
    ('s', '에스'),
    ('t', '티'),
    ('u', '유'),
    ('v', '브이'),
    ('w', '더블유'),
    ('x', '엑스'),
    ('y', '와이'),
    ('z', '제트'),
)]
|
67 |
+
|
68 |
+
# (compiled case-insensitive pattern, replacement) pairs mapping IPA symbols to
# their "lazy IPA" counterparts; the last three combining marks are removed.
_ipa_to_lazy_ipa = [(re.compile(ipa, re.IGNORECASE), lazy) for ipa, lazy in (
    ('t͡ɕ','ʧ'),
    ('d͡ʑ','ʥ'),
    ('ɲ','n^'),
    ('ɕ','ʃ'),
    ('ʷ','w'),
    ('ɭ','l`'),
    ('ʎ','ɾ'),
    ('ɣ','ŋ'),
    ('ɰ','ɯ'),
    ('ʝ','j'),
    ('ʌ','ə'),
    ('ɡ','g'),
    ('\u031a','#'),
    ('\u0348','='),
    ('\u031e',''),
    ('\u0320',''),
    ('\u0339',''),
)]
|
88 |
+
|
89 |
+
|
90 |
+
def fix_g2pk2_error(text):
    '''Rewrite 'ㅇㅡㄹ ㄹ' and 'ㄹㅡㄹ ㄹ' so that the final 'ㄹ' becomes 'ㄴ'.

    Occurrences are handled left to right without overlapping; all other
    text is returned unchanged.
    '''
    return re.sub('(ㅇㅡㄹ|ㄹㅡㄹ) ㄹ', lambda found: found.group(1) + ' ㄴ', text)
|
103 |
+
|
104 |
+
|
105 |
+
def latin_to_hangul(text):
    '''Apply every substitution in the _latin_to_hangul table to `text`.'''
    for pattern, hangul in _latin_to_hangul:
        text = pattern.sub(hangul, text)
    return text
|
109 |
+
|
110 |
+
|
111 |
+
def divide_hangul(text):
    '''Decompose `text` with jamo's h2j/j2hcj, then apply the _hangul_divided substitutions.'''
    jamo_text = j2hcj(h2j(text))
    for pattern, parts in _hangul_divided:
        jamo_text = pattern.sub(parts, jamo_text)
    return jamo_text
|
116 |
+
|
117 |
+
|
118 |
+
def hangul_number(num, sino=True):
    '''Spell out a string of Arabic digits in Hangul.

    Reference https://github.com/Kyubyong/g2pK

    Args:
      num: digit string; ',' separators are ignored (e.g. '12,345')
      sino: if True, Sino-Korean numerals (일, 이, 삼, ...) are used throughout.
        Otherwise the ones and tens places use the native forms
        (한, 두, ... / 열, 스물, ...); hundreds and above stay Sino-Korean.

    Returns:
      The spelled-out number ('' for an empty string).

    Raises:
      ValueError: if a non-zero digit sits above the largest supported unit
        (numbers of more than 24 significant digits).
    '''
    num = num.replace(',', '')

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    digits = '123456789'
    digit2name = dict(zip(digits, '일이삼사오육칠팔구'))
    digit2mod = dict(zip(digits, '한 두 세 네 다섯 여섯 일곱 여덟 아홉'.split()))
    digit2dec = dict(zip(digits, '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'.split()))
    small_units = ('', '십', '백', '천')              # position inside a 4-digit group
    group_units = ('', '만', '억', '조', '경', '해')  # one unit per 4-digit group

    spelledout = []
    for index, digit in enumerate(num):
        place = len(num) - index - 1  # power of ten of this digit
        group, slot = divmod(place, 4)

        if digit == '0':
            # A zero is silent, but in the lowest slot of a group it still
            # carries the group unit (만, 억, ...) when one of the three
            # higher digits of that same group was spoken.
            if slot == 0 and group > 0 and ''.join(spelledout[-3:]) != '':
                spelledout.append(group_units[group])
            else:
                spelledout.append('')
            continue

        if group >= len(group_units):
            raise ValueError('Number too large to spell out: %s' % num)

        if not sino and place == 0:
            name = digit2mod.get(digit, '')
        elif not sino and place == 1:
            name = digit2dec.get(digit, '')
        elif slot == 0:
            name = digit2name.get(digit, '') + group_units[group]
            # Only a leading 1 is dropped before 만 (10000 -> 만); inside a
            # larger count it must be kept (210000 -> 이십일만, not 이십만).
            # 억 and above always keep it (일억).
            if digit == '1' and group == 1 and index == 0:
                name = group_units[group]
        else:
            # 십 / 백 / 천 do not take a leading 일, in any group.
            prefix = '' if digit == '1' else digit2name.get(digit, '')
            name = prefix + small_units[slot]
        spelledout.append(name)
    return ''.join(spelledout)
|
195 |
+
|
196 |
+
|
197 |
+
def number_to_hangul(text):
    '''Spell out the digits in `text` in Hangul.

    Reference https://github.com/Kyubyong/g2pK

    A number directly followed by Hangul is read as a whole number: with
    native Korean numerals when the following word starts with one of the
    entries of `_korean_classifiers`, with Sino-Korean numerals otherwise.
    Digits that remain afterwards are read out one by one.
    '''
    # Compare against whole entries; testing against the raw space-separated
    # string would also accept fragments of entries (e.g. '리' from '마리').
    classifiers = set(_korean_classifiers.split())

    def spell(match):
        num, classifier = match.groups()
        native = classifier[:2] in classifiers or classifier[0] in classifiers
        return hangul_number(num, sino=not native) + classifier

    # Substituting match by match keeps the result deterministic and stops a
    # short token ('2개') from being replaced inside a longer one ('12개').
    # The class covers the full Hangul Syllables block (U+AC00-U+D7A3).
    text = re.sub(r'(\d[\d,]*)([\uac00-\ud7a3]+)', spell, text)
    # digit by digit for remaining digits
    return text.translate(str.maketrans('0123456789', '영일이삼사오육칠팔구'))
|
213 |
+
|
214 |
+
|
215 |
+
def korean_to_lazy_ipa(text):
    '''Transcribe Korean text with ko_pron's 'ipa' mode, then apply the _ipa_to_lazy_ipa table.'''
    def to_ipa(hangul_run):
        # Keep only what precedes the first '] ~ [' separator in the output.
        return ko_pron.romanise(hangul_run.group(0), 'ipa').split('] ~ [')[0]

    text = number_to_hangul(latin_to_hangul(text))
    text = re.sub('[\uac00-\ud7af]+', to_ipa, text)
    for pattern, lazy in _ipa_to_lazy_ipa:
        text = pattern.sub(lazy, text)
    return text
|
222 |
+
|
223 |
+
|
224 |
+
def korean_to_ipa(text):
    '''Convert Korean text to IPA, writing 'ʧ' as 'tʃ' and 'ʥ' as 'dʑ'.

    The text goes through latin_to_hangul, number_to_hangul, g2pk2,
    fix_g2pk2_error and korean_to_lazy_ipa before the final replacements.
    '''
    text = latin_to_hangul(text)
    text = number_to_hangul(text)

    # Constructing G2p on every call is wasteful; build it on first use and
    # keep it on the function object for later calls.
    g2p = getattr(korean_to_ipa, '_g2p', None)
    if g2p is None:
        g2p = korean_to_ipa._g2p = G2p()

    text = g2p(text)
    # NOTE(review): fix_g2pk2_error matches decomposed jamo sequences, but no
    # divide_hangul call precedes it here - confirm it can match at this point.
    text = fix_g2pk2_error(text)
    text = korean_to_lazy_ipa(text)
    return text.replace('ʧ','tʃ').replace('ʥ','dʑ')
|
text/symbols.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# korean_cleaners
_pad = '_'
_punctuation = ',.!?…~'
_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '

# Exported symbol list: the pad symbol, then punctuation, then the letters.
symbols = [_pad, *_punctuation, *_letters]
# ID of the space symbol.
SPACE_ID = symbols.index(' ')
|