ORI-Muchim committed on
Commit
d93648d
1 Parent(s): b8edf2a

Upload 8 files

Browse files
text/__init__.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+ from text import cleaners
3
+ from text.symbols import symbols
4
+
5
+
6
# Lookup tables between each symbol and its numeric ID (its index in `symbols`):
_symbol_to_id = {symbol: index for index, symbol in enumerate(symbols)}
_id_to_symbol = dict(enumerate(symbols))
9
+
10
+
11
def text_to_sequence(text, cleaner_names):
    '''Cleans a string of text and converts it to a sequence of symbol IDs.
      Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
      Returns:
        List of integers, one for each character of the cleaned text that has
        an entry in the symbol table. Characters without an entry are skipped.
    '''
    cleaned = _clean_text(text, cleaner_names)
    return [_symbol_to_id[char] for char in cleaned if char in _symbol_to_id]
28
+
29
+
30
def cleaned_text_to_sequence(cleaned_text):
    '''Converts already-cleaned text to a sequence of symbol IDs.
      Args:
        cleaned_text: string to convert to a sequence; no cleaners are applied
      Returns:
        List of integers, one for each character that has an entry in the
        symbol table. Characters without an entry are skipped.
    '''
    ids = []
    for char in cleaned_text:
        symbol_id = _symbol_to_id.get(char)
        # IDs are list indices, so None can only mean "not in the table".
        if symbol_id is not None:
            ids.append(symbol_id)
    return ids
39
+
40
+
41
def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string.

    Every ID is looked up in the ID-to-symbol table, so an ID that is not in
    the table raises KeyError.
    '''
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
48
+
49
+
50
def _clean_text(text, cleaner_names):
    '''Runs the text through each named cleaner, in order.
      Args:
        text: string to clean
        cleaner_names: names of functions defined in the `cleaners` module
      Returns:
        The text after every cleaner has been applied (unchanged if
        `cleaner_names` is empty).
      Raises:
        Exception: if a name does not correspond to a cleaner in `cleaners`.
    '''
    for name in cleaner_names:
        # A default is required: plain getattr() raises AttributeError for a
        # missing attribute, which would skip the "Unknown cleaner" check.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
text/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (2.14 kB). View file
 
text/__pycache__/cleaners.cpython-38.pyc ADDED
Binary file (560 Bytes). View file
 
text/__pycache__/korean.cpython-38.pyc ADDED
Binary file (5.99 kB). View file
 
text/__pycache__/symbols.cpython-38.pyc ADDED
Binary file (362 Bytes). View file
 
text/cleaners.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from unidecode import unidecode
3
+ from text.korean import latin_to_hangul, divide_hangul, fix_g2pk2_error
4
+ from g2pk2 import G2p
5
+
6
+
7
def korean_cleaners(text):
    '''Cleaner pipeline for Korean text.

    Steps, in order: spell Latin letters in Hangul, run the g2pk2 `G2p`
    converter, split the result into individual jamo, patch the known g2pk2
    output error, and append a '.' when the text ends in a jamo
    (U+3131-U+3163).
    '''
    text = latin_to_hangul(text)
    # Build the G2p converter once and reuse it on later calls instead of
    # constructing a new one per call.
    # NOTE(review): assumes a G2p instance is reusable and costly to build
    # (presumably loads its rule tables/dictionaries) - confirm against g2pk2.
    g2p = getattr(korean_cleaners, '_g2p', None)
    if g2p is None:
        g2p = korean_cleaners._g2p = G2p()
    text = g2p(text)
    text = divide_hangul(text)
    text = fix_g2pk2_error(text)
    text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
    return text
text/korean.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from jamo import h2j, j2hcj
3
+ import ko_pron
4
+ from g2pk2 import G2p
5
+
6
+
7
# Korean classifiers (counters) that are preceded by pure Korean numerals,
# kept as one space-separated string.
_korean_classifiers = ' '.join((
    '군데', '권', '개', '그루', '닢', '대', '두', '마리',
    '모', '모금', '뭇', '발', '발짝', '방', '번', '벌',
    '보루', '살', '수', '술', '시', '쌈', '움큼', '정',
    '짝', '채', '척', '첩', '축', '켤레', '톨', '통',
))
9
+
10
# (compiled pattern, replacement) pairs that split a compound vowel into two
# simpler vowels, e.g. ㅘ -> ㅗㅏ.
# The consonant clusters (ㄳ ㄵ ㄶ ㄺ ㄻ ㄼ ㄽ ㄾ ㄿ ㅀ ㅄ) are intentionally not
# listed; the original note on them reads "g2pk2, A Syllable-ending Rule".
_hangul_divided = [
    (re.compile(compound), parts)
    for compound, parts in (
        ('ㅘ', 'ㅗㅏ'),
        ('ㅙ', 'ㅗㅐ'),
        ('ㅚ', 'ㅗㅣ'),
        ('ㅝ', 'ㅜㅓ'),
        ('ㅞ', 'ㅜㅔ'),
        ('ㅟ', 'ㅜㅣ'),
        ('ㅢ', 'ㅡㅣ'),
        ('ㅑ', 'ㅣㅏ'),
        ('ㅒ', 'ㅣㅐ'),
        ('ㅕ', 'ㅣㅓ'),
        ('ㅖ', 'ㅣㅔ'),
        ('ㅛ', 'ㅣㅗ'),
        ('ㅠ', 'ㅣㅜ'),
    )
]
37
+
38
# (compiled case-insensitive pattern, Hangul spelling) pairs, one per Latin letter:
_latin_to_hangul = [
    (re.compile(letter, re.IGNORECASE), hangul)
    for letter, hangul in (
        ('a', '에이'),
        ('b', '비'),
        ('c', '시'),
        ('d', '디'),
        ('e', '이'),
        ('f', '에프'),
        ('g', '지'),
        ('h', '에이치'),
        ('i', '아이'),
        ('j', '제이'),
        ('k', '케이'),
        ('l', '엘'),
        ('m', '엠'),
        ('n', '엔'),
        ('o', '오'),
        ('p', '피'),
        ('q', '큐'),
        ('r', '아르'),
        ('s', '에스'),
        ('t', '티'),
        ('u', '유'),
        ('v', '브이'),
        ('w', '더블유'),
        ('x', '엑스'),
        ('y', '와이'),
        ('z', '제트'),
    )
]
67
+
68
# (compiled case-insensitive pattern, replacement) pairs that rewrite IPA
# symbols into the simplified "lazy IPA" notation; the last three entries
# delete combining diacritics outright.
_ipa_to_lazy_ipa = [
    (re.compile(ipa, re.IGNORECASE), lazy)
    for ipa, lazy in (
        ('t͡ɕ', 'ʧ'),
        ('d͡ʑ', 'ʥ'),
        ('ɲ', 'n^'),
        ('ɕ', 'ʃ'),
        ('ʷ', 'w'),
        ('ɭ', 'l`'),
        ('ʎ', 'ɾ'),
        ('ɣ', 'ŋ'),
        ('ɰ', 'ɯ'),
        ('ʝ', 'j'),
        ('ʌ', 'ə'),
        ('ɡ', 'g'),
        ('\u031a', '#'),
        ('\u0348', '='),
        ('\u031e', ''),
        ('\u0320', ''),
        ('\u0339', ''),
    )
]
88
+
89
+
90
def fix_g2pk2_error(text):
    '''Patch one jamo pattern in already-divided text.

    Wherever 'ㅇㅡㄹ' or 'ㄹㅡㄹ' is followed by a single space and then 'ㄹ',
    that trailing 'ㄹ' becomes 'ㄴ'. Matches are taken left to right without
    overlapping; all other text is returned untouched.
    '''
    return re.sub(r'(ㅇㅡㄹ|ㄹㅡㄹ) ㄹ', r'\1 ㄴ', text)
103
+
104
+
105
def latin_to_hangul(text):
    '''Replace every Latin letter (either case) with its Hangul spelling.

    The letter patterns are applied one after another, in table order.
    '''
    for pattern, hangul in _latin_to_hangul:
        text = pattern.sub(hangul, text)
    return text
109
+
110
+
111
def divide_hangul(text):
    '''Break Hangul text into individual jamo with compound vowels split.

    The text goes through the `jamo` package's `h2j` and then `j2hcj`
    (presumably syllables -> jamo -> compatibility jamo; confirm against the
    jamo docs), after which each compound vowel in the divided-vowel table is
    replaced by its two component vowels.
    '''
    divided = j2hcj(h2j(text))
    for pattern, parts in _hangul_divided:
        divided = pattern.sub(parts, divided)
    return divided
116
+
117
+
118
def hangul_number(num, sino=True):
    '''Reference https://github.com/Kyubyong/g2pK

    Spell out a string of decimal digits in Hangul.

    Args:
      num: digit string; commas are stripped before reading
      sino: if True, every place is read with Sino-Korean numerals
        (일, 이, 삼 ...); if False, the tens and ones places use the native
        Korean forms (열/스물 ... and 한/두/세 ...) while higher places stay
        Sino-Korean

    Returns:
      The Hangul reading; '영' for '0' and '스무' for a native '20'.

    Raises:
      ValueError: if the number has more than 16 significant digits, which
        is beyond the largest unit (조) this reader knows.
    '''
    num = re.sub(',', '', num)

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    # Fail with a clear message instead of an UnboundLocalError further down.
    if len(num.lstrip('0')) > 16:
        raise ValueError('Number too large to spell out: %s' % num)

    digits = '123456789'
    digit2name = dict(zip(digits, '일이삼사오육칠팔구'))
    digit2mod = dict(zip(digits, '한 두 세 네 다섯 여섯 일곱 여덟 아홉'.split()))
    digit2dec = dict(zip(digits, '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'.split()))

    # Unit within a four-digit group, indexed by place % 4, and the unit that
    # names each whole group, keyed by the place of the group's lowest digit.
    small_units = ('', '십', '백', '천')
    group_units = {4: '만', 8: '억', 12: '조'}

    spelledout = []
    for index, digit in enumerate(num):
        place = len(num) - index - 1  # 0 = ones, 1 = tens, ...

        if digit == '0':
            # A zero is silent. The one exception is the lowest digit of a
            # group whose higher digits were read: the group unit (만/억/조)
            # still has to be emitted, so fall through in that case.
            if place % 4 != 0 or not any(spelledout[-3:]):
                spelledout.append('')
                continue

        if place == 0:
            name = (digit2name if sino else digit2mod).get(digit, '')
        elif place == 1 and not sino:
            name = digit2dec.get(digit, '')
        else:
            name = digit2name.get(digit, '')
            if digit == '1':
                if place < 8 and place % 4 != 0:
                    # 일십/일백/일천 are read 십/백/천 below the 억 group.
                    name = ''
                elif place == 4 and not any(spelledout[-3:]):
                    # 10000 is read 만, but only when no higher digit of the
                    # 만 group was read: 110000 is 십일만, not 십만.
                    name = ''
            name += group_units.get(place) or small_units[place % 4]
        spelledout.append(name)
    return ''.join(spelledout)
195
+
196
+
197
def number_to_hangul(text):
    '''Reference https://github.com/Kyubyong/g2pK

    Spell out the numbers in `text` in Hangul.

    A run of digits (commas allowed) directly followed by Hangul syllables is
    read as a number: with native Korean numerals when the following word
    starts with a known classifier, otherwise with Sino-Korean numerals.
    Any digits left over afterwards are read one digit at a time.
    '''
    # Whole-word lookup: a substring test on the space-separated string would
    # also accept fragments such as '리' (from '마리') or '그' (from '그루').
    classifiers = set(_korean_classifiers.split())

    def _spell(match):
        num, classifier = match.groups()
        native = classifier[:2] in classifiers or classifier[0] in classifiers
        return hangul_number(num, sino=not native) + classifier

    # Substituting each match in place keeps one token from rewriting part of
    # another (e.g. '1개' inside '11개'), which replacing by text could do.
    # The class covers the full Hangul Syllables block (U+AC00-U+D7A3).
    text = re.sub(r'(\d[\d,]*)([\uac00-\ud7a3]+)', _spell, text)
    # digit by digit for remaining digits
    return text.translate(str.maketrans('0123456789', '영일이삼사오육칠팔구'))
213
+
214
+
215
def korean_to_lazy_ipa(text):
    '''Convert Korean text to the simplified "lazy IPA" notation.

    Latin letters and numbers are first spelled out in Hangul. Each run of
    Hangul syllables is then romanised to IPA with `ko_pron`, keeping only
    the part of its result before the first '] ~ [' separator, and finally
    the IPA-to-lazy-IPA substitutions are applied in table order.
    '''
    def _first_ipa_reading(match):
        ipa = ko_pron.romanise(match.group(0), 'ipa')
        return ipa.split('] ~ [')[0]

    text = number_to_hangul(latin_to_hangul(text))
    text = re.sub('[\uac00-\ud7af]+', _first_ipa_reading, text)
    for pattern, lazy in _ipa_to_lazy_ipa:
        text = pattern.sub(lazy, text)
    return text
222
+
223
+
224
def korean_to_ipa(text):
    '''Convert Korean text to IPA.

    Latin letters and numbers are spelled out in Hangul, the text is passed
    through the g2pk2 `G2p` converter and the g2pk2 error patch, converted to
    lazy IPA, and finally the lazy symbols 'ʧ' and 'ʥ' are expanded to
    'tʃ' and 'dʑ'.
    '''
    text = latin_to_hangul(text)
    text = number_to_hangul(text)
    # Build the G2p converter once and reuse it on later calls instead of
    # constructing a new one per call.
    # NOTE(review): assumes a G2p instance is reusable and costly to build
    # (presumably loads its rule tables/dictionaries) - confirm against g2pk2.
    g2p = getattr(korean_to_ipa, '_g2p', None)
    if g2p is None:
        g2p = korean_to_ipa._g2p = G2p()
    text = g2p(text)
    text = fix_g2pk2_error(text)
    text = korean_to_lazy_ipa(text)
    return text.replace('ʧ','tʃ').replace('ʥ','dʑ')
text/symbols.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
# Symbol set used with korean_cleaners
_pad = '_'
_punctuation = ',.!?…~'
_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '


# Export all symbols: padding first, then punctuation, then letters.
symbols = [_pad, *_punctuation, *_letters]
# Special symbol ids
SPACE_ID = symbols.index(' ')