|
import re
|
|
from jamo import h2j, j2hcj
|
|
import ko_pron
|
|
|
|
|
|
|
|
# Space-separated counter words (classifiers). A number directly followed by
# one of these is read with native-Korean numerals instead of Sino-Korean.
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
|
|
|
|
|
|
# (compiled pattern, replacement) pairs that split a compound compatibility
# jamo (consonant cluster or diphthong) into its two component jamo.
_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('ㄳ', 'ㄱㅅ'),
    ('ㄵ', 'ㄴㅈ'),
    ('ㄶ', 'ㄴㅎ'),
    ('ㄺ', 'ㄹㄱ'),
    ('ㄻ', 'ㄹㅁ'),
    ('ㄼ', 'ㄹㅂ'),
    ('ㄽ', 'ㄹㅅ'),
    ('ㄾ', 'ㄹㅌ'),
    ('ㄿ', 'ㄹㅍ'),
    ('ㅀ', 'ㄹㅎ'),
    ('ㅄ', 'ㅂㅅ'),
    ('ㅘ', 'ㅗㅏ'),
    ('ㅙ', 'ㅗㅐ'),
    ('ㅚ', 'ㅗㅣ'),
    ('ㅝ', 'ㅜㅓ'),
    ('ㅞ', 'ㅜㅔ'),
    ('ㅟ', 'ㅜㅣ'),
    ('ㅢ', 'ㅡㅣ'),
    ('ㅑ', 'ㅣㅏ'),
    ('ㅒ', 'ㅣㅐ'),
    ('ㅕ', 'ㅣㅓ'),
    ('ㅖ', 'ㅣㅔ'),
    ('ㅛ', 'ㅣㅗ'),
    ('ㅠ', 'ㅣㅜ')
]]
|
|
|
|
|
|
# (compiled case-insensitive pattern, replacement) pairs that spell each Latin
# letter with the Hangul transcription of its letter name.
_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
    ('a', '에이'),
    ('b', '비'),
    ('c', '시'),
    ('d', '디'),
    ('e', '이'),
    ('f', '에프'),
    ('g', '지'),
    ('h', '에이치'),
    ('i', '아이'),
    ('j', '제이'),
    ('k', '케이'),
    ('l', '엘'),
    ('m', '엠'),
    ('n', '엔'),
    ('o', '오'),
    ('p', '피'),
    ('q', '큐'),
    ('r', '아르'),
    ('s', '에스'),
    ('t', '티'),
    ('u', '유'),
    ('v', '브이'),
    ('w', '더블유'),
    ('x', '엑스'),
    ('y', '와이'),
    ('z', '제트')
]]
|
|
|
|
|
|
# (compiled case-insensitive pattern, replacement) pairs that simplify IPA
# symbols to the reduced "lazy IPA" symbol set. Combining marks are written
# as escapes so they stay visible: \u0361 is the tie bar; the last five
# entries map or strip diacritics.
_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
    ('t\u0361ɕ', 'ʧ'),
    ('d\u0361ʑ', 'ʥ'),
    ('ɲ', 'n^'),
    ('ɕ', 'ʃ'),
    ('ʷ', 'w'),
    ('ɭ', 'l`'),
    ('ʎ', 'ɾ'),
    ('ɣ', 'ŋ'),
    ('ɰ', 'ɯ'),
    ('ʝ', 'j'),
    ('ʌ', 'ə'),
    ('ɡ', 'g'),
    ('\u031a', '#'),
    ('\u0348', '='),
    ('\u031e', ''),
    ('\u0320', ''),
    ('\u0339', '')
]]
|
|
|
|
|
|
def latin_to_hangul(text):
    '''Return `text` with every substitution in `_latin_to_hangul` applied.

    The (pattern, replacement) pairs are applied one after another, in table
    order, each one operating on the output of the previous one.
    '''
    converted = text
    for pattern, hangul in _latin_to_hangul:
        converted = pattern.sub(hangul, converted)
    return converted
|
|
|
|
|
|
def divide_hangul(text):
    '''Break Hangul text into individual simple jamo.

    The text is first passed through jamo's `h2j` and then `j2hcj`
    (presumably: syllables -> jamo -> compatibility jamo), after which every
    substitution in `_hangul_divided` is applied in table order.
    '''
    jamo_text = j2hcj(h2j(text))
    for pattern, parts in _hangul_divided:
        jamo_text = pattern.sub(parts, jamo_text)
    return jamo_text
|
|
|
|
|
|
def hangul_number(num, sino=True):
    '''Spell out a string of decimal digits in Hangul.

    Reference https://github.com/Kyubyong/g2pK

    Args:
        num: The number as a string of ASCII digits. Commas (thousands
            separators) are ignored. Characters other than 1-9 contribute
            nothing to the reading.
        sino: If true (the default) every place is read with Sino-Korean
            numerals. If false, the ones and tens places use the
            native-Korean modifier forms; hundreds and above are still
            Sino-Korean.

    Returns:
        The Hangul reading. '0' gives '영' and, for native reading, '20'
        gives '스무'. Numbers with more than 16 significant digits cannot be
        named with the units up to 조 and are read digit by digit instead.
    '''
    num = re.sub(',', '', num)

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    digits = '123456789'
    digit2name = dict(zip(digits, '일이삼사오육칠팔구'))
    digit2mod = dict(zip(digits, '한 두 세 네 다섯 여섯 일곱 여덟 아홉'.split()))
    digit2dec = dict(zip(digits, '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'.split()))

    small_units = ('', '십', '백', '천')   # place within a 4-digit group
    large_units = ('', '만', '억', '조')   # one per 4-digit group

    if len(num.lstrip('0')) > 4 * len(large_units):
        # Too long for the named units: read the digits one at a time rather
        # than failing on an unnamed place.
        return ''.join('영' if d == '0' else digit2name.get(d, '')
                       for d in num)

    spelledout = []
    # True once a non-zero digit was seen in a higher place of the current
    # 4-digit group; decides whether the group's 만/억/조 must be spoken.
    group_has_value = False
    for offset, digit in enumerate(num):
        place = len(num) - offset - 1          # 0 is the ones place
        large, small = divmod(place, 4)
        name = digit2name.get(digit)

        if name is None:
            # Zero: silent, except that a group's ones place still carries
            # the group unit when the group itself is non-zero (e.g. 10만).
            if small == 0 and large > 0 and group_has_value:
                spelledout.append(large_units[large])
        elif not sino and place == 0:
            spelledout.append(digit2mod[digit])
        elif not sino and place == 1:
            spelledout.append(digit2dec[digit])
        elif small > 0:
            # 일 is dropped before 십/백/천 in every group (십, 백만, 천억).
            spelledout.append(('' if digit == '1' else name)
                              + small_units[small])
        elif large == 1 and digit == '1' and not group_has_value:
            # Exactly 1만 is read as plain 만; 21만 must keep its 일.
            spelledout.append('만')
        else:
            spelledout.append(name + large_units[large])

        if small == 0:
            group_has_value = False            # next digit opens a new group
        elif name is not None:
            group_has_value = True

    return ''.join(spelledout)
|
|
|
|
|
|
def number_to_hangul(text):
    '''Replace the digits in `text` with their Hangul reading.

    Reference https://github.com/Kyubyong/g2pK

    A number (digits, optionally with commas) that is immediately followed by
    Hangul is spelled out as a whole with `hangul_number`. It is read with
    native-Korean numerals when the following word starts with one of the
    classifiers in `_korean_classifiers` (matching its first two syllables or
    its first syllable as a whole classifier), otherwise with Sino-Korean
    numerals. Any digit left after that is read one digit at a time.

    Args:
        text: The input string.

    Returns:
        The string with all ASCII digits replaced by Hangul.
    '''
    # Compare against whole classifiers. A substring test on the raw
    # space-separated string would also accept fragments such as the '리' of
    # '마리', so that e.g. '2리터' would wrongly get a native numeral.
    classifiers = frozenset(_korean_classifiers.split())

    def spell(match):
        num, following = match.groups()
        native = following[:2] in classifiers or following[0] in classifiers
        return hangul_number(num, sino=not native) + following

    # One left-to-right pass: each number is rewritten exactly where it
    # occurs, so the result cannot depend on the order in which tokens are
    # processed, and a short number is never replaced inside a longer one.
    text = re.sub(r'(\d[\d,]*)([\uac00-\ud7af]+)', spell, text)

    # Remaining digits (not directly followed by Hangul): digit by digit.
    return text.translate(str.maketrans('0123456789', '영일이삼사오육칠팔구'))
|
|
|
|
|
|
def korean_to_lazy_ipa(text):
    '''Convert Korean text to the simplified ("lazy") IPA notation.

    Latin letters and numbers are first turned into Hangul, every run of
    Hangul syllables is romanised to IPA with `ko_pron`, and finally the
    substitutions in `_ipa_to_lazy_ipa` are applied in table order.
    '''
    def first_reading(match):
        # Only the part before the first '] ~ [' is kept (presumably ko_pron
        # uses it to separate alternative pronunciations).
        ipa = ko_pron.romanise(match.group(0), 'ipa')
        return ipa.split('] ~ [')[0]

    hangul_only = number_to_hangul(latin_to_hangul(text))
    converted = re.sub('[\uac00-\ud7af]+', first_reading, hangul_only)
    for pattern, substitute in _ipa_to_lazy_ipa:
        converted = pattern.sub(substitute, converted)
    return converted
|
|
|
|
|
|
def korean_to_ipa(text):
    '''Convert Korean text to IPA.

    Runs `korean_to_lazy_ipa` and then expands the two single-character
    affricate ligatures it produces back into two-character sequences:
    'ʧ' becomes 'tʃ' and 'ʥ' becomes 'dʑ'.
    '''
    text = korean_to_lazy_ipa(text)
    return text.replace('ʧ', 'tʃ').replace('ʥ', 'dʑ')
|
|
|