# --- Residue from the Hugging Face file viewer (not part of the module) ---
# ORI-Muchim's picture
# Upload 8 files
# d93648d
# raw
# history blame
# 6.88 kB
import re
from jamo import h2j, j2hcj
import ko_pron
from g2pk2 import G2p
# This is a list of Korean classifiers preceded by pure Korean numerals.
_korean_classifiers = '๊ตฐ๋ฐ ๊ถŒ ๊ฐœ ๊ทธ๋ฃจ ๋‹ข ๋Œ€ ๋‘ ๋งˆ๋ฆฌ ๋ชจ ๋ชจ๊ธˆ ๋ญ‡ ๋ฐœ ๋ฐœ์ง ๋ฐฉ ๋ฒˆ ๋ฒŒ ๋ณด๋ฃจ ์‚ด ์ˆ˜ ์ˆ  ์‹œ ์Œˆ ์›€ํผ ์ • ์ง ์ฑ„ ์ฒ™ ์ฒฉ ์ถ• ์ผค๋ ˆ ํ†จ ํ†ต'
# List of (hangul, hangul divided) pairs:
_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
# ('ใ„ณ', 'ใ„ฑใ……'), # g2pk2, A Syllable-ending Rule
# ('ใ„ต', 'ใ„ดใ…ˆ'),
# ('ใ„ถ', 'ใ„ดใ…Ž'),
# ('ใ„บ', 'ใ„นใ„ฑ'),
# ('ใ„ป', 'ใ„นใ…'),
# ('ใ„ผ', 'ใ„นใ…‚'),
# ('ใ„ฝ', 'ใ„นใ……'),
# ('ใ„พ', 'ใ„นใ…Œ'),
# ('ใ„ฟ', 'ใ„นใ…'),
# ('ใ…€', 'ใ„นใ…Ž'),
# ('ใ…„', 'ใ…‚ใ……'),
('ใ…˜', 'ใ…—ใ…'),
('ใ…™', 'ใ…—ใ…'),
('ใ…š', 'ใ…—ใ…ฃ'),
('ใ…', 'ใ…œใ…“'),
('ใ…ž', 'ใ…œใ…”'),
('ใ…Ÿ', 'ใ…œใ…ฃ'),
('ใ…ข', 'ใ…กใ…ฃ'),
('ใ…‘', 'ใ…ฃใ…'),
('ใ…’', 'ใ…ฃใ…'),
('ใ…•', 'ใ…ฃใ…“'),
('ใ…–', 'ใ…ฃใ…”'),
('ใ…›', 'ใ…ฃใ…—'),
('ใ… ', 'ใ…ฃใ…œ')
]]
# List of (Latin alphabet, hangul) pairs:
_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
('a', '์—์ด'),
('b', '๋น„'),
('c', '์‹œ'),
('d', '๋””'),
('e', '์ด'),
('f', '์—ํ”„'),
('g', '์ง€'),
('h', '์—์ด์น˜'),
('i', '์•„์ด'),
('j', '์ œ์ด'),
('k', '์ผ€์ด'),
('l', '์—˜'),
('m', '์— '),
('n', '์—”'),
('o', '์˜ค'),
('p', 'ํ”ผ'),
('q', 'ํ'),
('r', '์•„๋ฅด'),
('s', '์—์Šค'),
('t', 'ํ‹ฐ'),
('u', '์œ '),
('v', '๋ธŒ์ด'),
('w', '๋”๋ธ”์œ '),
('x', '์—‘์Šค'),
('y', '์™€์ด'),
('z', '์ œํŠธ')
]]
# List of (ipa, lazy ipa) pairs:
_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
('tอกษ•','สง'),
('dอกส‘','สฅ'),
('ษฒ','n^'),
('ษ•','สƒ'),
('สท','w'),
('ษญ','l`'),
('สŽ','ษพ'),
('ษฃ','ล‹'),
('ษฐ','ษฏ'),
('ส','j'),
('สŒ','ษ™'),
('ษก','g'),
('\u031a','#'),
('\u0348','='),
('\u031e',''),
('\u0320',''),
('\u0339','')
]]
def fix_g2pk2_error(text):
    '''Replace 'ㄹ' with 'ㄴ' when it follows 'ㅇㅡㄹ ' or 'ㄹㅡㄹ '.

    Every non-overlapping occurrence of the five-character sequence
    ('ㅇ' or 'ㄹ') + 'ㅡ' + 'ㄹ' + space + 'ㄹ' has its final 'ㄹ' rewritten
    to 'ㄴ'; everything else is returned unchanged. Only the compatibility
    jamo code points written here are matched, so text made of precomposed
    Hangul syllables passes through untouched.

    NOTE(review): judging by the name this compensates for a g2pk2 output
    quirk -- confirm against the g2pk2 version in use.

    Args:
        text: Input string.

    Returns:
        The string with the matching sequences rewritten.
    '''
    # A single left-to-right, non-overlapping pass: the same scan order as a
    # manual index loop, without building the result one character at a time.
    return re.sub('([ㅇㄹ]ㅡㄹ) ㄹ', r'\1 ㄴ', text)
def latin_to_hangul(text):
    '''Spell out Latin letters in ``text`` with their Hangul letter names.

    Applies every (pattern, hangul) pair of ``_latin_to_hangul`` in table
    order; the patterns are compiled case-insensitively, so upper- and
    lower-case letters are both replaced.

    Args:
        text: Input string.

    Returns:
        The string after all substitutions.
    '''
    for pattern, hangul in _latin_to_hangul:
        text = pattern.sub(hangul, text)
    return text
def divide_hangul(text):
    '''Decompose Hangul in ``text`` and split compound jamo.

    The text is first passed through ``jamo.h2j`` and ``jamo.j2hcj``
    (presumably syllables -> jamo -> compatibility jamo; see the jamo
    package documentation), then every (pattern, replacement) pair of
    ``_hangul_divided`` is applied in table order.

    Args:
        text: Input string.

    Returns:
        The decomposed string.
    '''
    jamo_text = j2hcj(h2j(text))
    for pattern, parts in _hangul_divided:
        jamo_text = pattern.sub(parts, jamo_text)
    return jamo_text
def hangul_number(num, sino=True):
    '''Spell out a string of decimal digits in Hangul.

    Reference https://github.com/Kyubyong/g2pK

    Args:
        num: The number as a string of digits; commas are removed first.
        sino: If True, use Sino-Korean numerals throughout. If False, the
            ones and tens places use the native Korean forms ('한', '두',
            ... / '열', '스물', ...); places from the hundreds upward always
            use Sino-Korean names.

    Returns:
        The spelled-out number. '0' gives '영' and a native '20' gives
        '스무'. Numbers with more than 16 significant digits are beyond the
        place-name table and are read digit by digit instead.
    '''
    num = re.sub(',', '', num)

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    digits = '123456789'
    digit2name = dict(zip(digits, '일이삼사오육칠팔구'))
    digit2mod = dict(zip(digits, '한 두 세 네 다섯 여섯 일곱 여덟 아홉'.split()))
    digit2dec = dict(zip(digits, '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'.split()))

    # Unit name appended at each decimal place from 10**2 to 10**15.
    place_suffix = {
        2: '백', 3: '천', 4: '만', 5: '십', 6: '백', 7: '천', 8: '억',
        9: '십', 10: '백', 11: '천', 12: '조', 13: '십', 14: '백', 15: '천',
    }
    # Places at which a leading '일' is dropped ('일백' -> '백'). From 10**8
    # upward the '일' is kept, e.g. '일억'.
    drops_leading_one = {2, 3, 4, 5, 6, 7}

    # The table stops at 10**15. A non-zero digit above that place used to
    # raise UnboundLocalError, so such numbers are read digit by digit.
    if len(num.lstrip('0')) > 16:
        single = dict(digit2name)
        single['0'] = '영'
        return ''.join(single.get(digit, '') for digit in num)

    spelledout = []
    for offset, digit in enumerate(num):
        place = len(num) - offset - 1  # power of ten of this digit
        name = ''

        if place == 0:
            name = (digit2name if sino else digit2mod).get(digit, '')
        elif place == 1:
            if sino:
                name = (digit2name.get(digit, '') + '십').replace('일십', '십')
            else:
                name = digit2dec.get(digit, '')

        if digit == '0':
            # A zero is silent, except that at a group boundary (10**4,
            # 10**8, ...) the group name is still emitted when one of the
            # three preceding places produced output (e.g. '십' + '만').
            if place % 4 != 0 or ''.join(spelledout[-3:]) == '':
                spelledout.append('')
                continue

        suffix = place_suffix.get(place)
        if suffix is not None:
            name = digit2name.get(digit, '') + suffix
            if place in drops_leading_one:
                name = name.replace('일' + suffix, suffix)

        spelledout.append(name)

    return ''.join(spelledout)
def number_to_hangul(text):
    '''Replace the digits in ``text`` with their Hangul reading.

    Reference https://github.com/Kyubyong/g2pK

    A run of digits (commas allowed) immediately followed by Hangul is
    spelled out as one number: with native Korean numerals when the first
    one or two syllables of the following word are listed in
    ``_korean_classifiers``, otherwise with Sino-Korean numerals. Any
    digits left over are read one at a time.

    Args:
        text: Input string.

    Returns:
        The string with ASCII digits replaced by Hangul.
    '''
    # Exact-word lookup: a substring test against the space-joined string
    # would also accept fragments such as the second syllable of '마리'.
    native_counters = set(_korean_classifiers.split())

    def spell(match):
        num, classifier = match.groups()
        use_native = (classifier[:2] in native_counters
                      or classifier[0] in native_counters)
        return hangul_number(num, sino=not use_native) + classifier

    # One left-to-right pass replaces each match in place, so a short token
    # ('5개') can no longer be rewritten inside a longer one ('15개') and the
    # result does not depend on set iteration order. The range covers the
    # whole Hangul Syllables block, as in korean_to_lazy_ipa.
    text = re.sub(r'(\d[\d,]*)([\uac00-\ud7af]+)', spell, text)

    # digit by digit for remaining digits
    return text.translate(str.maketrans('0123456789', '영일이삼사오육칠팔구'))
def korean_to_lazy_ipa(text):
    '''Convert Korean text to the simplified ("lazy") IPA notation.

    Latin letters and digits are first spelled out in Hangul. Each run of
    Hangul syllables is then romanised with ``ko_pron.romanise(..., 'ipa')``
    and finally the ``_ipa_to_lazy_ipa`` substitutions are applied in
    table order.

    Args:
        text: Input string.

    Returns:
        The converted string.
    '''
    def _first_reading(match):
        # Keep only what precedes the first '] ~ [' separator (presumably
        # the first of several alternative pronunciations -- confirm
        # against ko_pron).
        return ko_pron.romanise(match.group(0), 'ipa').split('] ~ [')[0]

    hangul = number_to_hangul(latin_to_hangul(text))
    ipa = re.sub('[\uac00-\ud7af]+', _first_reading, hangul)
    for pattern, lazy in _ipa_to_lazy_ipa:
        ipa = pattern.sub(lazy, ipa)
    return ipa
# Shared g2pk2 converter, created lazily by _get_g2p().
_g2p_instance = None


def _get_g2p():
    '''Return the shared ``G2p`` converter, creating it on first use.'''
    global _g2p_instance
    if _g2p_instance is None:
        _g2p_instance = G2p()
    return _g2p_instance


def korean_to_ipa(text):
    '''Convert Korean text to IPA.

    Pipeline: spell out Latin letters and digits in Hangul, run the g2pk2
    converter, apply ``fix_g2pk2_error``, convert to lazy IPA with
    ``korean_to_lazy_ipa``, then expand the single-glyph affricates
    'ʧ' and 'ʥ' to 'tʃ' and 'dʑ'.

    Args:
        text: Input string.

    Returns:
        The IPA transcription.
    '''
    text = latin_to_hangul(text)
    text = number_to_hangul(text)
    # Reuse one converter instead of constructing G2p() on every call.
    text = _get_g2p()(text)
    text = fix_g2pk2_error(text)
    text = korean_to_lazy_ipa(text)
    return text.replace('ʧ', 'tʃ').replace('ʥ', 'dʑ')