Upload 8 files

Browse files

Files changed (8) hide show

text/__init__.py +56 -0
text/__pycache__/__init__.cpython-38.pyc +0 -0
text/__pycache__/cleaners.cpython-38.pyc +0 -0
text/__pycache__/korean.cpython-38.pyc +0 -0
text/__pycache__/symbols.cpython-38.pyc +0 -0
text/cleaners.py +14 -0
text/korean.py +231 -0
text/symbols.py +10 -0

text/__init__.py ADDED Viewed

	@@ -0,0 +1,56 @@

+""" from https://github.com/keithito/tacotron """
+from text import cleaners
+from text.symbols import symbols
+# Mappings from symbol to numeric ID and vice versa:
+_symbol_to_id = {s: i for i, s in enumerate(symbols)}
+_id_to_symbol = {i: s for i, s in enumerate(symbols)}
+def text_to_sequence(text, cleaner_names):
+  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+      cleaner_names: names of the cleaner functions to run the text through
+    Returns:
+      List of integers corresponding to the symbols in the text
+  '''
+  sequence = []
+  clean_text = _clean_text(text, cleaner_names)
+  for symbol in clean_text:
+    if symbol not in _symbol_to_id.keys():
+      continue
+    symbol_id = _symbol_to_id[symbol]
+    sequence += [symbol_id]
+  return sequence
+def cleaned_text_to_sequence(cleaned_text):
+  '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
+    Args:
+      text: string to convert to a sequence
+    Returns:
+      List of integers corresponding to the symbols in the text
+  '''
+  sequence = [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id.keys()]
+  return sequence
+def sequence_to_text(sequence):
+  '''Converts a sequence of IDs back to a string'''
+  result = ''
+  for symbol_id in sequence:
+    s = _id_to_symbol[symbol_id]
+    result += s
+  return result
+def _clean_text(text, cleaner_names):
+  for name in cleaner_names:
+    cleaner = getattr(cleaners, name)
+    if not cleaner:
+      raise Exception('Unknown cleaner: %s' % name)
+    text = cleaner(text)
+  return text

text/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (2.14 kB). View file

text/__pycache__/cleaners.cpython-38.pyc ADDED Viewed

Binary file (560 Bytes). View file

text/__pycache__/korean.cpython-38.pyc ADDED Viewed

Binary file (5.99 kB). View file

text/__pycache__/symbols.cpython-38.pyc ADDED Viewed

Binary file (362 Bytes). View file

text/cleaners.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import re
+from unidecode import unidecode
+from text.korean import latin_to_hangul, divide_hangul, fix_g2pk2_error
+from g2pk2 import G2p
+def korean_cleaners(text):
+    text = latin_to_hangul(text)
+    g2p = G2p()
+    text = g2p(text)
+    text = divide_hangul(text)
+    text = fix_g2pk2_error(text)
+    text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
+    return text

text/korean.py ADDED Viewed

	@@ -0,0 +1,231 @@

+import re
+from jamo import h2j, j2hcj
+import ko_pron
+from g2pk2 import G2p
+# This is a list of Korean classifiers preceded by pure Korean numerals.
+_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
+# List of (hangul, hangul divided) pairs:
+_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
+    # ('ㄳ', 'ㄱㅅ'),   # g2pk2, A Syllable-ending Rule
+    # ('ㄵ', 'ㄴㅈ'),
+    # ('ㄶ', 'ㄴㅎ'),
+    # ('ㄺ', 'ㄹㄱ'),
+    # ('ㄻ', 'ㄹㅁ'),
+    # ('ㄼ', 'ㄹㅂ'),
+    # ('ㄽ', 'ㄹㅅ'),
+    # ('ㄾ', 'ㄹㅌ'),
+    # ('ㄿ', 'ㄹㅍ'),
+    # ('ㅀ', 'ㄹㅎ'),
+    # ('ㅄ', 'ㅂㅅ'),
+    ('ㅘ', 'ㅗㅏ'),
+    ('ㅙ', 'ㅗㅐ'),
+    ('ㅚ', 'ㅗㅣ'),
+    ('ㅝ', 'ㅜㅓ'),
+    ('ㅞ', 'ㅜㅔ'),
+    ('ㅟ', 'ㅜㅣ'),
+    ('ㅢ', 'ㅡㅣ'),
+    ('ㅑ', 'ㅣㅏ'),
+    ('ㅒ', 'ㅣㅐ'),
+    ('ㅕ', 'ㅣㅓ'),
+    ('ㅖ', 'ㅣㅔ'),
+    ('ㅛ', 'ㅣㅗ'),
+    ('ㅠ', 'ㅣㅜ')
+]]
+# List of (Latin alphabet, hangul) pairs:
+_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('a', '에이'),
+    ('b', '비'),
+    ('c', '시'),
+    ('d', '디'),
+    ('e', '이'),
+    ('f', '에프'),
+    ('g', '지'),
+    ('h', '에이치'),
+    ('i', '아이'),
+    ('j', '제이'),
+    ('k', '케이'),
+    ('l', '엘'),
+    ('m', '엠'),
+    ('n', '엔'),
+    ('o', '오'),
+    ('p', '피'),
+    ('q', '큐'),
+    ('r', '아르'),
+    ('s', '에스'),
+    ('t', '티'),
+    ('u', '유'),
+    ('v', '브이'),
+    ('w', '더블유'),
+    ('x', '엑스'),
+    ('y', '와이'),
+    ('z', '제트')
+]]
+# List of (ipa, lazy ipa) pairs:
+_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('t͡ɕ','ʧ'),
+    ('d͡ʑ','ʥ'),
+    ('ɲ','n^'),
+    ('ɕ','ʃ'),
+    ('ʷ','w'),
+    ('ɭ','l`'),
+    ('ʎ','ɾ'),
+    ('ɣ','ŋ'),
+    ('ɰ','ɯ'),
+    ('ʝ','j'),
+    ('ʌ','ə'),
+    ('ɡ','g'),
+    ('\u031a','#'),
+    ('\u0348','='),
+    ('\u031e',''),
+    ('\u0320',''),
+    ('\u0339','')
+]]
+def fix_g2pk2_error(text):
+    new_text = ""
+    i = 0
+    while i < len(text) - 4:
+        if (text[i:i+3] == 'ㅇㅡㄹ' or text[i:i+3] == 'ㄹㅡㄹ') and text[i+3] == ' ' and text[i+4] == 'ㄹ':
+            new_text += text[i:i+3] + ' ' + 'ㄴ'
+            i += 5
+        else:
+            new_text += text[i]
+            i += 1
+    new_text += text[i:]
+    return new_text
+def latin_to_hangul(text):
+    for regex, replacement in _latin_to_hangul:
+        text = re.sub(regex, replacement, text)
+    return text
+def divide_hangul(text):
+    text = j2hcj(h2j(text))
+    for regex, replacement in _hangul_divided:
+        text = re.sub(regex, replacement, text)
+    return text
+def hangul_number(num, sino=True):
+    '''Spell out a string of digits in hangul.
+    Reference https://github.com/Kyubyong/g2pK
+    Args:
+      num: string of digits; any commas in it are removed first
+      sino: when True the ones and tens places use the 일/이/삼... names;
+        when False they use the 한/두/세... and 열/스물/서른... forms
+    Returns:
+      The per-digit spellings concatenated into one string
+    NOTE(review): only place values 0-15 are assigned a name below; a
+    non-zero digit at a higher place appears to leave `name` unassigned.
+    '''
+    num = re.sub(',', '', num)
+    # Special cases returned directly.
+    if num == '0':
+        return '영'
+    if not sino and num == '20':
+        return '스무'
+    digits = '123456789'
+    names = '일이삼사오육칠팔구'
+    digit2name = {d: n for d, n in zip(digits, names)}
+    modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
+    decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
+    digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
+    digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
+    spelledout = []
+    for i, digit in enumerate(num):
+        # Re-purpose i as the place index counted from the right (0 = ones).
+        i = len(num) - i - 1
+        # Ones and tens are the only places that differ between the two modes.
+        if sino:
+            if i == 0:
+                name = digit2name.get(digit, '')
+            elif i == 1:
+                name = digit2name.get(digit, '') + '십'
+                name = name.replace('일십', '십')
+        else:
+            if i == 0:
+                name = digit2mod.get(digit, '')
+            elif i == 1:
+                name = digit2dec.get(digit, '')
+        if digit == '0':
+            if i % 4 == 0:
+                # At a four-digit group boundary a zero is emitted as '' only
+                # when the (up to) three preceding outputs are all empty;
+                # otherwise control falls through to the unit chain below.
+                last_three = spelledout[-min(3, len(spelledout)):]
+                if ''.join(last_three) == '':
+                    spelledout.append('')
+                    continue
+            else:
+                # Any other zero contributes nothing.
+                spelledout.append('')
+                continue
+        # Places 2-15 take the digit name plus a unit in both modes; for
+        # places 2-7 a leading 일 is dropped (e.g. 일백 -> 백).
+        if i == 2:
+            name = digit2name.get(digit, '') + '백'
+            name = name.replace('일백', '백')
+        elif i == 3:
+            name = digit2name.get(digit, '') + '천'
+            name = name.replace('일천', '천')
+        elif i == 4:
+            name = digit2name.get(digit, '') + '만'
+            name = name.replace('일만', '만')
+        elif i == 5:
+            name = digit2name.get(digit, '') + '십'
+            name = name.replace('일십', '십')
+        elif i == 6:
+            name = digit2name.get(digit, '') + '백'
+            name = name.replace('일백', '백')
+        elif i == 7:
+            name = digit2name.get(digit, '') + '천'
+            name = name.replace('일천', '천')
+        elif i == 8:
+            name = digit2name.get(digit, '') + '억'
+        elif i == 9:
+            name = digit2name.get(digit, '') + '십'
+        elif i == 10:
+            name = digit2name.get(digit, '') + '백'
+        elif i == 11:
+            name = digit2name.get(digit, '') + '천'
+        elif i == 12:
+            name = digit2name.get(digit, '') + '조'
+        elif i == 13:
+            name = digit2name.get(digit, '') + '십'
+        elif i == 14:
+            name = digit2name.get(digit, '') + '백'
+        elif i == 15:
+            name = digit2name.get(digit, '') + '천'
+        spelledout.append(name)
+    return ''.join(elem for elem in spelledout)
+def number_to_hangul(text):
+    '''Reference https://github.com/Kyubyong/g2pK'''
+    tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
+    for token in tokens:
+        num, classifier = token
+        if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
+            spelledout = hangul_number(num, sino=False)
+        else:
+            spelledout = hangul_number(num, sino=True)
+        text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
+    # digit by digit for remaining digits
+    digits = '0123456789'
+    names = '영일이삼사오육칠팔구'
+    for d, n in zip(digits, names):
+        text = text.replace(d, n)
+    return text
+def korean_to_lazy_ipa(text):
+    text = latin_to_hangul(text)
+    text = number_to_hangul(text)
+    text=re.sub('[\uac00-\ud7af]+',lambda x:ko_pron.romanise(x.group(0),'ipa').split('] ~ [')[0],text)
+    for regex, replacement in _ipa_to_lazy_ipa:
+        text = re.sub(regex, replacement, text)
+    return text
+def korean_to_ipa(text):
+    text = latin_to_hangul(text)
+    text = number_to_hangul(text)
+    g2p = G2p()
+    text = g2p(text)
+    text = fix_g2pk2_error(text)
+    text = korean_to_lazy_ipa(text)
+    return text.replace('ʧ','tʃ').replace('ʥ','dʑ')

text/symbols.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# korean_cleaners
+_pad        = '_'
+_punctuation = ',.!?…~'
+_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
+# Export all symbols:
+symbols = [_pad] + list(_punctuation) + list(_letters)
+# Special symbol ids
+SPACE_ID = symbols.index(' ')