Spaces:

Mahiruoshi
/

Lovelive_Nijigasaki_VITS

Running

App Files Files Community

Mahiruoshi committed on Jan 4, 2023

Commit

6c035be

•

1 Parent(s): f0ca36a

Upload 25 files

Browse files

Files changed (16) hide show

text/__pycache__/__init__.cpython-311.pyc +0 -0
text/__pycache__/__init__.cpython-38.pyc +0 -0
text/__pycache__/cleaners.cpython-311.pyc +0 -0
text/__pycache__/cleaners.cpython-38.pyc +0 -0
text/__pycache__/japanese.cpython-311.pyc +0 -0
text/__pycache__/japanese.cpython-38.pyc +0 -0
text/__pycache__/mandarin.cpython-38.pyc +0 -0
text/__pycache__/symbols.cpython-38.pyc +0 -0
text/cantonese.py +59 -0
text/cleaners.py +51 -99
text/korean.py +210 -0
text/mandarin.py +5 -7
text/ngu_dialect.py +30 -0
text/shanghainese.py +64 -0
text/symbols.py +12 -4
text/thai.py +44 -0

text/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (2.89 kB). View file

text/__pycache__/__init__.cpython-38.pyc CHANGED Viewed

Binary files a/text/__pycache__/__init__.cpython-38.pyc and b/text/__pycache__/__init__.cpython-38.pyc differ

text/__pycache__/cleaners.cpython-311.pyc ADDED Viewed

Binary file (13.1 kB). View file

text/__pycache__/cleaners.cpython-38.pyc CHANGED Viewed

Binary files a/text/__pycache__/cleaners.cpython-38.pyc and b/text/__pycache__/cleaners.cpython-38.pyc differ

text/__pycache__/japanese.cpython-311.pyc ADDED Viewed

Binary file (8.36 kB). View file

text/__pycache__/japanese.cpython-38.pyc CHANGED Viewed

Binary files a/text/__pycache__/japanese.cpython-38.pyc and b/text/__pycache__/japanese.cpython-38.pyc differ

text/__pycache__/mandarin.cpython-38.pyc CHANGED Viewed

Binary files a/text/__pycache__/mandarin.cpython-38.pyc and b/text/__pycache__/mandarin.cpython-38.pyc differ

text/__pycache__/symbols.cpython-38.pyc CHANGED Viewed

Binary files a/text/__pycache__/symbols.cpython-38.pyc and b/text/__pycache__/symbols.cpython-38.pyc differ

text/cantonese.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import re
+import cn2an
+import opencc
+# Module-level OpenCC converter built from the 'jyutjyu' configuration;
+# cantonese_to_ipa() runs its input through converter.convert().
+converter = opencc.OpenCC('jyutjyu')
+# (compiled single-letter pattern, IPA replacement) pairs, one per capital
+# letter A-Z, kept in alphabetical order:
+_latin_to_ipa = [(re.compile(letter), ipa) for letter, ipa in {
+    'A': 'ei˥',
+    'B': 'biː˥',
+    'C': 'siː˥',
+    'D': 'tiː˥',
+    'E': 'iː˥',
+    'F': 'e˥fuː˨˩',
+    'G': 'tsiː˥',
+    'H': 'ɪk̚˥tsʰyː˨˩',
+    'I': 'ɐi˥',
+    'J': 'tsei˥',
+    'K': 'kʰei˥',
+    'L': 'e˥llou˨˩',
+    'M': 'ɛːm˥',
+    'N': 'ɛːn˥',
+    'O': 'ou˥',
+    'P': 'pʰiː˥',
+    'Q': 'kʰiːu˥',
+    'R': 'aː˥lou˨˩',
+    'S': 'ɛː˥siː˨˩',
+    'T': 'tʰiː˥',
+    'U': 'juː˥',
+    'V': 'wiː˥',
+    'W': 'tʊk̚˥piː˥juː˥',
+    'X': 'ɪk̚˥siː˨˩',
+    'Y': 'waːi˥',
+    'Z': 'iː˨sɛːt̚˥',
+}.items()]
+def number_to_cantonese(text):
+    '''Replace each Arabic numeral in text with cn2an.an2cn()'s rendering of it.'''
+    def _spell(match):
+        return cn2an.an2cn(match.group())
+    return re.sub(r'\d+(?:\.?\d+)?', _spell, text)
+def latin_to_ipa(text):
+    '''Apply every (pattern, IPA) substitution of _latin_to_ipa to text, in table order.'''
+    for pattern, ipa in _latin_to_ipa:
+        text = pattern.sub(ipa, text)
+    return text
+def cantonese_to_ipa(text):
+    '''Convert text to the IPA-style string used by this module.
+
+    Steps, in order: upper-case the text and spell out numerals, run the
+    module-level OpenCC converter, drop '-' and turn '$' into spaces,
+    replace remaining capital Latin letters via latin_to_ipa(), then
+    normalise full-width punctuation to ASCII and strip trailing whitespace.
+    '''
+    text = number_to_cantonese(text.upper())
+    converted = converter.convert(text)
+    text = converted.replace('-', '').replace('$', ' ')
+    text = re.sub(r'[A-Z]', lambda m: latin_to_ipa(m.group()) + ' ', text)
+    text = re.sub(r'[、；：]', '，', text)
+    # Each full-width mark becomes its ASCII counterpart followed by one
+    # space, swallowing any whitespace around the original mark.
+    for mark, ascii_mark in (('，', ', '), ('。', '. '), ('？', '? '), ('！', '! ')):
+        text = re.sub(r'\s*%s\s*' % mark, ascii_mark, text)
+    return re.sub(r'\s*$', '', text)

text/cleaners.py CHANGED Viewed

@@ -1,19 +1,18 @@
 import re
-from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
 from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
 from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
-# from text.sanskrit import devanagari_to_ipa
-# from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
-# from text.thai import num_to_thai, latin_to_thai
-# from text.shanghainese import shanghainese_to_ipa
-# from text.cantonese import cantonese_to_ipa
-# from text.ngu_dialect import ngu_dialect_to_ipa
 def japanese_cleaners(text):
     text = japanese_to_romaji_with_accent(text)
-    if re.match('[A-Za-z]', text[-1]):
-        text += '.'
     return text
@@ -26,8 +25,7 @@ def korean_cleaners(text):
     text = latin_to_hangul(text)
     text = number_to_hangul(text)
     text = divide_hangul(text)
-    if re.match('[\u3131-\u3163]', text[-1]):
-        text += '.'
     return text
@@ -36,110 +34,67 @@ def chinese_cleaners(text):
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
-    if re.match('[ˉˊˇˋ˙]', text[-1]):
-        text += '。'
     return text
 def zh_ja_mixture_cleaners(text):
-    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
-    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
-    for chinese_text in chinese_texts:
-        cleaned_text = chinese_to_romaji(chinese_text[4:-4])
-        text = text.replace(chinese_text, cleaned_text+' ', 1)
-    for japanese_text in japanese_texts:
-        cleaned_text = japanese_to_romaji_with_accent(
-            japanese_text[4:-4]).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')
-        text = text.replace(japanese_text, cleaned_text+' ', 1)
-    text = text[:-1]
-    if re.match('[A-Za-zɯɹəɥ→↓↑]', text[-1]):
-        text += '.'
     return text
 def sanskrit_cleaners(text):
     text = text.replace('॥', '।').replace('ॐ', 'ओम्')
-    if text[-1] != '।':
-        text += ' ।'
     return text
 def cjks_cleaners(text):
-    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
-    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
-    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
-    sanskrit_texts = re.findall(r'\[SA\].*?\[SA\]', text)
-    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
-    for chinese_text in chinese_texts:
-        cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
-        text = text.replace(chinese_text, cleaned_text+' ', 1)
-    for japanese_text in japanese_texts:
-        cleaned_text = japanese_to_ipa(japanese_text[4:-4])
-        text = text.replace(japanese_text, cleaned_text+' ', 1)
-    for korean_text in korean_texts:
-        cleaned_text = korean_to_lazy_ipa(korean_text[4:-4])
-        text = text.replace(korean_text, cleaned_text+' ', 1)
-    for sanskrit_text in sanskrit_texts:
-        cleaned_text = devanagari_to_ipa(sanskrit_text[4:-4])
-        text = text.replace(sanskrit_text, cleaned_text+' ', 1)
-    for english_text in english_texts:
-        cleaned_text = english_to_lazy_ipa(english_text[4:-4])
-        text = text.replace(english_text, cleaned_text+' ', 1)
-    text = text[:-1]
-    if re.match(r'[^\.,!\?\-…~]', text[-1]):
-        text += '.'
     return text
 def cjke_cleaners(text):
-    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
-    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
-    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
-    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
-    for chinese_text in chinese_texts:
-        cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
-        cleaned_text = cleaned_text.replace(
-            'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')
-        text = text.replace(chinese_text, cleaned_text+' ', 1)
-    for japanese_text in japanese_texts:
-        cleaned_text = japanese_to_ipa(japanese_text[4:-4])
-        cleaned_text = cleaned_text.replace('ʧ', 'tʃ').replace(
-            'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
-        text = text.replace(japanese_text, cleaned_text+' ', 1)
-    for korean_text in korean_texts:
-        cleaned_text = korean_to_ipa(korean_text[4:-4])
-        text = text.replace(korean_text, cleaned_text+' ', 1)
-    for english_text in english_texts:
-        cleaned_text = english_to_ipa2(english_text[4:-4])
-        cleaned_text = cleaned_text.replace('ɑ', 'a').replace(
-            'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')
-        text = text.replace(english_text, cleaned_text+' ', 1)
-    text = text[:-1]
-    if re.match(r'[^\.,!\?\-…~]', text[-1]):
-        text += '.'
     return text
 def cjke_cleaners2(text):
-    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
-    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
-    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
-    english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
-    for chinese_text in chinese_texts:
-        cleaned_text = chinese_to_ipa(chinese_text[4:-4])
-        text = text.replace(chinese_text, cleaned_text+' ', 1)
-    for japanese_text in japanese_texts:
-        cleaned_text = japanese_to_ipa2(japanese_text[4:-4])
-        text = text.replace(japanese_text, cleaned_text+' ', 1)
-    for korean_text in korean_texts:
-        cleaned_text = korean_to_ipa(korean_text[4:-4])
-        text = text.replace(korean_text, cleaned_text+' ', 1)
-    for english_text in english_texts:
-        cleaned_text = english_to_ipa2(english_text[4:-4])
-        text = text.replace(english_text, cleaned_text+' ', 1)
-    text = text[:-1]
-    if re.match(r'[^\.,!\?\-…~]', text[-1]):
-        text += '.'
     return text
@@ -151,16 +106,13 @@ def thai_cleaners(text):
 def shanghainese_cleaners(text):
     text = shanghainese_to_ipa(text)
-    if re.match(r'[^\.,!\?\-…~]', text[-1]):
-        text += '.'
     return text
 def chinese_dialect_cleaners(text):
-    text = re.sub(r'\[MD\](.*?)\[MD\]',
                   lambda x: chinese_to_ipa2(x.group(1))+' ', text)
-    text = re.sub(r'\[TW\](.*?)\[TW\]',
-                  lambda x: chinese_to_ipa2(x.group(1), True)+' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
                   lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
     text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',

 import re
 from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
+from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa
 from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
+from text.sanskrit import devanagari_to_ipa
+from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
+from text.thai import num_to_thai, latin_to_thai
+from text.shanghainese import shanghainese_to_ipa
+from text.cantonese import cantonese_to_ipa
+from text.ngu_dialect import ngu_dialect_to_ipa
 def japanese_cleaners(text):
     text = japanese_to_romaji_with_accent(text)
+    text = re.sub(r'([A-Za-z])$', r'\1.', text)
     return text
     text = latin_to_hangul(text)
     text = number_to_hangul(text)
     text = divide_hangul(text)
+    text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
     return text
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
+    text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
     return text
 def zh_ja_mixture_cleaners(text):
+    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
+                  lambda x: chinese_to_romaji(x.group(1))+' ', text)
+    text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
+        x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
+    text = re.sub(r'\s+$', '', text)
+    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
 def sanskrit_cleaners(text):
     text = text.replace('॥', '।').replace('ॐ', 'ओम्')
+    text = re.sub(r'([^।])$', r'\1।', text)
     return text
 def cjks_cleaners(text):
+    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
+                  lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
+    text = re.sub(r'\[JA\](.*?)\[JA\]',
+                  lambda x: japanese_to_ipa(x.group(1))+' ', text)
+    text = re.sub(r'\[KO\](.*?)\[KO\]',
+                  lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
+    text = re.sub(r'\[SA\](.*?)\[SA\]',
+                  lambda x: devanagari_to_ipa(x.group(1))+' ', text)
+    text = re.sub(r'\[EN\](.*?)\[EN\]',
+                  lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
+    text = re.sub(r'\s+$', '', text)
+    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
 def cjke_cleaners(text):
+    text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
+        'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
+    text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
+        'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
+    text = re.sub(r'\[KO\](.*?)\[KO\]',
+                  lambda x: korean_to_ipa(x.group(1))+' ', text)
+    text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
+        'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
+    text = re.sub(r'\s+$', '', text)
+    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
 def cjke_cleaners2(text):
+    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
+                  lambda x: chinese_to_ipa(x.group(1))+' ', text)
+    text = re.sub(r'\[JA\](.*?)\[JA\]',
+                  lambda x: japanese_to_ipa2(x.group(1))+' ', text)
+    text = re.sub(r'\[KO\](.*?)\[KO\]',
+                  lambda x: korean_to_ipa(x.group(1))+' ', text)
+    text = re.sub(r'\[EN\](.*?)\[EN\]',
+                  lambda x: english_to_ipa2(x.group(1))+' ', text)
+    text = re.sub(r'\s+$', '', text)
+    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
 def shanghainese_cleaners(text):
     text = shanghainese_to_ipa(text)
+    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
 def chinese_dialect_cleaners(text):
+    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                   lambda x: chinese_to_ipa2(x.group(1))+' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
                   lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
     text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',

text/korean.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import re
+from jamo import h2j, j2hcj
+import ko_pron
+# This is a list of Korean classifiers preceded by pure Korean numerals.
+# NOTE: stored as ONE space-separated string, not a list or set; split it
+# before doing exact-match lookups, because `x in <str>` is a substring test.
+_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
+# (compiled pattern, replacement) pairs that split one compound jamo into
+# the two jamo listed as its replacement:
+_hangul_divided = [(re.compile(compound), parts) for compound, parts in {
+    'ㄳ': 'ㄱㅅ',
+    'ㄵ': 'ㄴㅈ',
+    'ㄶ': 'ㄴㅎ',
+    'ㄺ': 'ㄹㄱ',
+    'ㄻ': 'ㄹㅁ',
+    'ㄼ': 'ㄹㅂ',
+    'ㄽ': 'ㄹㅅ',
+    'ㄾ': 'ㄹㅌ',
+    'ㄿ': 'ㄹㅍ',
+    'ㅀ': 'ㄹㅎ',
+    'ㅄ': 'ㅂㅅ',
+    'ㅘ': 'ㅗㅏ',
+    'ㅙ': 'ㅗㅐ',
+    'ㅚ': 'ㅗㅣ',
+    'ㅝ': 'ㅜㅓ',
+    'ㅞ': 'ㅜㅔ',
+    'ㅟ': 'ㅜㅣ',
+    'ㅢ': 'ㅡㅣ',
+    'ㅑ': 'ㅣㅏ',
+    'ㅒ': 'ㅣㅐ',
+    'ㅕ': 'ㅣㅓ',
+    'ㅖ': 'ㅣㅔ',
+    'ㅛ': 'ㅣㅗ',
+    'ㅠ': 'ㅣㅜ',
+}.items()]
+# (case-insensitive compiled pattern, hangul replacement) pairs, one per
+# Latin letter a-z:
+_latin_to_hangul = [(re.compile(letter, re.IGNORECASE), hangul) for letter, hangul in {
+    'a': '에이',
+    'b': '비',
+    'c': '시',
+    'd': '디',
+    'e': '이',
+    'f': '에프',
+    'g': '지',
+    'h': '에이치',
+    'i': '아이',
+    'j': '제이',
+    'k': '케이',
+    'l': '엘',
+    'm': '엠',
+    'n': '엔',
+    'o': '오',
+    'p': '피',
+    'q': '큐',
+    'r': '아르',
+    's': '에스',
+    't': '티',
+    'u': '유',
+    'v': '브이',
+    'w': '더블유',
+    'x': '엑스',
+    'y': '와이',
+    'z': '제트',
+}.items()]
+# (case-insensitive compiled pattern, "lazy IPA" replacement) pairs.
+# Order matters: the two-symbol affricates come before the bare 'ɕ' entry.
+_ipa_to_lazy_ipa = [(re.compile(ipa, re.IGNORECASE), lazy) for ipa, lazy in {
+    't͡ɕ': 'ʧ',
+    'd͡ʑ': 'ʥ',
+    'ɲ': 'n^',
+    'ɕ': 'ʃ',
+    'ʷ': 'w',
+    'ɭ': 'l`',
+    'ʎ': 'ɾ',
+    'ɣ': 'ŋ',
+    'ɰ': 'ɯ',
+    'ʝ': 'j',
+    'ʌ': 'ə',
+    'ɡ': 'g',
+    '\u031a': '#',
+    '\u0348': '=',
+    '\u031e': '',
+    '\u0320': '',
+    '\u0339': '',
+}.items()]
+def latin_to_hangul(text):
+    '''Apply every _latin_to_hangul substitution to text, in table order.'''
+    for pattern, hangul in _latin_to_hangul:
+        text = pattern.sub(hangul, text)
+    return text
+def divide_hangul(text):
+    '''Pass text through jamo's h2j() then j2hcj(), then apply every
+    _hangul_divided substitution in table order.'''
+    jamo_text = j2hcj(h2j(text))
+    for pattern, parts in _hangul_divided:
+        jamo_text = pattern.sub(parts, jamo_text)
+    return jamo_text
+def hangul_number(num, sino=True):
+    '''Spell out a string of decimal digits in hangul.
+
+    Reference https://github.com/Kyubyong/g2pK
+
+    num: digit string; ',' separators are removed before processing.
+    sino: when True the ones and tens places use the Sino-Korean table
+        (일, 이, ... / 십); when False they use the modifier/decimal
+        tables (한, 두, ... / 열, 스물, ...). Higher places always use the
+        Sino-Korean digit names plus a unit word.
+    Returns the spelled-out string. Inputs longer than 16 digits have no
+    unit word in the table below and are read digit by digit instead
+    (previously they raised UnboundLocalError).
+    '''
+    num = re.sub(',', '', num)
+    if num == '0':
+        return '영'
+    if not sino and num == '20':
+        return '스무'
+    if len(num) > 16:
+        # No unit word is defined past position 15, so fall back to the same
+        # digit-by-digit reading number_to_hangul() uses for leftover digits.
+        reading = dict(zip('0123456789', '영일이삼사오육칠팔구'))
+        return ''.join(reading.get(d, d) for d in num)
+    digits = '123456789'
+    names = '일이삼사오육칠팔구'
+    digit2name = dict(zip(digits, names))
+    modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
+    decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
+    digit2mod = dict(zip(digits, modifiers.split()))
+    digit2dec = dict(zip(digits, decimals.split()))
+    # units[i - 2] is the unit word for position i (counted from the right,
+    # ones place = 0), for i = 2 .. 15.
+    units = '백천만십백천억십백천조십백천'
+    spelledout = []
+    for pos, digit in enumerate(num):
+        i = len(num) - pos - 1
+        name = ''
+        if i == 0:
+            name = (digit2name if sino else digit2mod).get(digit, '')
+        elif i == 1:
+            if sino:
+                name = (digit2name.get(digit, '') + '십').replace('일십', '십')
+            else:
+                name = digit2dec.get(digit, '')
+        if digit == '0':
+            # A zero is silent, except at a 4-digit group boundary (i % 4 == 0)
+            # when one of the previous three places was spoken: then the
+            # group's unit word (e.g. 만) must still be emitted below.
+            if i % 4 != 0 or not ''.join(spelledout[-3:]):
+                spelledout.append('')
+                continue
+        if i >= 2:
+            unit = units[i - 2]
+            name = digit2name.get(digit, '') + unit
+            if i <= 7:
+                # The leading 일 is dropped only for positions 2-7; positions
+                # 8 and above keep it, as in the original implementation.
+                name = name.replace('일' + unit, unit)
+        spelledout.append(name)
+    return ''.join(spelledout)
+def number_to_hangul(text):
+    '''Replace the digits in text with their hangul reading.
+
+    Reference https://github.com/Kyubyong/g2pK
+
+    A number immediately followed by hangul is spelled with
+    hangul_number(): with sino=False when the first one or two following
+    syllables are exactly one of the words in _korean_classifiers, with
+    sino=True otherwise. Any digits left after that are read one by one.
+    '''
+    # Exact-match lookup: testing `in` on the raw space-separated string is
+    # a substring test and wrongly accepts fragments such as '그' or '리'.
+    classifiers = set(_korean_classifiers.split())
+
+    def _spell(match):
+        num, following = match.groups()
+        native = following[:2] in classifiers or following[0] in classifiers
+        return hangul_number(num, sino=not native) + following
+
+    # Substituting match by match (instead of str.replace over a set of
+    # tokens) keeps one token from being rewritten inside a longer number,
+    # e.g. '1개' inside '11개', and makes the result order-independent.
+    text = re.sub(r'(\d[\d,]*)([\uac00-\ud71f]+)', _spell, text)
+    # digit by digit for remaining digits
+    return text.translate(str.maketrans('0123456789', '영일이삼사오육칠팔구'))
+def korean_to_lazy_ipa(text):
+    '''Convert text to "lazy IPA".
+
+    Latin letters and numbers are first turned into hangul, every run of
+    hangul syllables is romanised with ko_pron in 'ipa' mode, and the
+    _ipa_to_lazy_ipa substitutions are then applied in table order.
+    '''
+    def _romanise(match):
+        # Keep only the part of ko_pron's output before the first '] ~ ['.
+        return ko_pron.romanise(match.group(0), 'ipa').split('] ~ [')[0]
+
+    text = number_to_hangul(latin_to_hangul(text))
+    text = re.sub('[\uac00-\ud7af]+', _romanise, text)
+    for pattern, lazy in _ipa_to_lazy_ipa:
+        text = pattern.sub(lazy, text)
+    return text
+def korean_to_ipa(text):
+    '''Return korean_to_lazy_ipa(text) with 'ʧ' expanded to 'tʃ' and 'ʥ' to 'dʑ'.'''
+    ipa = korean_to_lazy_ipa(text)
+    for ligature, expanded in (('ʧ', 'tʃ'), ('ʥ', 'dʑ')):
+        ipa = ipa.replace(ligature, expanded)
+    return ipa

text/mandarin.py CHANGED Viewed

@@ -4,6 +4,7 @@ import re
 from pypinyin import lazy_pinyin, BOPOMOFO
 import jieba
 import cn2an
 # List of (Latin alphabet, bopomofo) pairs:
@@ -239,7 +240,7 @@ def number_to_chinese(text):
     return text
-def chinese_to_bopomofo(text, taiwanese=False):
     text = text.replace('、', '，').replace('；', '，').replace('：', '，')
     words = jieba.lcut(text, cut_all=False)
     text = ''
@@ -252,10 +253,7 @@ def chinese_to_bopomofo(text, taiwanese=False):
             bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
         if text != '':
             text += ' '
-        if taiwanese:
-            text += '#'+'#'.join(bopomofos)
-        else:
-            text += ''.join(bopomofos)
     return text
@@ -316,9 +314,9 @@ def chinese_to_ipa(text):
     return text
-def chinese_to_ipa2(text, taiwanese=False):
     text = number_to_chinese(text)
-    text = chinese_to_bopomofo(text, taiwanese)
     text = latin_to_bopomofo(text)
     text = bopomofo_to_ipa2(text)
     text = re.sub(r'i([aoe])', r'j\1', text)

 from pypinyin import lazy_pinyin, BOPOMOFO
 import jieba
 import cn2an
+import logging
 # List of (Latin alphabet, bopomofo) pairs:
     return text
+def chinese_to_bopomofo(text):
     text = text.replace('、', '，').replace('；', '，').replace('：', '，')
     words = jieba.lcut(text, cut_all=False)
     text = ''
             bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
         if text != '':
             text += ' '
+        text += ''.join(bopomofos)
     return text
     return text
+def chinese_to_ipa2(text):
     text = number_to_chinese(text)
+    text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
     text = bopomofo_to_ipa2(text)
     text = re.sub(r'i([aoe])', r'j\1', text)

text/ngu_dialect.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import re
+import opencc
+# Two-letter dialect code -> OpenCC configuration name.
+dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou',
+            'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing',
+            'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang',
+            'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan',
+            'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen',
+            'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'}
+# Configuration name -> OpenCC converter, built eagerly at import time.
+converters = {}
+for dialect in dialects.values():
+    try:
+        converters[dialect] = opencc.OpenCC(dialect)
+    except Exception:
+        # Best effort: a dialect whose converter cannot be constructed is
+        # simply left out of `converters`. Catching Exception (not a bare
+        # `except:`) lets KeyboardInterrupt and SystemExit propagate.
+        continue
+def ngu_dialect_to_ipa(text, dialect):
+    '''Convert text with the OpenCC converter of the given dialect.
+
+    text: the string to convert.
+    dialect: a key of `dialects` (e.g. 'SZ').
+    Returns the converted text with '-' removed, '$' turned into spaces,
+    full-width punctuation normalised to ASCII and trailing whitespace
+    stripped.
+    Raises KeyError if dialect is not a key of `dialects`, or if no
+    converter was loaded for it at import time.
+    '''
+    try:
+        config = dialects[dialect]
+    except KeyError:
+        raise KeyError('unknown dialect code: %r' % (dialect,)) from None
+    try:
+        dialect_converter = converters[config]
+    except KeyError:
+        # The import-time loader skips converters that fail to build, so say
+        # which one is missing instead of raising a bare KeyError.
+        raise KeyError('no OpenCC converter loaded for dialect %r' % (config,)) from None
+    text = dialect_converter.convert(text).replace('-', '').replace('$', ' ')
+    text = re.sub(r'[、；：]', '，', text)
+    text = re.sub(r'\s*，\s*', ', ', text)
+    text = re.sub(r'\s*。\s*', '. ', text)
+    text = re.sub(r'\s*？\s*', '? ', text)
+    text = re.sub(r'\s*！\s*', '! ', text)
+    text = re.sub(r'\s*$', '', text)
+    return text

text/shanghainese.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import re
+import cn2an
+import opencc
+# Module-level OpenCC converter built from the 'zaonhe' configuration;
+# shanghainese_to_ipa() runs its input through converter.convert().
+converter = opencc.OpenCC('zaonhe')
+# (compiled single-letter pattern, IPA replacement) pairs, one per capital
+# letter A-Z, kept in alphabetical order:
+_latin_to_ipa = [(re.compile(letter), ipa) for letter, ipa in {
+    'A': 'ᴇ',
+    'B': 'bi',
+    'C': 'si',
+    'D': 'di',
+    'E': 'i',
+    'F': 'ᴇf',
+    'G': 'dʑi',
+    'H': 'ᴇtɕʰ',
+    'I': 'ᴀi',
+    'J': 'dʑᴇ',
+    'K': 'kʰᴇ',
+    'L': 'ᴇl',
+    'M': 'ᴇm',
+    'N': 'ᴇn',
+    'O': 'o',
+    'P': 'pʰi',
+    'Q': 'kʰiu',
+    'R': 'ᴀl',
+    'S': 'ᴇs',
+    'T': 'tʰi',
+    'U': 'ɦiu',
+    'V': 'vi',
+    'W': 'dᴀbɤliu',
+    'X': 'ᴇks',
+    'Y': 'uᴀi',
+    'Z': 'zᴇ',
+}.items()]
+def _number_to_shanghainese(num):
+    '''Spell num with cn2an.an2cn(), then rewrite the result: '一十' -> '十',
+    '二十' -> '廿', every remaining '二' -> '两', and finally '两' back to '二'
+    where it directly follows '廿' or a '十' not preceded by 三-九.'''
+    spelled = cn2an.an2cn(num)
+    for old, new in (('一十', '十'), ('二十', '廿'), ('二', '两')):
+        spelled = spelled.replace(old, new)
+    return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', spelled)
+def number_to_shanghainese(text):
+    '''Replace each Arabic numeral in text with _number_to_shanghainese()'s rendering.'''
+    def _spell(match):
+        return _number_to_shanghainese(match.group())
+    return re.sub(r'\d+(?:\.?\d+)?', _spell, text)
+def latin_to_ipa(text):
+    '''Apply every (pattern, IPA) substitution of _latin_to_ipa to text, in table order.'''
+    for pattern, ipa in _latin_to_ipa:
+        text = pattern.sub(ipa, text)
+    return text
+def shanghainese_to_ipa(text):
+    '''Convert text to the IPA-style string used by this module.
+
+    Steps, in order: upper-case the text and spell out numerals, run the
+    module-level OpenCC converter, drop '-' and turn '$' into spaces,
+    replace remaining capital Latin letters via latin_to_ipa(), then
+    normalise full-width punctuation to ASCII and strip trailing whitespace.
+    '''
+    text = number_to_shanghainese(text.upper())
+    converted = converter.convert(text)
+    text = converted.replace('-', '').replace('$', ' ')
+    text = re.sub(r'[A-Z]', lambda m: latin_to_ipa(m.group()) + ' ', text)
+    text = re.sub(r'[、；：]', '，', text)
+    # Each full-width mark becomes its ASCII counterpart followed by one
+    # space, swallowing any whitespace around the original mark.
+    for mark, ascii_mark in (('，', ', '), ('。', '. '), ('？', '? '), ('！', '! ')):
+        text = re.sub(r'\s*%s\s*' % mark, ascii_mark, text)
+    return re.sub(r'\s*$', '', text)

text/symbols.py CHANGED Viewed

@@ -1,15 +1,18 @@
 '''
 Defines the set of symbols used in text input to the model.
 '''
 _pad        = '_'
-_punctuation = ',.!?-~…'
-_letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
 '''
 # japanese_cleaners2
 _pad        = '_'
 _punctuation = ',.!?-~…'
 _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
-'''
 '''# korean_cleaners
 _pad        = '_'
@@ -23,6 +26,11 @@ _punctuation = '，。！？—…'
 _letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
 '''
 '''# sanskrit_cleaners
 _pad        = '_'
@@ -57,7 +65,7 @@ _letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
 '''# chinese_dialect_cleaners
 _pad        = '_'
 _punctuation = ',.!?~…─'
-_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚αᴀᴇ↑↓∅ⱼ '
 '''
 # Export all symbols:

 '''
 Defines the set of symbols used in text input to the model.
 '''
+'''# japanese_cleaners
 _pad        = '_'
+_punctuation = ',.!?-'
+_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
 '''
 # japanese_cleaners2
 _pad        = '_'
 _punctuation = ',.!?-~…'
 _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
 '''# korean_cleaners
 _pad        = '_'
 _letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
 '''
+'''# zh_ja_mixture_cleaners
+_pad        = '_'
+_punctuation = ',.!?-~…'
+_letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
+'''
 '''# sanskrit_cleaners
 _pad        = '_'
 '''# chinese_dialect_cleaners
 _pad        = '_'
 _punctuation = ',.!?~…─'
+_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
 '''
 # Export all symbols:

text/thai.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import re
+from num_thai.thainumbers import NumThai
+# Shared NumThai instance; num_to_thai() calls its NumberToTextThai() method.
+num = NumThai()
+# (case-insensitive compiled pattern, Thai replacement) pairs, one per
+# Latin letter a-z:
+_latin_to_thai = [(re.compile(letter, re.IGNORECASE), thai) for letter, thai in {
+    'a': 'เอ',
+    'b': 'บี',
+    'c': 'ซี',
+    'd': 'ดี',
+    'e': 'อี',
+    'f': 'เอฟ',
+    'g': 'จี',
+    'h': 'เอช',
+    'i': 'ไอ',
+    'j': 'เจ',
+    'k': 'เค',
+    'l': 'แอล',
+    'm': 'เอ็ม',
+    'n': 'เอ็น',
+    'o': 'โอ',
+    'p': 'พี',
+    'q': 'คิว',
+    'r': 'แอร์',
+    's': 'เอส',
+    't': 'ที',
+    'u': 'ยู',
+    'v': 'วี',
+    'w': 'ดับเบิลยู',
+    'x': 'เอ็กซ์',
+    'y': 'วาย',
+    'z': 'ซี',
+}.items()]
+def num_to_thai(text):
+    '''Replace each number in text (optionally with ',' separators and a
+    decimal part) by the joined output of num.NumberToTextThai().'''
+    def _spell(match):
+        # Commas are stripped so the matched text parses as a float.
+        value = float(match.group(0).replace(',', ''))
+        return ''.join(num.NumberToTextThai(value))
+    return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', _spell, text)
+def latin_to_thai(text):
+    '''Apply every _latin_to_thai substitution to text, in table order.'''
+    for pattern, thai in _latin_to_thai:
+        text = pattern.sub(thai, text)
+    return text