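# File-viewer page residue (not part of the module), kept as comments:
#   path:    moe-tts / text / mandarin.py
#   author:  skytnt
#   commit:  e1ac136 ("update model")
#   size:    4.23 kB
#   links:   raw, history, blame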
import os
import sys
import re
from pypinyin import lazy_pinyin, BOPOMOFO
import jieba
import cn2an
import logging
# Raise the 'jieba' logger threshold to WARNING first, so the initialization
# call below does not emit lower-severity log records.
logging.getLogger('jieba').setLevel(logging.WARNING)
# Initialize jieba once at import time (presumably so the first jieba.lcut
# call does not pay the start-up cost -- confirm against the jieba docs).
jieba.initialize()
# List of (Latin alphabet, bopomofo) pairs.
# Each Latin letter is matched case-insensitively and replaced by a bopomofo
# spelling that ends in a tone mark. The pairs are meant to be applied in
# order with re.sub; the replacements contain no Latin letters, so an earlier
# substitution can never be re-matched by a later pattern.
# NOTE: the bopomofo literals must be real characters from the Bopomofo block
# (U+3105-U+3129) plus the tone marks; a mis-decoded copy of this table
# (UTF-8 bytes rendered through a Greek code page) silently matches nothing
# downstream.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), spelling)
    for letter, spelling in [
        ('a', 'ㄟˉ'),
        ('b', 'ㄅㄧˋ'),
        ('c', 'ㄙㄧˉ'),
        ('d', 'ㄉㄧˋ'),
        ('e', 'ㄧˋ'),
        ('f', 'ㄝˊㄈㄨˋ'),
        ('g', 'ㄐㄧˋ'),
        ('h', 'ㄝˇㄑㄩˋ'),
        ('i', 'ㄞˋ'),
        ('j', 'ㄐㄟˋ'),
        ('k', 'ㄎㄟˋ'),
        ('l', 'ㄝˊㄛˋ'),
        ('m', 'ㄝˊㄇㄨˋ'),
        ('n', 'ㄣˉ'),
        ('o', 'ㄡˉ'),
        ('p', 'ㄆㄧˉ'),
        ('q', 'ㄎㄧㄡˉ'),
        ('r', 'ㄚˋ'),
        ('s', 'ㄝˊㄙˋ'),
        ('t', 'ㄊㄧˋ'),
        ('u', 'ㄧㄡˉ'),
        ('v', 'ㄨㄧˉ'),
        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
        ('y', 'ㄨㄞˋ'),
        ('z', 'ㄗㄟˋ'),
    ]
]
# List of (bopomofo, romaji) pairs:
_bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [
('γ„…γ„›', 'p⁼wo'),
('ㄆㄛ', 'pΚ°wo'),
('ㄇㄛ', 'mwo'),
('γ„ˆγ„›', 'fwo'),
('γ„…', 'p⁼'),
('ㄆ', 'pΚ°'),
('ㄇ', 'm'),
('γ„ˆ', 'f'),
('ㄉ', 't⁼'),
('γ„Š', 'tΚ°'),
('γ„‹', 'n'),
('γ„Œ', 'l'),
('ㄍ', 'k⁼'),
('γ„Ž', 'kΚ°'),
('ㄏ', 'h'),
('ㄐ', 'ʧ⁼'),
('γ„‘', 'Κ§Κ°'),
('γ„’', 'Κƒ'),
('γ„“', 'Κ¦`⁼'),
('γ„”', 'Κ¦`Κ°'),
('γ„•', 's`'),
('γ„–', 'ΙΉ`'),
('γ„—', 'ʦ⁼'),
('γ„˜', 'Κ¦Κ°'),
('γ„™', 's'),
('γ„š', 'a'),
('γ„›', 'o'),
('γ„œ', 'Ι™'),
('ㄝ', 'e'),
('γ„ž', 'ai'),
('γ„Ÿ', 'ei'),
('γ„ ', 'au'),
('γ„‘', 'ou'),
('ㄧㄒ', 'yeNN'),
('γ„’', 'aNN'),
('ㄧㄣ', 'iNN'),
('γ„£', 'Ι™NN'),
('γ„€', 'aNg'),
('ㄧγ„₯', 'iNg'),
('ㄨγ„₯', 'uNg'),
('γ„©γ„₯', 'yuNg'),
('γ„₯', 'Ι™Ng'),
('ㄦ', 'Ι™Ι»'),
('ㄧ', 'i'),
('ㄨ', 'u'),
('γ„©', 'Ι₯'),
('Λ‰', 'β†’'),
('ˊ', '↑'),
('Λ‡', '↓↑'),
('Λ‹', '↓'),
('Λ™', ''),
(',', ','),
('。', '.'),
('!', '!'),
('?', '?'),
('β€”', '-')
]]
# List of (romaji, ipa) pairs.
# Applied in order with re.sub, case-insensitively. The three palatal
# entries ('ʃy', 'ʧʰy', 'ʧ⁼y') must come before the bare 'y' -> 'j' rule,
# otherwise their 'y' would already have been rewritten.
# NOTE: the IPA literals must be the real characters (ʃ, ʧ, ʰ, ŋ); a
# mis-decoded copy of these patterns never matches the romaji text.
_romaji_to_ipa = [
    (re.compile(romaji, re.IGNORECASE), ipa)
    for romaji, ipa in [
        ('ʃy', 'ʃ'),
        ('ʧʰy', 'ʧʰ'),
        ('ʧ⁼y', 'ʧ⁼'),
        ('NN', 'n'),
        ('Ng', 'ŋ'),
        ('y', 'j'),
        ('h', 'x'),
    ]
]
def number_to_chinese(text):
    """Replace every number in *text* with the output of ``cn2an.an2cn``.

    A number is a run of digits, optionally followed by more digits with at
    most one '.' in between (so "3.14" is one number). Everything else in
    the text is left untouched.
    """
    def _spell_out(match):
        # Hand the matched digits (and optional decimal part) to cn2an.
        return cn2an.an2cn(match.group(0))

    return re.sub(r'\d+(?:\.?\d+)?', _spell_out, text)
def chinese_to_bopomofo(text):
    """Convert the Chinese parts of *text* to bopomofo.

    The text is segmented with ``jieba.lcut``. Segments that contain no
    character in U+4E00-U+9FFF are copied to the output unchanged. Every
    other segment is converted with ``lazy_pinyin(..., BOPOMOFO)``; its
    syllables are joined without a separator, and a single space is put in
    front of it unless the output is still empty.

    A syllable whose last character is a bopomofo letter (U+3105-U+3129),
    i.e. one that carries no trailing tone mark, gets 'ˉ' appended so that
    every syllable ends in an explicit tone mark.

    Args:
        text: Input string; may mix Chinese, Latin letters and punctuation.

    Returns:
        The converted string.
    """
    # Fold the enumeration comma, semicolon and colon into a plain comma.
    # Both the full-width and the ASCII colon are accepted.
    text = (text.replace('、', ',')
                .replace(';', ',')
                .replace(':', ',')
                .replace(':', ','))
    words = jieba.lcut(text, cut_all=False)
    text = ''
    for word in words:
        if not re.search('[\u4e00-\u9fff]', word):
            # Nothing to transliterate: pass the segment through verbatim.
            text += word
            continue
        # Only transliterate segments that actually contain Chinese.
        # syllable[-1:] (rather than [-1]) keeps an empty syllable from
        # raising IndexError; it simply gets no tone mark.
        bopomofos = [
            syllable + 'ˉ'
            if re.match('[\u3105-\u3129]', syllable[-1:]) else syllable
            for syllable in lazy_pinyin(word, BOPOMOFO)
        ]
        if text != '':
            text += ' '
        text += ''.join(bopomofos)
    return text
def latin_to_bopomofo(text):
    """Apply every (pattern, replacement) pair of ``_latin_to_bopomofo`` in order.

    Each compiled pattern is substituted throughout the running result, so
    later pairs see the output of earlier ones.
    """
    converted = text
    for pattern, spelling in _latin_to_bopomofo:
        converted = pattern.sub(spelling, converted)
    return converted
def bopomofo_to_romaji(text):
    """Apply every (pattern, replacement) pair of ``_bopomofo_to_romaji`` in order.

    The table is walked top to bottom and each compiled pattern is
    substituted throughout the running result.
    """
    converted = text
    for pattern, romaji in _bopomofo_to_romaji:
        converted = pattern.sub(romaji, converted)
    return converted
def chinese_to_romaji(text):
    """Convert mixed Chinese text to the romaji notation used by this module.

    Pipeline: numbers -> Chinese numerals, Chinese -> bopomofo, Latin
    letters -> bopomofo, bopomofo -> romaji, followed by four regex
    clean-ups on the romaji string (see the inline comments).

    Args:
        text: Input string.

    Returns:
        The romaji string, with tones written as the arrows '→', '↑', '↓'.
    """
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_romaji(text)
    # 'i' directly followed by a/o/e is respelled 'y' (ia -> ya, ...).
    text = re.sub('i([aoe])', r'y\1', text)
    # 'u' directly followed by a/o/ə/e is respelled 'w' (ua -> wa, ...).
    text = re.sub('u([aoəe])', r'w\1', text)
    # A backtick-marked initial (ʦ`, s`, ɹ`, optionally with ⁼/ʰ) that is
    # followed straight away by tone arrows, a space or the end of the text
    # has no vowel: insert 'ɹ`' as its nucleus. Then rewrite every 'ɻ'
    # (from the 'əɻ' final) as 'ɹ`'.
    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
    text = text.replace('ɻ', 'ɹ`')
    # Same for the unmarked initials ʦ / s: insert a plain 'ɹ'. The output
    # of the previous step cannot match here, because a backtick always
    # sits between the initial and the tone arrows.
    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
    return text
def chinese_to_lazy_ipa(text):
    """Convert *text* to romaji, then rewrite it with ``_romaji_to_ipa``.

    The (pattern, replacement) pairs are applied in table order to the
    result of ``chinese_to_romaji``.
    """
    ipa = chinese_to_romaji(text)
    for pattern, symbol in _romaji_to_ipa:
        ipa = pattern.sub(symbol, ipa)
    return ipa