# Spaces: Running on CPU Upgrade  (hosting-page banner captured with this file; not part of the module)
import os
import sys
import re
import logging

from pypinyin import lazy_pinyin, BOPOMOFO
import jieba
import cn2an

# Only let WARNING-and-above records through jieba's logger, then initialize
# jieba once at import time.
# NOTE(review): presumably this avoids a lazy initialization (and its log
# output) on the first jieba.lcut() call -- confirm against the jieba docs.
logging.getLogger('jieba').setLevel(logging.WARNING)
jieba.initialize()
# List of (Latin alphabet, bopomofo) pairs.
# Each entry is (compiled, case-insensitive pattern for one Latin letter,
# bopomofo replacement including tone marks). latin_to_bopomofo() applies the
# pairs to the whole text in list order.
# NOTE(review): the bopomofo literals were restored from a mis-decoded copy of
# this file (UTF-8 bytes shown as Greek-looking mojibake); the replacements
# presumably spell each letter's spoken name -- verify the tone marks against
# a clean copy of the module.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), bopomofo)
    for letter, bopomofo in [
        ('a', 'ㄟˉ'),
        ('b', 'ㄅㄧˋ'),
        ('c', 'ㄙㄧˉ'),
        ('d', 'ㄉㄧˋ'),
        ('e', 'ㄧˋ'),
        ('f', 'ㄝˊㄈㄨˋ'),
        ('g', 'ㄐㄧˋ'),
        ('h', 'ㄝˇㄑㄩˋ'),
        ('i', 'ㄞˋ'),
        ('j', 'ㄐㄟˋ'),
        ('k', 'ㄎㄟˋ'),
        ('l', 'ㄝˊㄛˋ'),
        ('m', 'ㄝˊㄇㄨˋ'),
        ('n', 'ㄣˉ'),
        ('o', 'ㄡˉ'),
        ('p', 'ㄆㄧˉ'),
        ('q', 'ㄎㄧㄡˉ'),
        ('r', 'ㄚˋ'),
        ('s', 'ㄝˊㄙˋ'),
        ('t', 'ㄊㄧˋ'),
        ('u', 'ㄧㄡˉ'),
        ('v', 'ㄨㄧˉ'),
        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
        ('y', 'ㄨㄞˋ'),
        ('z', 'ㄗㄟˋ'),
    ]
]
# List of (bopomofo, romaji) pairs.
# Each entry is (compiled pattern, replacement); bopomofo_to_romaji() applies
# them to the whole text in list order. Order matters: multi-symbol patterns
# (e.g. ㄅㄛ, ㄧㄢ, ㄧㄥ) are listed before the single symbols they contain so
# that the combined spelling wins.
# 'NN' and 'Ng' are intermediate coda markers that _romaji_to_ipa later turns
# into 'n' and 'ŋ'. The tone marks ˉ ˊ ˇ ˋ become arrows, ˙ is removed, and
# the fullwidth punctuation ， 。 ！ ？ and the dash — become ASCII.
# NOTE(review): the non-ASCII literals were restored from a mis-decoded copy
# of this file (UTF-8 bytes shown as mojibake) -- verify against a clean copy.
_bopomofo_to_romaji = [
    (re.compile(bopomofo), romaji)
    for bopomofo, romaji in [
        ('ㄅㄛ', 'p⁼wo'),
        ('ㄆㄛ', 'pʰwo'),
        ('ㄇㄛ', 'mwo'),
        ('ㄈㄛ', 'fwo'),
        ('ㄅ', 'p⁼'),
        ('ㄆ', 'pʰ'),
        ('ㄇ', 'm'),
        ('ㄈ', 'f'),
        ('ㄉ', 't⁼'),
        ('ㄊ', 'tʰ'),
        ('ㄋ', 'n'),
        ('ㄌ', 'l'),
        ('ㄍ', 'k⁼'),
        ('ㄎ', 'kʰ'),
        ('ㄏ', 'h'),
        ('ㄐ', 'ʧ⁼'),
        ('ㄑ', 'ʧʰ'),
        ('ㄒ', 'ʃ'),
        ('ㄓ', 'ʦ`⁼'),
        ('ㄔ', 'ʦ`ʰ'),
        ('ㄕ', 's`'),
        ('ㄖ', 'ɹ`'),
        ('ㄗ', 'ʦ⁼'),
        ('ㄘ', 'ʦʰ'),
        ('ㄙ', 's'),
        ('ㄚ', 'a'),
        ('ㄛ', 'o'),
        ('ㄜ', 'ə'),
        ('ㄝ', 'e'),
        ('ㄞ', 'ai'),
        ('ㄟ', 'ei'),
        ('ㄠ', 'au'),
        ('ㄡ', 'ou'),
        ('ㄧㄢ', 'yeNN'),
        ('ㄢ', 'aNN'),
        ('ㄧㄣ', 'iNN'),
        ('ㄣ', 'əNN'),
        ('ㄤ', 'aNg'),
        ('ㄧㄥ', 'iNg'),
        ('ㄨㄥ', 'uNg'),
        ('ㄩㄥ', 'yuNg'),
        ('ㄥ', 'əNg'),
        ('ㄦ', 'əɻ'),
        ('ㄧ', 'i'),
        ('ㄨ', 'u'),
        ('ㄩ', 'ɥ'),
        ('ˉ', '→'),
        ('ˊ', '↑'),
        ('ˇ', '↓↑'),
        ('ˋ', '↓'),
        ('˙', ''),
        ('，', ','),
        ('。', '.'),
        ('！', '!'),
        ('？', '?'),
        ('—', '-'),
    ]
]
# List of (romaji, ipa) pairs.
# Each entry is (compiled, case-insensitive pattern, replacement);
# chinese_to_lazy_ipa() applies them to the whole text in list order.
# The three '...y' entries drop a 'y' that directly follows ʃ / ʧʰ / ʧ⁼ and
# must run before the generic 'y' -> 'j' rule; 'NN' and 'Ng' are the coda
# markers produced by _bopomofo_to_romaji.
# NOTE(review): the IPA literals were restored from a mis-decoded copy of this
# file (UTF-8 bytes shown as mojibake) -- verify against a clean copy.
_romaji_to_ipa = [
    (re.compile(romaji, re.IGNORECASE), ipa)
    for romaji, ipa in [
        ('ʃy', 'ʃ'),
        ('ʧʰy', 'ʧʰ'),
        ('ʧ⁼y', 'ʧ⁼'),
        ('NN', 'n'),
        ('Ng', 'ŋ'),
        ('y', 'j'),
        ('h', 'x'),
    ]
]
def number_to_chinese(text):
    """Return *text* with every Arabic number rewritten by ``cn2an.an2cn``.

    A number is a run of digits, optionally followed by more digits with an
    optional '.' in between (so '3.14' is converted as one unit). All other
    characters are left untouched.
    """
    return re.sub(
        r'\d+(?:\.?\d+)?',
        lambda match: cn2an.an2cn(match.group(0)),
        text,
    )
def chinese_to_bopomofo(text):
    """Convert the Chinese words in *text* to bopomofo with explicit tone marks.

    The text is segmented with ``jieba.lcut``. Words containing no character
    in U+4E00-U+9FFF are copied through unchanged with no separator. Every
    other word is converted with ``lazy_pinyin(word, BOPOMOFO)``, its
    syllables are joined, and a single space is inserted before it unless it
    is the first piece of the output.

    NOTE(review): the non-ASCII literals below were restored from a
    mis-decoded copy of this file -- verify against a clean copy.
    """
    # Fold the separators 、 ； ： into a fullwidth comma before segmentation.
    text = text.replace('、', '，').replace('；', '，').replace('：', '，')
    pieces = []
    for word in jieba.lcut(text, cut_all=False):
        if not re.search('[\u4e00-\u9fff]', word):
            pieces.append(word)
            continue
        # A syllable that still ends in a bopomofo letter carries no tone
        # mark, so give it an explicit 'ˉ'.
        # NOTE(review): presumably pypinyin leaves the first tone unmarked in
        # BOPOMOFO style -- confirm against the pypinyin docs.
        syllables = [
            syllable + 'ˉ'
            if re.match('[\u3105-\u3129]', syllable[-1])
            else syllable
            for syllable in lazy_pinyin(word, BOPOMOFO)
        ]
        if pieces:
            pieces.append(' ')
        pieces.append(''.join(syllables))
    return ''.join(pieces)
def latin_to_bopomofo(text):
    """Return *text* with Latin letters replaced via ``_latin_to_bopomofo``.

    Each (pattern, replacement) pair of the table is applied to the whole
    string, one after another, in table order.
    """
    for pattern, bopomofo in _latin_to_bopomofo:
        text = pattern.sub(bopomofo, text)
    return text
def bopomofo_to_romaji(text):
    """Return *text* with bopomofo rewritten via ``_bopomofo_to_romaji``.

    Each (pattern, replacement) pair of the table is applied to the whole
    string, one after another, in table order.
    """
    for pattern, romaji in _bopomofo_to_romaji:
        text = pattern.sub(romaji, text)
    return text
def chinese_to_romaji(text):
    """Convert Chinese *text* (with digits and Latin letters) to romaji.

    Pipeline: number_to_chinese -> chinese_to_bopomofo -> latin_to_bopomofo
    -> bopomofo_to_romaji, followed by four spelling fix-ups applied with
    regular expressions (see the inline comments).

    NOTE(review): the non-ASCII literals in the patterns were restored from a
    mis-decoded copy of this file -- verify against a clean copy.
    """
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_romaji(text)
    # 'i' / 'u' directly before another vowel become the glides 'y' / 'w'.
    text = re.sub('i([aoe])', r'y\1', text)
    text = re.sub('u([aoəe])', r'w\1', text)
    # A backtick-marked initial (ʦ`, s`, ɹ`, optionally with ⁼ or ʰ) that is
    # followed directly by tone arrows/spaces or the end of the text has no
    # vowel, so append 'ɹ`' to it; then respell every 'ɻ' as 'ɹ`'.
    text = re.sub(
        '([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text
    ).replace('ɻ', 'ɹ`')
    # Same for the unmarked initials ʦ / s, which get a plain 'ɹ'.
    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
    return text
def chinese_to_lazy_ipa(text):
    """Convert Chinese *text* to romaji, then rewrite it via ``_romaji_to_ipa``.

    The romaji comes from ``chinese_to_romaji``; each (pattern, replacement)
    pair of the table is then applied to the whole string in table order.
    """
    ipa_text = chinese_to_romaji(text)
    for pattern, ipa in _romaji_to_ipa:
        ipa_text = pattern.sub(ipa, ipa_text)
    return ipa_text