Spaces:

skytnt
/

moe-tts

Running on CPU Upgrade

App Files Files Community

moe-tts / text /mandarin.py

skytnt

update model

e1ac136 about 2 years ago

raw

history blame

4.23 kB

	import os
	import sys
	import re
	from pypinyin import lazy_pinyin, BOPOMOFO
	import jieba
	import cn2an
	import logging

	# Raise the threshold of the 'jieba' logger to WARNING *before* initializing,
	# so records below WARNING emitted through that logger are suppressed.
	logging.getLogger('jieba').setLevel(logging.WARNING)
	# Initialize jieba once at import time.
	# NOTE(review): presumably this loads the segmentation dictionary eagerly so
	# the first jieba.lcut() call does not pay that cost — confirm against jieba docs.
	jieba.initialize()


	# Latin letter -> bopomofo spelling of that letter.
	# Each entry is (compiled case-insensitive pattern for the letter, bopomofo
	# replacement string); the entries are meant to be applied in order with
	# ``pattern.sub``.
	_latin_to_bopomofo = [
	    (re.compile(letter, re.IGNORECASE), bopomofo)
	    for letter, bopomofo in (
	        ('a', 'ㄟˉ'),
	        ('b', 'ㄅㄧˋ'),
	        ('c', 'ㄙㄧˉ'),
	        ('d', 'ㄉㄧˋ'),
	        ('e', 'ㄧˋ'),
	        ('f', 'ㄝˊㄈㄨˋ'),
	        ('g', 'ㄐㄧˋ'),
	        ('h', 'ㄝˇㄑㄩˋ'),
	        ('i', 'ㄞˋ'),
	        ('j', 'ㄐㄟˋ'),
	        ('k', 'ㄎㄟˋ'),
	        ('l', 'ㄝˊㄛˋ'),
	        ('m', 'ㄝˊㄇㄨˋ'),
	        ('n', 'ㄣˉ'),
	        ('o', 'ㄡˉ'),
	        ('p', 'ㄆㄧˉ'),
	        ('q', 'ㄎㄧㄡˉ'),
	        ('r', 'ㄚˋ'),
	        ('s', 'ㄝˊㄙˋ'),
	        ('t', 'ㄊㄧˋ'),
	        ('u', 'ㄧㄡˉ'),
	        ('v', 'ㄨㄧˉ'),
	        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
	        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
	        ('y', 'ㄨㄞˋ'),
	        ('z', 'ㄗㄟˋ'),
	    )
	]

	# Bopomofo -> romaji substitution table.
	# Each entry is (compiled pattern, replacement). Order matters: multi-symbol
	# sequences (e.g. 'ㄅㄛ', 'ㄧㄢ') are listed before the single symbols they
	# contain so they are rewritten first. Tone marks and full-width punctuation
	# are handled by the trailing entries.
	_bopomofo_to_romaji = [
	    (re.compile(bopomofo), romaji)
	    for bopomofo, romaji in (
	        ('ㄅㄛ', 'p⁼wo'),
	        ('ㄆㄛ', 'pʰwo'),
	        ('ㄇㄛ', 'mwo'),
	        ('ㄈㄛ', 'fwo'),
	        ('ㄅ', 'p⁼'),
	        ('ㄆ', 'pʰ'),
	        ('ㄇ', 'm'),
	        ('ㄈ', 'f'),
	        ('ㄉ', 't⁼'),
	        ('ㄊ', 'tʰ'),
	        ('ㄋ', 'n'),
	        ('ㄌ', 'l'),
	        ('ㄍ', 'k⁼'),
	        ('ㄎ', 'kʰ'),
	        ('ㄏ', 'h'),
	        ('ㄐ', 'ʧ⁼'),
	        ('ㄑ', 'ʧʰ'),
	        ('ㄒ', 'ʃ'),
	        ('ㄓ', 'ʦ`⁼'),
	        ('ㄔ', 'ʦ`ʰ'),
	        ('ㄕ', 's`'),
	        ('ㄖ', 'ɹ`'),
	        ('ㄗ', 'ʦ⁼'),
	        ('ㄘ', 'ʦʰ'),
	        ('ㄙ', 's'),
	        ('ㄚ', 'a'),
	        ('ㄛ', 'o'),
	        ('ㄜ', 'ə'),
	        ('ㄝ', 'e'),
	        ('ㄞ', 'ai'),
	        ('ㄟ', 'ei'),
	        ('ㄠ', 'au'),
	        ('ㄡ', 'ou'),
	        ('ㄧㄢ', 'yeNN'),
	        ('ㄢ', 'aNN'),
	        ('ㄧㄣ', 'iNN'),
	        ('ㄣ', 'əNN'),
	        ('ㄤ', 'aNg'),
	        ('ㄧㄥ', 'iNg'),
	        ('ㄨㄥ', 'uNg'),
	        ('ㄩㄥ', 'yuNg'),
	        ('ㄥ', 'əNg'),
	        ('ㄦ', 'əɻ'),
	        ('ㄧ', 'i'),
	        ('ㄨ', 'u'),
	        ('ㄩ', 'ɥ'),
	        ('ˉ', '→'),
	        ('ˊ', '↑'),
	        ('ˇ', '↓↑'),
	        ('ˋ', '↓'),
	        ('˙', ''),
	        ('，', ','),
	        ('。', '.'),
	        ('！', '!'),
	        ('？', '?'),
	        ('—', '-'),
	    )
	]


	# Romaji -> IPA substitution table.
	# Each entry is (compiled case-insensitive pattern, replacement); entries are
	# meant to be applied in order, so the palatal + 'y' sequences are collapsed
	# before the generic 'y' -> 'j' rule runs.
	_romaji_to_ipa = [
	    (re.compile(romaji, re.IGNORECASE), ipa)
	    for romaji, ipa in (
	        ('ʃy', 'ʃ'),
	        ('ʧʰy', 'ʧʰ'),
	        ('ʧ⁼y', 'ʧ⁼'),
	        ('NN', 'n'),
	        ('Ng', 'ŋ'),
	        ('y', 'j'),
	        ('h', 'x'),
	    )
	]


	def number_to_chinese(text):
	    """Return ``text`` with each Arabic-numeral run rewritten by ``cn2an.an2cn``.

	    A run is one or more digits, optionally followed by an optional dot and
	    more digits (for example ``12`` or ``3.14``). Everything outside the
	    matched runs is left unchanged.
	    """
	    return re.sub(
	        r'\d+(?:\.?\d+)?',
	        lambda match: cn2an.an2cn(match.group(0)),
	        text,
	    )


	def chinese_to_bopomofo(text):
	    """Convert the Chinese words in ``text`` to bopomofo.

	    The characters '、', '；' and '：' are first normalised to '，'. The text is
	    then segmented with ``jieba.lcut``. Segments containing no character in
	    U+4E00-U+9FFF are appended unchanged; every other segment is converted
	    with ``lazy_pinyin(..., BOPOMOFO)`` and, if anything has already been
	    emitted, preceded by a single space.
	    """
	    text = text.translate(str.maketrans('、；：', '，，，'))
	    result = ''
	    for token in jieba.lcut(text, cut_all=False):
	        if re.search('[\u4e00-\u9fff]', token) is None:
	            # No CJK ideograph in this segment: pass it through verbatim.
	            result += token
	            continue
	        # A syllable ending in a bopomofo letter (U+3105-U+3129) carries no
	        # tone mark, so the explicit 'ˉ' mark is appended to it.
	        syllables = [
	            syllable + 'ˉ'
	            if re.match('[\u3105-\u3129]', syllable[-1])
	            else syllable
	            for syllable in lazy_pinyin(token, BOPOMOFO)
	        ]
	        if result:
	            result += ' '
	        result += ''.join(syllables)
	    return result


	def latin_to_bopomofo(text):
	    """Spell out Latin letters in ``text`` as bopomofo.

	    Applies every (pattern, replacement) pair of ``_latin_to_bopomofo`` to the
	    whole string, in table order, and returns the result.
	    """
	    for pattern, bopomofo in _latin_to_bopomofo:
	        text = pattern.sub(bopomofo, text)
	    return text


	def bopomofo_to_romaji(text):
	    """Rewrite bopomofo symbols, tone marks and punctuation in ``text`` as romaji.

	    Applies every (pattern, replacement) pair of ``_bopomofo_to_romaji`` to
	    the whole string, in table order, and returns the result.
	    """
	    for pattern, romaji in _bopomofo_to_romaji:
	        text = pattern.sub(romaji, text)
	    return text


	def chinese_to_romaji(text):
	    """Convert Chinese text to the romaji notation used by this module.

	    Pipeline: digits -> Chinese numerals, Chinese -> bopomofo, Latin letters
	    -> bopomofo, bopomofo -> romaji, followed by a few regex adjustments on
	    the romaji string.

	    Args:
	        text: Input string (may mix Chinese, digits, Latin letters and
	            punctuation).

	    Returns:
	        The romaji string.
	    """
	    text = number_to_chinese(text)
	    text = chinese_to_bopomofo(text)
	    text = latin_to_bopomofo(text)
	    text = bopomofo_to_romaji(text)
	    # 'i'/'u' directly before another vowel are written as the glides y/w.
	    text = re.sub('i([aoe])', r'y\1', text)
	    text = re.sub('u([aoəe])', r'w\1', text)
	    # An initial written with '`' that is followed directly by tone
	    # marks/spaces, or by the end of the string, has no vowel of its own:
	    # insert 'ɹ`' after it. The '|' must be an unescaped alternation; written
	    # as '\|' it is a literal pipe and the rule never fires on normal input.
	    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', text)
	    text = text.replace('ɻ', 'ɹ`')
	    # Same rule for the initials written without '`', which receive 'ɹ'.
	    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
	    return text


	def chinese_to_lazy_ipa(text):
	    """Convert Chinese text to the simplified IPA notation of this module.

	    Runs ``chinese_to_romaji`` and then applies every (pattern, replacement)
	    pair of ``_romaji_to_ipa`` to the result, in table order.
	    """
	    romaji = chinese_to_romaji(text)
	    for pattern, ipa in _romaji_to_ipa:
	        romaji = pattern.sub(ipa, romaji)
	    return romaji